{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.02, "eval_steps": 1000, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1e-05, "grad_norm": 0.5270103216171265, "learning_rate": 5e-06, "loss": 0.1809, "loss/crossentropy": 2.8736772537231445, "loss/hidden": 0.0, "loss/logits": 0.18086732551455498, "loss/reg": 0.7982062101364136, "step": 1 }, { "epoch": 2e-05, "grad_norm": 0.4062630534172058, "learning_rate": 1e-05, "loss": 0.1918, "loss/crossentropy": 2.731119930744171, "loss/hidden": 0.0, "loss/logits": 0.1918385736644268, "loss/reg": 0.7974178791046143, "step": 2 }, { "epoch": 3e-05, "grad_norm": 0.4462648928165436, "learning_rate": 1.5e-05, "loss": 0.1994, "loss/crossentropy": 2.6421037912368774, "loss/hidden": 0.0, "loss/logits": 0.19944751635193825, "loss/reg": 0.7965840101242065, "step": 3 }, { "epoch": 4e-05, "grad_norm": 0.4140595495700836, "learning_rate": 2e-05, "loss": 0.2034, "loss/crossentropy": 2.7314128279685974, "loss/hidden": 0.0, "loss/logits": 0.20340709760785103, "loss/reg": 0.7956677675247192, "step": 4 }, { "epoch": 5e-05, "grad_norm": 0.44375380873680115, "learning_rate": 2.5e-05, "loss": 0.184, "loss/crossentropy": 2.7737303376197815, "loss/hidden": 0.0, "loss/logits": 0.1840168982744217, "loss/reg": 0.7949960231781006, "step": 5 }, { "epoch": 6e-05, "grad_norm": 0.49336883425712585, "learning_rate": 3e-05, "loss": 0.2128, "loss/crossentropy": 2.831206738948822, "loss/hidden": 0.0, "loss/logits": 0.2127683237195015, "loss/reg": 0.7942349910736084, "step": 6 }, { "epoch": 7e-05, "grad_norm": 0.46829503774642944, "learning_rate": 3.5e-05, "loss": 0.2038, "loss/crossentropy": 2.8482624292373657, "loss/hidden": 0.0, "loss/logits": 0.20383895188570023, "loss/reg": 0.7933483719825745, "step": 7 }, { "epoch": 8e-05, "grad_norm": 0.390006422996521, "learning_rate": 4e-05, "loss": 0.1849, "loss/crossentropy": 2.698530077934265, "loss/hidden": 0.0, "loss/logits": 0.18488221988081932, "loss/reg": 0.7926012277603149, "step": 8 }, { "epoch": 9e-05, "grad_norm": 0.40908268094062805, "learning_rate": 4.5e-05, "loss": 0.1945, "loss/crossentropy": 2.6987050771713257, "loss/hidden": 0.0, "loss/logits": 0.19453026726841927, "loss/reg": 0.7919560670852661, "step": 9 }, { "epoch": 0.0001, "grad_norm": 0.39829590916633606, "learning_rate": 5e-05, "loss": 0.185, "loss/crossentropy": 2.8313575387001038, "loss/hidden": 0.0, "loss/logits": 0.18499354273080826, "loss/reg": 0.7910984754562378, "step": 10 }, { "epoch": 0.00011, "grad_norm": 0.37264055013656616, "learning_rate": 5e-05, "loss": 0.1666, "loss/crossentropy": 2.7429500222206116, "loss/hidden": 0.0, "loss/logits": 0.1665678545832634, "loss/reg": 0.7902828454971313, "step": 11 }, { "epoch": 0.00012, "grad_norm": 0.43996304273605347, "learning_rate": 5e-05, "loss": 0.1825, "loss/crossentropy": 2.738922595977783, "loss/hidden": 0.0, "loss/logits": 0.18249627202749252, "loss/reg": 0.7895547747612, "step": 12 }, { "epoch": 0.00013, "grad_norm": 0.41080981492996216, "learning_rate": 5e-05, "loss": 0.1829, "loss/crossentropy": 2.792860269546509, "loss/hidden": 0.0, "loss/logits": 0.18292934447526932, "loss/reg": 0.7890053987503052, "step": 13 }, { "epoch": 0.00014, "grad_norm": 0.3841441869735718, "learning_rate": 5e-05, "loss": 0.1783, "loss/crossentropy": 2.763161778450012, "loss/hidden": 0.0, "loss/logits": 0.17825737595558167, "loss/reg": 0.7885928153991699, "step": 14 }, { "epoch": 0.00015, "grad_norm": 0.3976081311702728, "learning_rate": 5e-05, "loss": 0.1798, "loss/crossentropy": 2.598387062549591, "loss/hidden": 0.0, "loss/logits": 0.1797671727836132, "loss/reg": 0.7878777384757996, "step": 15 }, { "epoch": 0.00016, "grad_norm": 0.39082610607147217, "grad_norm_var": 0.001811404450588372, "learning_rate": 5e-05, "loss": 0.175, "loss/crossentropy": 2.829587936401367, "loss/hidden": 0.0, "loss/logits": 0.17502149939537048, "loss/reg": 0.7873101234436035, "step": 16 }, { "epoch": 0.00017, "grad_norm": 0.39959263801574707, "grad_norm_var": 0.0010849748136619616, "learning_rate": 5e-05, "loss": 0.1843, "loss/crossentropy": 2.791169047355652, "loss/hidden": 0.0, "loss/logits": 0.18430949375033379, "loss/reg": 0.7868510484695435, "step": 17 }, { "epoch": 0.00018, "grad_norm": 0.40876370668411255, "grad_norm_var": 0.001081932124929852, "learning_rate": 5e-05, "loss": 0.1847, "loss/crossentropy": 2.715011775493622, "loss/hidden": 0.0, "loss/logits": 0.1847250908613205, "loss/reg": 0.7865160703659058, "step": 18 }, { "epoch": 0.00019, "grad_norm": 0.4193006753921509, "grad_norm_var": 0.0010211432423421337, "learning_rate": 5e-05, "loss": 0.1827, "loss/crossentropy": 2.8628916144371033, "loss/hidden": 0.0, "loss/logits": 0.18271736428141594, "loss/reg": 0.7861402630805969, "step": 19 }, { "epoch": 0.0002, "grad_norm": 0.44665443897247314, "grad_norm_var": 0.0010833189087384397, "learning_rate": 5e-05, "loss": 0.179, "loss/crossentropy": 2.6717955470085144, "loss/hidden": 0.0, "loss/logits": 0.17897583916783333, "loss/reg": 0.7855270504951477, "step": 20 }, { "epoch": 0.00021, "grad_norm": 0.38835349678993225, "grad_norm_var": 0.0010780315882545134, "learning_rate": 5e-05, "loss": 0.1814, "loss/crossentropy": 2.708828330039978, "loss/hidden": 0.0, "loss/logits": 0.18135912343859673, "loss/reg": 0.7850039005279541, "step": 21 }, { "epoch": 0.00022, "grad_norm": 0.36269623041152954, "grad_norm_var": 0.0007555412431051511, "learning_rate": 5e-05, "loss": 0.1723, "loss/crossentropy": 2.7537136673927307, "loss/hidden": 0.0, "loss/logits": 0.17225974053144455, "loss/reg": 0.7845560312271118, "step": 22 }, { "epoch": 0.00023, "grad_norm": 0.4036427140235901, "grad_norm_var": 0.0004749527944069687, "learning_rate": 5e-05, "loss": 0.1728, "loss/crossentropy": 2.8187889456748962, "loss/hidden": 0.0, "loss/logits": 0.1727624014019966, "loss/reg": 0.783955991268158, "step": 23 }, { "epoch": 0.00024, "grad_norm": 0.5961300134658813, "grad_norm_var": 0.002817287943831544, "learning_rate": 5e-05, "loss": 0.1831, "loss/crossentropy": 2.7439865469932556, "loss/hidden": 0.0, "loss/logits": 0.1830729991197586, "loss/reg": 0.7834941744804382, "step": 24 }, { "epoch": 0.00025, "grad_norm": 0.3856281638145447, "grad_norm_var": 0.002867928263253694, "learning_rate": 5e-05, "loss": 0.1766, "loss/crossentropy": 2.7871618270874023, "loss/hidden": 0.0, "loss/logits": 0.17662257328629494, "loss/reg": 0.7828161716461182, "step": 25 }, { "epoch": 0.00026, "grad_norm": 0.40820741653442383, "grad_norm_var": 0.0028548798491390285, "learning_rate": 5e-05, "loss": 0.1743, "loss/crossentropy": 2.6090471744537354, "loss/hidden": 0.0, "loss/logits": 0.1743181087076664, "loss/reg": 0.782204806804657, "step": 26 }, { "epoch": 0.00027, "grad_norm": 0.4131574034690857, "grad_norm_var": 0.0027370986652148913, "learning_rate": 5e-05, "loss": 0.1788, "loss/crossentropy": 2.7366899847984314, "loss/hidden": 0.0, "loss/logits": 0.17878013476729393, "loss/reg": 0.7813587784767151, "step": 27 }, { "epoch": 0.00028, "grad_norm": 0.3993757367134094, "grad_norm_var": 0.0027102008794313162, "learning_rate": 5e-05, "loss": 0.177, "loss/crossentropy": 2.8000608086586, "loss/hidden": 0.0, "loss/logits": 0.17697696760296822, "loss/reg": 0.780660092830658, "step": 28 }, { "epoch": 0.00029, "grad_norm": 0.4057616889476776, "grad_norm_var": 0.002713557663591652, "learning_rate": 5e-05, "loss": 0.1762, "loss/crossentropy": 2.900210976600647, "loss/hidden": 0.0, "loss/logits": 0.17615076526999474, "loss/reg": 0.7802259922027588, "step": 29 }, { "epoch": 0.0003, "grad_norm": 1.358260154724121, "grad_norm_var": 0.05825711439926228, "learning_rate": 5e-05, "loss": 0.2241, "loss/crossentropy": 2.8913666009902954, "loss/hidden": 0.0, "loss/logits": 0.22410936281085014, "loss/reg": 0.7796356678009033, "step": 30 }, { "epoch": 0.00031, "grad_norm": 0.39594557881355286, "grad_norm_var": 0.05827422064650657, "learning_rate": 5e-05, "loss": 0.1723, "loss/crossentropy": 2.7084341645240784, "loss/hidden": 0.0, "loss/logits": 0.17226646095514297, "loss/reg": 0.7787068486213684, "step": 31 }, { "epoch": 0.00032, "grad_norm": 0.4812188744544983, "grad_norm_var": 0.05778373991656244, "learning_rate": 5e-05, "loss": 0.2044, "loss/crossentropy": 2.6707186102867126, "loss/hidden": 0.0, "loss/logits": 0.20440414175391197, "loss/reg": 0.7783492207527161, "step": 32 }, { "epoch": 0.00033, "grad_norm": 0.4579252004623413, "grad_norm_var": 0.057374579522911306, "learning_rate": 5e-05, "loss": 0.1841, "loss/crossentropy": 2.7614511847496033, "loss/hidden": 0.0, "loss/logits": 0.1840612068772316, "loss/reg": 0.7775547504425049, "step": 33 }, { "epoch": 0.00034, "grad_norm": 0.5211064219474792, "grad_norm_var": 0.05704856861913689, "learning_rate": 5e-05, "loss": 0.1863, "loss/crossentropy": 2.8084943890571594, "loss/hidden": 0.0, "loss/logits": 0.18629974499344826, "loss/reg": 0.7766918540000916, "step": 34 }, { "epoch": 0.00035, "grad_norm": 0.3911280930042267, "grad_norm_var": 0.057364535345787185, "learning_rate": 5e-05, "loss": 0.1691, "loss/crossentropy": 2.7717766761779785, "loss/hidden": 0.0, "loss/logits": 0.1690610833466053, "loss/reg": 0.775999128818512, "step": 35 }, { "epoch": 0.00036, "grad_norm": 0.4046674966812134, "grad_norm_var": 0.05770869624819515, "learning_rate": 5e-05, "loss": 0.173, "loss/crossentropy": 2.8008471727371216, "loss/hidden": 0.0, "loss/logits": 0.17295262217521667, "loss/reg": 0.7752785682678223, "step": 36 }, { "epoch": 0.00037, "grad_norm": 0.4518427550792694, "grad_norm_var": 0.0571355050139908, "learning_rate": 5e-05, "loss": 0.1896, "loss/crossentropy": 2.8252168893814087, "loss/hidden": 0.0, "loss/logits": 0.18964741379022598, "loss/reg": 0.7747377753257751, "step": 37 }, { "epoch": 0.00038, "grad_norm": 0.3834437429904938, "grad_norm_var": 0.05681081544799931, "learning_rate": 5e-05, "loss": 0.1735, "loss/crossentropy": 2.8683040738105774, "loss/hidden": 0.0, "loss/logits": 0.17346005886793137, "loss/reg": 0.7740543484687805, "step": 38 }, { "epoch": 0.00039, "grad_norm": 0.36665084958076477, "grad_norm_var": 0.05732765252691353, "learning_rate": 5e-05, "loss": 0.1821, "loss/crossentropy": 2.8117648363113403, "loss/hidden": 0.0, "loss/logits": 0.1820988766849041, "loss/reg": 0.7734794616699219, "step": 39 }, { "epoch": 0.0004, "grad_norm": 0.356393039226532, "grad_norm_var": 0.057488269670974514, "learning_rate": 5e-05, "loss": 0.1743, "loss/crossentropy": 2.839358627796173, "loss/hidden": 0.0, "loss/logits": 0.17431866750121117, "loss/reg": 0.7729082703590393, "step": 40 }, { "epoch": 0.00041, "grad_norm": 0.4137648642063141, "grad_norm_var": 0.05720698784095859, "learning_rate": 5e-05, "loss": 0.1881, "loss/crossentropy": 2.7743460536003113, "loss/hidden": 0.0, "loss/logits": 0.18814967200160027, "loss/reg": 0.7723925113677979, "step": 41 }, { "epoch": 0.00042, "grad_norm": 0.381852924823761, "grad_norm_var": 0.05748704589840505, "learning_rate": 5e-05, "loss": 0.1693, "loss/crossentropy": 2.807208240032196, "loss/hidden": 0.0, "loss/logits": 0.16934632509946823, "loss/reg": 0.7718072533607483, "step": 42 }, { "epoch": 0.00043, "grad_norm": 0.37129101157188416, "grad_norm_var": 0.057935705101604874, "learning_rate": 5e-05, "loss": 0.177, "loss/crossentropy": 2.7850345373153687, "loss/hidden": 0.0, "loss/logits": 0.17699582874774933, "loss/reg": 0.7712724804878235, "step": 43 }, { "epoch": 0.00044, "grad_norm": 0.36275389790534973, "grad_norm_var": 0.05837067502173371, "learning_rate": 5e-05, "loss": 0.1718, "loss/crossentropy": 2.7764599919319153, "loss/hidden": 0.0, "loss/logits": 0.17179790511727333, "loss/reg": 0.7707749009132385, "step": 44 }, { "epoch": 0.00045, "grad_norm": 0.39623042941093445, "grad_norm_var": 0.0584567187863227, "learning_rate": 5e-05, "loss": 0.1817, "loss/crossentropy": 2.7227214574813843, "loss/hidden": 0.0, "loss/logits": 0.18169544637203217, "loss/reg": 0.7701820135116577, "step": 45 }, { "epoch": 0.00046, "grad_norm": 0.3701828420162201, "grad_norm_var": 0.0022424702029780826, "learning_rate": 5e-05, "loss": 0.1838, "loss/crossentropy": 2.7904654145240784, "loss/hidden": 0.0, "loss/logits": 0.1837669126689434, "loss/reg": 0.7697222828865051, "step": 46 }, { "epoch": 0.00047, "grad_norm": 0.35735759139060974, "grad_norm_var": 0.0023906090579918883, "learning_rate": 5e-05, "loss": 0.1723, "loss/crossentropy": 2.825423777103424, "loss/hidden": 0.0, "loss/logits": 0.17226213216781616, "loss/reg": 0.7689950466156006, "step": 47 }, { "epoch": 0.00048, "grad_norm": 0.39745163917541504, "grad_norm_var": 0.0019693731653592857, "learning_rate": 5e-05, "loss": 0.1874, "loss/crossentropy": 2.906936526298523, "loss/hidden": 0.0, "loss/logits": 0.1874341331422329, "loss/reg": 0.7681844234466553, "step": 48 }, { "epoch": 0.00049, "grad_norm": 0.3815530836582184, "grad_norm_var": 0.001733911862827997, "learning_rate": 5e-05, "loss": 0.179, "loss/crossentropy": 2.7593725323677063, "loss/hidden": 0.0, "loss/logits": 0.17896374315023422, "loss/reg": 0.767394483089447, "step": 49 }, { "epoch": 0.0005, "grad_norm": 0.43525999784469604, "grad_norm_var": 0.0007422541135807869, "learning_rate": 5e-05, "loss": 0.1888, "loss/crossentropy": 2.797346353530884, "loss/hidden": 0.0, "loss/logits": 0.18881763890385628, "loss/reg": 0.7664579153060913, "step": 50 }, { "epoch": 0.00051, "grad_norm": 0.375160276889801, "grad_norm_var": 0.000753369495519074, "learning_rate": 5e-05, "loss": 0.1682, "loss/crossentropy": 2.7356534600257874, "loss/hidden": 0.0, "loss/logits": 0.1681925468146801, "loss/reg": 0.7653323411941528, "step": 51 }, { "epoch": 0.00052, "grad_norm": 0.36905786395072937, "grad_norm_var": 0.0007528498147797991, "learning_rate": 5e-05, "loss": 0.1757, "loss/crossentropy": 2.8263298869132996, "loss/hidden": 0.0, "loss/logits": 0.1756824254989624, "loss/reg": 0.7643814086914062, "step": 52 }, { "epoch": 0.00053, "grad_norm": 0.43510788679122925, "grad_norm_var": 0.0006226350014423657, "learning_rate": 5e-05, "loss": 0.1902, "loss/crossentropy": 2.8569173216819763, "loss/hidden": 0.0, "loss/logits": 0.19024847447872162, "loss/reg": 0.763468861579895, "step": 53 }, { "epoch": 0.00054, "grad_norm": 0.35045096278190613, "grad_norm_var": 0.0006957299300366297, "learning_rate": 5e-05, "loss": 0.182, "loss/crossentropy": 2.721441149711609, "loss/hidden": 0.0, "loss/logits": 0.18204239383339882, "loss/reg": 0.7624586224555969, "step": 54 }, { "epoch": 0.00055, "grad_norm": 0.3694230318069458, "grad_norm_var": 0.0006903400176884511, "learning_rate": 5e-05, "loss": 0.1827, "loss/crossentropy": 2.7874931693077087, "loss/hidden": 0.0, "loss/logits": 0.18268746510148048, "loss/reg": 0.7619009613990784, "step": 55 }, { "epoch": 0.00056, "grad_norm": 0.41356557607650757, "grad_norm_var": 0.0006940520926006166, "learning_rate": 5e-05, "loss": 0.1712, "loss/crossentropy": 2.7819440960884094, "loss/hidden": 0.0, "loss/logits": 0.17120670154690742, "loss/reg": 0.7609783411026001, "step": 56 }, { "epoch": 0.00057, "grad_norm": 0.36727696657180786, "grad_norm_var": 0.0006587543298347024, "learning_rate": 5e-05, "loss": 0.1737, "loss/crossentropy": 2.7824912071228027, "loss/hidden": 0.0, "loss/logits": 0.1736629232764244, "loss/reg": 0.7603233456611633, "step": 57 }, { "epoch": 0.00058, "grad_norm": 0.46450212597846985, "grad_norm_var": 0.0010689284185112613, "learning_rate": 5e-05, "loss": 0.1821, "loss/crossentropy": 2.6764307022094727, "loss/hidden": 0.0, "loss/logits": 0.18205635994672775, "loss/reg": 0.7595146894454956, "step": 58 }, { "epoch": 0.00059, "grad_norm": 0.3684505224227905, "grad_norm_var": 0.0010759650833925606, "learning_rate": 5e-05, "loss": 0.1816, "loss/crossentropy": 2.6815553307533264, "loss/hidden": 0.0, "loss/logits": 0.18161746487021446, "loss/reg": 0.758629560470581, "step": 59 }, { "epoch": 0.0006, "grad_norm": 0.3717678487300873, "grad_norm_var": 0.0010502664825839355, "learning_rate": 5e-05, "loss": 0.1877, "loss/crossentropy": 2.6754263639450073, "loss/hidden": 0.0, "loss/logits": 0.1877485141158104, "loss/reg": 0.7579225897789001, "step": 60 }, { "epoch": 0.00061, "grad_norm": 0.3490316569805145, "grad_norm_var": 0.0011435244005569315, "learning_rate": 5e-05, "loss": 0.1731, "loss/crossentropy": 2.952909231185913, "loss/hidden": 0.0, "loss/logits": 0.17311423271894455, "loss/reg": 0.7573174238204956, "step": 61 }, { "epoch": 0.00062, "grad_norm": 0.3769521713256836, "grad_norm_var": 0.0011321347540370953, "learning_rate": 5e-05, "loss": 0.174, "loss/crossentropy": 2.8419917225837708, "loss/hidden": 0.0, "loss/logits": 0.17401066422462463, "loss/reg": 0.7566163539886475, "step": 62 }, { "epoch": 0.00063, "grad_norm": 0.3968483805656433, "grad_norm_var": 0.0010766940230082956, "learning_rate": 5e-05, "loss": 0.1956, "loss/crossentropy": 2.744890511035919, "loss/hidden": 0.0, "loss/logits": 0.19561778008937836, "loss/reg": 0.7562834620475769, "step": 63 }, { "epoch": 0.00064, "grad_norm": 0.3647221624851227, "grad_norm_var": 0.001106179146660813, "learning_rate": 5e-05, "loss": 0.171, "loss/crossentropy": 2.671845555305481, "loss/hidden": 0.0, "loss/logits": 0.17096153646707535, "loss/reg": 0.7556965351104736, "step": 64 }, { "epoch": 0.00065, "grad_norm": 0.36414504051208496, "grad_norm_var": 0.0011373455641630504, "learning_rate": 5e-05, "loss": 0.1725, "loss/crossentropy": 2.8465015292167664, "loss/hidden": 0.0, "loss/logits": 0.17246084287762642, "loss/reg": 0.754684329032898, "step": 65 }, { "epoch": 0.00066, "grad_norm": 0.45451879501342773, "grad_norm_var": 0.0012877051681206453, "learning_rate": 5e-05, "loss": 0.2184, "loss/crossentropy": 2.6697899103164673, "loss/hidden": 0.0, "loss/logits": 0.2184288650751114, "loss/reg": 0.7540901899337769, "step": 66 }, { "epoch": 0.00067, "grad_norm": 0.36097726225852966, "grad_norm_var": 0.0013225468602833504, "learning_rate": 5e-05, "loss": 0.1795, "loss/crossentropy": 2.7340784072875977, "loss/hidden": 0.0, "loss/logits": 0.1794683113694191, "loss/reg": 0.7532380223274231, "step": 67 }, { "epoch": 0.00068, "grad_norm": 0.3915371596813202, "grad_norm_var": 0.001303200108164489, "learning_rate": 5e-05, "loss": 0.1857, "loss/crossentropy": 2.689909040927887, "loss/hidden": 0.0, "loss/logits": 0.1857033111155033, "loss/reg": 0.7523663640022278, "step": 68 }, { "epoch": 0.00069, "grad_norm": 0.37674814462661743, "grad_norm_var": 0.0011452637775346219, "learning_rate": 5e-05, "loss": 0.1794, "loss/crossentropy": 2.868057429790497, "loss/hidden": 0.0, "loss/logits": 0.17943349108099937, "loss/reg": 0.7514806985855103, "step": 69 }, { "epoch": 0.0007, "grad_norm": 0.437984436750412, "grad_norm_var": 0.0012348387155401578, "learning_rate": 5e-05, "loss": 0.2008, "loss/crossentropy": 2.851492941379547, "loss/hidden": 0.0, "loss/logits": 0.200757447630167, "loss/reg": 0.7506260275840759, "step": 70 }, { "epoch": 0.00071, "grad_norm": 0.34920695424079895, "grad_norm_var": 0.001313900990699072, "learning_rate": 5e-05, "loss": 0.1702, "loss/crossentropy": 2.6923986673355103, "loss/hidden": 0.0, "loss/logits": 0.17021964490413666, "loss/reg": 0.7496992349624634, "step": 71 }, { "epoch": 0.00072, "grad_norm": 0.37103593349456787, "grad_norm_var": 0.0012820598647036144, "learning_rate": 5e-05, "loss": 0.178, "loss/crossentropy": 2.8815476298332214, "loss/hidden": 0.0, "loss/logits": 0.17802581191062927, "loss/reg": 0.7488979697227478, "step": 72 }, { "epoch": 0.00073, "grad_norm": 0.36377376317977905, "grad_norm_var": 0.0012912717751293694, "learning_rate": 5e-05, "loss": 0.1747, "loss/crossentropy": 2.708466947078705, "loss/hidden": 0.0, "loss/logits": 0.17466137930750847, "loss/reg": 0.7479438185691833, "step": 73 }, { "epoch": 0.00074, "grad_norm": 0.36635515093803406, "grad_norm_var": 0.0008547391029340989, "learning_rate": 5e-05, "loss": 0.1739, "loss/crossentropy": 2.6161566972732544, "loss/hidden": 0.0, "loss/logits": 0.17386648431420326, "loss/reg": 0.7471716403961182, "step": 74 }, { "epoch": 0.00075, "grad_norm": 0.35312220454216003, "grad_norm_var": 0.0008909917765315911, "learning_rate": 5e-05, "loss": 0.1738, "loss/crossentropy": 2.7570412158966064, "loss/hidden": 0.0, "loss/logits": 0.17380788922309875, "loss/reg": 0.7465042471885681, "step": 75 }, { "epoch": 0.00076, "grad_norm": 0.37259989976882935, "grad_norm_var": 0.0008903386088192633, "learning_rate": 5e-05, "loss": 0.1829, "loss/crossentropy": 2.6272838711738586, "loss/hidden": 0.0, "loss/logits": 0.18285975605249405, "loss/reg": 0.7458574771881104, "step": 76 }, { "epoch": 0.00077, "grad_norm": 0.38441765308380127, "grad_norm_var": 0.0008314629066926001, "learning_rate": 5e-05, "loss": 0.186, "loss/crossentropy": 2.8397573828697205, "loss/hidden": 0.0, "loss/logits": 0.18597644940018654, "loss/reg": 0.7451755404472351, "step": 77 }, { "epoch": 0.00078, "grad_norm": 0.359567791223526, "grad_norm_var": 0.0008581324612414527, "learning_rate": 5e-05, "loss": 0.161, "loss/crossentropy": 2.8208199739456177, "loss/hidden": 0.0, "loss/logits": 0.16097503155469894, "loss/reg": 0.7444452047348022, "step": 78 }, { "epoch": 0.00079, "grad_norm": 0.36594003438949585, "grad_norm_var": 0.0008452022739110129, "learning_rate": 5e-05, "loss": 0.1808, "loss/crossentropy": 2.739391326904297, "loss/hidden": 0.0, "loss/logits": 0.1808476373553276, "loss/reg": 0.7435556650161743, "step": 79 }, { "epoch": 0.0008, "grad_norm": 0.3372277319431305, "grad_norm_var": 0.0009385243318465649, "learning_rate": 5e-05, "loss": 0.1614, "loss/crossentropy": 2.6478440165519714, "loss/hidden": 0.0, "loss/logits": 0.16136135160923004, "loss/reg": 0.7427063584327698, "step": 80 }, { "epoch": 0.00081, "grad_norm": 0.3764096796512604, "grad_norm_var": 0.0009292387210300234, "learning_rate": 5e-05, "loss": 0.1643, "loss/crossentropy": 2.7635027170181274, "loss/hidden": 0.0, "loss/logits": 0.1642814762890339, "loss/reg": 0.7419465184211731, "step": 81 }, { "epoch": 0.00082, "grad_norm": 0.3896169066429138, "grad_norm_var": 0.0005159683951443505, "learning_rate": 5e-05, "loss": 0.1876, "loss/crossentropy": 2.825999677181244, "loss/hidden": 0.0, "loss/logits": 0.18756844848394394, "loss/reg": 0.7408118844032288, "step": 82 }, { "epoch": 0.00083, "grad_norm": 0.36173221468925476, "grad_norm_var": 0.0005148660238783465, "learning_rate": 5e-05, "loss": 0.1705, "loss/crossentropy": 2.767561435699463, "loss/hidden": 0.0, "loss/logits": 0.170509684830904, "loss/reg": 0.7399515509605408, "step": 83 }, { "epoch": 0.00084, "grad_norm": 0.3493421673774719, "grad_norm_var": 0.0005180811227328164, "learning_rate": 5e-05, "loss": 0.172, "loss/crossentropy": 2.631771445274353, "loss/hidden": 0.0, "loss/logits": 0.1720210611820221, "loss/reg": 0.7389203906059265, "step": 84 }, { "epoch": 0.00085, "grad_norm": 0.4025665819644928, "grad_norm_var": 0.0005840317234738953, "learning_rate": 5e-05, "loss": 0.1721, "loss/crossentropy": 2.8273849487304688, "loss/hidden": 0.0, "loss/logits": 0.1720929592847824, "loss/reg": 0.7380579113960266, "step": 85 }, { "epoch": 0.00086, "grad_norm": 0.35690951347351074, "grad_norm_var": 0.0002740620503845529, "learning_rate": 5e-05, "loss": 0.1623, "loss/crossentropy": 2.8248605132102966, "loss/hidden": 0.0, "loss/logits": 0.16232208162546158, "loss/reg": 0.7371649742126465, "step": 86 }, { "epoch": 0.00087, "grad_norm": 0.3388076424598694, "grad_norm_var": 0.00030443737859618957, "learning_rate": 5e-05, "loss": 0.1685, "loss/crossentropy": 2.8332438468933105, "loss/hidden": 0.0, "loss/logits": 0.16849439963698387, "loss/reg": 0.7361722588539124, "step": 87 }, { "epoch": 0.00088, "grad_norm": 0.3534338176250458, "grad_norm_var": 0.00031101848729750967, "learning_rate": 5e-05, "loss": 0.1744, "loss/crossentropy": 2.825754165649414, "loss/hidden": 0.0, "loss/logits": 0.1744290553033352, "loss/reg": 0.7352069616317749, "step": 88 }, { "epoch": 0.00089, "grad_norm": 0.37284842133522034, "grad_norm_var": 0.00031530001643930903, "learning_rate": 5e-05, "loss": 0.1716, "loss/crossentropy": 2.74732905626297, "loss/hidden": 0.0, "loss/logits": 0.17155374586582184, "loss/reg": 0.7346110939979553, "step": 89 }, { "epoch": 0.0009, "grad_norm": 0.38669511675834656, "grad_norm_var": 0.0003446802067784582, "learning_rate": 5e-05, "loss": 0.1673, "loss/crossentropy": 2.8434954285621643, "loss/hidden": 0.0, "loss/logits": 0.16730739921331406, "loss/reg": 0.7339598536491394, "step": 90 }, { "epoch": 0.00091, "grad_norm": 0.3595193326473236, "grad_norm_var": 0.0003359745873844096, "learning_rate": 5e-05, "loss": 0.1706, "loss/crossentropy": 2.828117251396179, "loss/hidden": 0.0, "loss/logits": 0.1705867424607277, "loss/reg": 0.7330529689788818, "step": 91 }, { "epoch": 0.00092, "grad_norm": 0.3549748361110687, "grad_norm_var": 0.00034158877031712127, "learning_rate": 5e-05, "loss": 0.1679, "loss/crossentropy": 2.8331961035728455, "loss/hidden": 0.0, "loss/logits": 0.16791727021336555, "loss/reg": 0.732403576374054, "step": 92 }, { "epoch": 0.00093, "grad_norm": 0.4107847511768341, "grad_norm_var": 0.0004511058841404532, "learning_rate": 5e-05, "loss": 0.1845, "loss/crossentropy": 2.663391947746277, "loss/hidden": 0.0, "loss/logits": 0.1845145784318447, "loss/reg": 0.7319260239601135, "step": 93 }, { "epoch": 0.00094, "grad_norm": 0.3825834393501282, "grad_norm_var": 0.0004605663667711794, "learning_rate": 5e-05, "loss": 0.1816, "loss/crossentropy": 2.919118583202362, "loss/hidden": 0.0, "loss/logits": 0.1816139817237854, "loss/reg": 0.7312389016151428, "step": 94 }, { "epoch": 0.00095, "grad_norm": 0.37356749176979065, "grad_norm_var": 0.00046138341320389373, "learning_rate": 5e-05, "loss": 0.1744, "loss/crossentropy": 2.9054930210113525, "loss/hidden": 0.0, "loss/logits": 0.1743943840265274, "loss/reg": 0.7306094169616699, "step": 95 }, { "epoch": 0.00096, "grad_norm": 0.3637838661670685, "grad_norm_var": 0.0003922921136017406, "learning_rate": 5e-05, "loss": 0.1656, "loss/crossentropy": 2.743872106075287, "loss/hidden": 0.0, "loss/logits": 0.16557178646326065, "loss/reg": 0.7297105193138123, "step": 96 }, { "epoch": 0.00097, "grad_norm": 0.37616968154907227, "grad_norm_var": 0.00039211775676157857, "learning_rate": 5e-05, "loss": 0.1626, "loss/crossentropy": 2.8960859179496765, "loss/hidden": 0.0, "loss/logits": 0.1625775806605816, "loss/reg": 0.7290924191474915, "step": 97 }, { "epoch": 0.00098, "grad_norm": 0.4479593336582184, "grad_norm_var": 0.0007509737250548043, "learning_rate": 5e-05, "loss": 0.1849, "loss/crossentropy": 2.674549698829651, "loss/hidden": 0.0, "loss/logits": 0.18489592522382736, "loss/reg": 0.7281277775764465, "step": 98 }, { "epoch": 0.00099, "grad_norm": 0.37653249502182007, "grad_norm_var": 0.0007395083585161278, "learning_rate": 5e-05, "loss": 0.1817, "loss/crossentropy": 2.7631226778030396, "loss/hidden": 0.0, "loss/logits": 0.18166892230510712, "loss/reg": 0.7276618480682373, "step": 99 }, { "epoch": 0.001, "grad_norm": 0.3714850842952728, "grad_norm_var": 0.0006932053172160798, "learning_rate": 5e-05, "loss": 0.184, "loss/crossentropy": 2.6640121936798096, "loss/hidden": 0.0, "loss/logits": 0.18403339013457298, "loss/reg": 0.7267426252365112, "step": 100 }, { "epoch": 0.00101, "grad_norm": 0.38296735286712646, "grad_norm_var": 0.0006498502401679781, "learning_rate": 5e-05, "loss": 0.1901, "loss/crossentropy": 2.825900375843048, "loss/hidden": 0.0, "loss/logits": 0.19007476046681404, "loss/reg": 0.7260308861732483, "step": 101 }, { "epoch": 0.00102, "grad_norm": 0.3539445102214813, "grad_norm_var": 0.0006577743963030754, "learning_rate": 5e-05, "loss": 0.171, "loss/crossentropy": 2.812491714954376, "loss/hidden": 0.0, "loss/logits": 0.1710238680243492, "loss/reg": 0.7255330085754395, "step": 102 }, { "epoch": 0.00103, "grad_norm": 0.40590155124664307, "grad_norm_var": 0.0006119657070469094, "learning_rate": 5e-05, "loss": 0.1867, "loss/crossentropy": 2.851368725299835, "loss/hidden": 0.0, "loss/logits": 0.18671603128314018, "loss/reg": 0.7246612906455994, "step": 103 }, { "epoch": 0.00104, "grad_norm": 0.3915748596191406, "grad_norm_var": 0.0005699621901156841, "learning_rate": 5e-05, "loss": 0.1703, "loss/crossentropy": 2.8360594511032104, "loss/hidden": 0.0, "loss/logits": 0.1702640950679779, "loss/reg": 0.7237181663513184, "step": 104 }, { "epoch": 0.00105, "grad_norm": 0.3639037013053894, "grad_norm_var": 0.0005858243677191492, "learning_rate": 5e-05, "loss": 0.182, "loss/crossentropy": 2.729031264781952, "loss/hidden": 0.0, "loss/logits": 0.18198926746845245, "loss/reg": 0.722902238368988, "step": 105 }, { "epoch": 0.00106, "grad_norm": 0.3361886143684387, "grad_norm_var": 0.0007095755276376181, "learning_rate": 5e-05, "loss": 0.1692, "loss/crossentropy": 2.6542755365371704, "loss/hidden": 0.0, "loss/logits": 0.16922985389828682, "loss/reg": 0.7223352193832397, "step": 106 }, { "epoch": 0.00107, "grad_norm": 0.3290923237800598, "grad_norm_var": 0.0008433869570156127, "learning_rate": 5e-05, "loss": 0.1661, "loss/crossentropy": 2.731805145740509, "loss/hidden": 0.0, "loss/logits": 0.16609882190823555, "loss/reg": 0.7216607332229614, "step": 107 }, { "epoch": 0.00108, "grad_norm": 0.37744325399398804, "grad_norm_var": 0.0008109381838819818, "learning_rate": 5e-05, "loss": 0.171, "loss/crossentropy": 2.8411410450935364, "loss/hidden": 0.0, "loss/logits": 0.17101451382040977, "loss/reg": 0.7212023735046387, "step": 108 }, { "epoch": 0.00109, "grad_norm": 0.37820249795913696, "grad_norm_var": 0.0007337435467792456, "learning_rate": 5e-05, "loss": 0.1789, "loss/crossentropy": 2.8888030648231506, "loss/hidden": 0.0, "loss/logits": 0.17888950183987617, "loss/reg": 0.720390796661377, "step": 109 }, { "epoch": 0.0011, "grad_norm": 0.3815077841281891, "grad_norm_var": 0.0007328295306416136, "learning_rate": 5e-05, "loss": 0.1808, "loss/crossentropy": 2.7249990105628967, "loss/hidden": 0.0, "loss/logits": 0.18083461001515388, "loss/reg": 0.7198383212089539, "step": 110 }, { "epoch": 0.00111, "grad_norm": 0.36524108052253723, "grad_norm_var": 0.0007394623927214017, "learning_rate": 5e-05, "loss": 0.1763, "loss/crossentropy": 2.6892932653427124, "loss/hidden": 0.0, "loss/logits": 0.17633714899420738, "loss/reg": 0.719275712966919, "step": 111 }, { "epoch": 0.00112, "grad_norm": 0.3892326056957245, "grad_norm_var": 0.000741479081146501, "learning_rate": 5e-05, "loss": 0.1757, "loss/crossentropy": 2.566103994846344, "loss/hidden": 0.0, "loss/logits": 0.17570063471794128, "loss/reg": 0.7187307476997375, "step": 112 }, { "epoch": 0.00113, "grad_norm": 0.3487829864025116, "grad_norm_var": 0.0007903259995211464, "learning_rate": 5e-05, "loss": 0.1555, "loss/crossentropy": 2.739705502986908, "loss/hidden": 0.0, "loss/logits": 0.15549717471003532, "loss/reg": 0.7182660698890686, "step": 113 }, { "epoch": 0.00114, "grad_norm": 0.35256490111351013, "grad_norm_var": 0.0004310617748792785, "learning_rate": 5e-05, "loss": 0.1668, "loss/crossentropy": 2.8146185874938965, "loss/hidden": 0.0, "loss/logits": 0.16678886860609055, "loss/reg": 0.7178277969360352, "step": 114 }, { "epoch": 0.00115, "grad_norm": 0.3593945801258087, "grad_norm_var": 0.0004322871420656682, "learning_rate": 5e-05, "loss": 0.1732, "loss/crossentropy": 2.7010533213615417, "loss/hidden": 0.0, "loss/logits": 0.1731957569718361, "loss/reg": 0.7176061272621155, "step": 115 }, { "epoch": 0.00116, "grad_norm": 0.376709520816803, "grad_norm_var": 0.00043644566009559314, "learning_rate": 5e-05, "loss": 0.178, "loss/crossentropy": 2.7798763513565063, "loss/hidden": 0.0, "loss/logits": 0.17804645374417305, "loss/reg": 0.7172375917434692, "step": 116 }, { "epoch": 0.00117, "grad_norm": 0.3759397566318512, "grad_norm_var": 0.0004257801964626514, "learning_rate": 5e-05, "loss": 0.1741, "loss/crossentropy": 2.7694573998451233, "loss/hidden": 0.0, "loss/logits": 0.17414850741624832, "loss/reg": 0.7171550393104553, "step": 117 }, { "epoch": 0.00118, "grad_norm": 0.37221357226371765, "grad_norm_var": 0.0004127643424552592, "learning_rate": 5e-05, "loss": 0.1764, "loss/crossentropy": 2.872999370098114, "loss/hidden": 0.0, "loss/logits": 0.17635848745703697, "loss/reg": 0.7168817520141602, "step": 118 }, { "epoch": 0.00119, "grad_norm": 0.3515457510948181, "grad_norm_var": 0.0003299339262575381, "learning_rate": 5e-05, "loss": 0.1686, "loss/crossentropy": 2.7989805340766907, "loss/hidden": 0.0, "loss/logits": 0.1686333641409874, "loss/reg": 0.7163890600204468, "step": 119 }, { "epoch": 0.0012, "grad_norm": 0.355618417263031, "grad_norm_var": 0.00028619092261970754, "learning_rate": 5e-05, "loss": 0.175, "loss/crossentropy": 2.841488778591156, "loss/hidden": 0.0, "loss/logits": 0.17504336684942245, "loss/reg": 0.7158587574958801, "step": 120 }, { "epoch": 0.00121, "grad_norm": 0.43512770533561707, "grad_norm_var": 0.0006085139086032995, "learning_rate": 5e-05, "loss": 0.1996, "loss/crossentropy": 2.8789313435554504, "loss/hidden": 0.0, "loss/logits": 0.19958820566534996, "loss/reg": 0.7154721617698669, "step": 121 }, { "epoch": 0.00122, "grad_norm": 0.3879215121269226, "grad_norm_var": 0.0005577334184881822, "learning_rate": 5e-05, "loss": 0.1798, "loss/crossentropy": 2.858753979206085, "loss/hidden": 0.0, "loss/logits": 0.17983190342783928, "loss/reg": 0.7151558995246887, "step": 122 }, { "epoch": 0.00123, "grad_norm": 0.36982110142707825, "grad_norm_var": 0.00043364802604664023, "learning_rate": 5e-05, "loss": 0.1796, "loss/crossentropy": 2.7747685313224792, "loss/hidden": 0.0, "loss/logits": 0.17964857816696167, "loss/reg": 0.7149569988250732, "step": 123 }, { "epoch": 0.00124, "grad_norm": 0.3786149322986603, "grad_norm_var": 0.00043433748671258823, "learning_rate": 5e-05, "loss": 0.1817, "loss/crossentropy": 2.913690507411957, "loss/hidden": 0.0, "loss/logits": 0.18172713741660118, "loss/reg": 0.714292585849762, "step": 124 }, { "epoch": 0.00125, "grad_norm": 0.4001312851905823, "grad_norm_var": 0.000477695663449836, "learning_rate": 5e-05, "loss": 0.1694, "loss/crossentropy": 2.4212290942668915, "loss/hidden": 0.0, "loss/logits": 0.16936707124114037, "loss/reg": 0.7138825058937073, "step": 125 }, { "epoch": 0.00126, "grad_norm": 0.3809413015842438, "grad_norm_var": 0.0004772259151903454, "learning_rate": 5e-05, "loss": 0.1668, "loss/crossentropy": 2.9370239973068237, "loss/hidden": 0.0, "loss/logits": 0.16679024323821068, "loss/reg": 0.713337779045105, "step": 126 }, { "epoch": 0.00127, "grad_norm": 0.34950533509254456, "grad_norm_var": 0.0005131508596505759, "learning_rate": 5e-05, "loss": 0.1679, "loss/crossentropy": 2.8456231355667114, "loss/hidden": 0.0, "loss/logits": 0.16785800084471703, "loss/reg": 0.71268630027771, "step": 127 }, { "epoch": 0.00128, "grad_norm": 0.35916969180107117, "grad_norm_var": 0.0005085951826837094, "learning_rate": 5e-05, "loss": 0.1748, "loss/crossentropy": 2.7199816703796387, "loss/hidden": 0.0, "loss/logits": 0.17480477318167686, "loss/reg": 0.711790144443512, "step": 128 }, { "epoch": 0.00129, "grad_norm": 0.3852020502090454, "grad_norm_var": 0.0004781453279944614, "learning_rate": 5e-05, "loss": 0.181, "loss/crossentropy": 2.822324216365814, "loss/hidden": 0.0, "loss/logits": 0.1810167133808136, "loss/reg": 0.7112712264060974, "step": 129 }, { "epoch": 0.0013, "grad_norm": 0.3799841105937958, "grad_norm_var": 0.00044530193043132253, "learning_rate": 5e-05, "loss": 0.1798, "loss/crossentropy": 2.592529594898224, "loss/hidden": 0.0, "loss/logits": 0.17983509600162506, "loss/reg": 0.7102628350257874, "step": 130 }, { "epoch": 0.00131, "grad_norm": 0.39435604214668274, "grad_norm_var": 0.0004437530279179924, "learning_rate": 5e-05, "loss": 0.1754, "loss/crossentropy": 2.730157434940338, "loss/hidden": 0.0, "loss/logits": 0.17542827129364014, "loss/reg": 0.7097166776657104, "step": 131 }, { "epoch": 0.00132, "grad_norm": 0.4004809558391571, "grad_norm_var": 0.0004740291218968573, "learning_rate": 5e-05, "loss": 0.1858, "loss/crossentropy": 3.037446081638336, "loss/hidden": 0.0, "loss/logits": 0.18577765673398972, "loss/reg": 0.7088049054145813, "step": 132 }, { "epoch": 0.00133, "grad_norm": 0.36326301097869873, "grad_norm_var": 0.0004905736490476542, "learning_rate": 5e-05, "loss": 0.1755, "loss/crossentropy": 2.7059656977653503, "loss/hidden": 0.0, "loss/logits": 0.17547782137989998, "loss/reg": 0.7079591155052185, "step": 133 }, { "epoch": 0.00134, "grad_norm": 0.3696005642414093, "grad_norm_var": 0.0004933625381868811, "learning_rate": 5e-05, "loss": 0.183, "loss/crossentropy": 2.705469310283661, "loss/hidden": 0.0, "loss/logits": 0.18300171568989754, "loss/reg": 0.7070783376693726, "step": 134 }, { "epoch": 0.00135, "grad_norm": 0.369745671749115, "grad_norm_var": 0.0004478547976245621, "learning_rate": 5e-05, "loss": 0.1727, "loss/crossentropy": 2.796743154525757, "loss/hidden": 0.0, "loss/logits": 0.17267876863479614, "loss/reg": 0.706299364566803, "step": 135 }, { "epoch": 0.00136, "grad_norm": 0.9704347252845764, "grad_norm_var": 0.022076750770700673, "learning_rate": 5e-05, "loss": 0.2183, "loss/crossentropy": 3.161501705646515, "loss/hidden": 0.0, "loss/logits": 0.21831253170967102, "loss/reg": 0.7052822709083557, "step": 136 }, { "epoch": 0.00137, "grad_norm": 0.49051499366760254, "grad_norm_var": 0.022392065042272716, "learning_rate": 5e-05, "loss": 0.1921, "loss/crossentropy": 2.8244537115097046, "loss/hidden": 0.0, "loss/logits": 0.19208021834492683, "loss/reg": 0.7046045660972595, "step": 137 }, { "epoch": 0.00138, "grad_norm": 0.3787124454975128, "grad_norm_var": 0.022439032133147886, "learning_rate": 5e-05, "loss": 0.1653, "loss/crossentropy": 2.734581470489502, "loss/hidden": 0.0, "loss/logits": 0.16531135886907578, "loss/reg": 0.7039092779159546, "step": 138 }, { "epoch": 0.00139, "grad_norm": 0.41016826033592224, "grad_norm_var": 0.022263946678371956, "learning_rate": 5e-05, "loss": 0.1763, "loss/crossentropy": 2.7766221165657043, "loss/hidden": 0.0, "loss/logits": 0.17632220312952995, "loss/reg": 0.7032301425933838, "step": 139 }, { "epoch": 0.0014, "grad_norm": 0.4240967929363251, "grad_norm_var": 0.02211921107794302, "learning_rate": 5e-05, "loss": 0.173, "loss/crossentropy": 2.8693822622299194, "loss/hidden": 0.0, "loss/logits": 0.1729997918009758, "loss/reg": 0.7027043700218201, "step": 140 }, { "epoch": 0.00141, "grad_norm": 0.44461771845817566, "grad_norm_var": 0.02208563923367276, "learning_rate": 5e-05, "loss": 0.1716, "loss/crossentropy": 2.7406617999076843, "loss/hidden": 0.0, "loss/logits": 0.1716134212911129, "loss/reg": 0.7022019028663635, "step": 141 }, { "epoch": 0.00142, "grad_norm": 0.4856080412864685, "grad_norm_var": 0.02209372273555612, "learning_rate": 5e-05, "loss": 0.1863, "loss/crossentropy": 2.92539644241333, "loss/hidden": 0.0, "loss/logits": 0.1863207444548607, "loss/reg": 0.7016287446022034, "step": 142 }, { "epoch": 0.00143, "grad_norm": 0.4170268476009369, "grad_norm_var": 0.02160027343751687, "learning_rate": 5e-05, "loss": 0.1667, "loss/crossentropy": 2.943239390850067, "loss/hidden": 0.0, "loss/logits": 0.166742492467165, "loss/reg": 0.7010935544967651, "step": 143 }, { "epoch": 0.00144, "grad_norm": 0.560448944568634, "grad_norm_var": 0.02195809350616609, "learning_rate": 5e-05, "loss": 0.1672, "loss/crossentropy": 2.8288047909736633, "loss/hidden": 0.0, "loss/logits": 0.1672464720904827, "loss/reg": 0.7006227970123291, "step": 144 }, { "epoch": 0.00145, "grad_norm": 0.39022010564804077, "grad_norm_var": 0.02191446180867314, "learning_rate": 5e-05, "loss": 0.1826, "loss/crossentropy": 2.8854493498802185, "loss/hidden": 0.0, "loss/logits": 0.1825825609266758, "loss/reg": 0.7002730369567871, "step": 145 }, { "epoch": 0.00146, "grad_norm": 0.4051852524280548, "grad_norm_var": 0.021708542250207977, "learning_rate": 5e-05, "loss": 0.1685, "loss/crossentropy": 2.7675100564956665, "loss/hidden": 0.0, "loss/logits": 0.16847975179553032, "loss/reg": 0.6996915936470032, "step": 146 }, { "epoch": 0.00147, "grad_norm": 0.413181871175766, "grad_norm_var": 0.021579335882167275, "learning_rate": 5e-05, "loss": 0.1719, "loss/crossentropy": 2.831291675567627, "loss/hidden": 0.0, "loss/logits": 0.17188836634159088, "loss/reg": 0.6993570327758789, "step": 147 }, { "epoch": 0.00148, "grad_norm": 0.3865763545036316, "grad_norm_var": 0.02169403672512673, "learning_rate": 5e-05, "loss": 0.1716, "loss/crossentropy": 2.6290199756622314, "loss/hidden": 0.0, "loss/logits": 0.17158953100442886, "loss/reg": 0.6988732814788818, "step": 148 }, { "epoch": 0.00149, "grad_norm": 0.4035678803920746, "grad_norm_var": 0.02130277488638885, "learning_rate": 5e-05, "loss": 0.1679, "loss/crossentropy": 2.835815966129303, "loss/hidden": 0.0, "loss/logits": 0.16792400181293488, "loss/reg": 0.6982746720314026, "step": 149 }, { "epoch": 0.0015, "grad_norm": 0.3754025995731354, "grad_norm_var": 0.021236893636948084, "learning_rate": 5e-05, "loss": 0.1742, "loss/crossentropy": 2.731720507144928, "loss/hidden": 0.0, "loss/logits": 0.17424210906028748, "loss/reg": 0.6975677609443665, "step": 150 }, { "epoch": 0.00151, "grad_norm": 0.373666375875473, "grad_norm_var": 0.02119179990426129, "learning_rate": 5e-05, "loss": 0.1867, "loss/crossentropy": 2.7545694708824158, "loss/hidden": 0.0, "loss/logits": 0.18673796206712723, "loss/reg": 0.6968193650245667, "step": 151 }, { "epoch": 0.00152, "grad_norm": 0.3768315017223358, "grad_norm_var": 0.002663948314070371, "learning_rate": 5e-05, "loss": 0.1766, "loss/crossentropy": 2.771127223968506, "loss/hidden": 0.0, "loss/logits": 0.17662332952022552, "loss/reg": 0.6964417695999146, "step": 152 }, { "epoch": 0.00153, "grad_norm": 0.367779940366745, "grad_norm_var": 0.0024676734716507407, "learning_rate": 5e-05, "loss": 0.1722, "loss/crossentropy": 2.8379569053649902, "loss/hidden": 0.0, "loss/logits": 0.17223728448152542, "loss/reg": 0.6960185766220093, "step": 153 }, { "epoch": 0.00154, "grad_norm": 0.3623608946800232, "grad_norm_var": 0.002559831961509481, "learning_rate": 5e-05, "loss": 0.1603, "loss/crossentropy": 2.6386294960975647, "loss/hidden": 0.0, "loss/logits": 0.160268172621727, "loss/reg": 0.6956042647361755, "step": 154 }, { "epoch": 0.00155, "grad_norm": 0.36842337250709534, "grad_norm_var": 0.002680590833751166, "learning_rate": 5e-05, "loss": 0.1708, "loss/crossentropy": 2.6607457399368286, "loss/hidden": 0.0, "loss/logits": 0.1708206795156002, "loss/reg": 0.6953121423721313, "step": 155 }, { "epoch": 0.00156, "grad_norm": 0.3845401108264923, "grad_norm_var": 0.0027023868697072556, "learning_rate": 5e-05, "loss": 0.1648, "loss/crossentropy": 2.8014814853668213, "loss/hidden": 0.0, "loss/logits": 0.16483300551772118, "loss/reg": 0.6945253014564514, "step": 156 }, { "epoch": 0.00157, "grad_norm": 0.4067936837673187, "grad_norm_var": 0.002603172717212928, "learning_rate": 5e-05, "loss": 0.1801, "loss/crossentropy": 2.6495731472969055, "loss/hidden": 0.0, "loss/logits": 0.18006890267133713, "loss/reg": 0.6942071914672852, "step": 157 }, { "epoch": 0.00158, "grad_norm": 0.4112524688243866, "grad_norm_var": 0.002148086815628088, "learning_rate": 5e-05, "loss": 0.1841, "loss/crossentropy": 2.6930444836616516, "loss/hidden": 0.0, "loss/logits": 0.18408403918147087, "loss/reg": 0.6936368942260742, "step": 158 }, { "epoch": 0.00159, "grad_norm": 0.39026692509651184, "grad_norm_var": 0.0021328176175901435, "learning_rate": 5e-05, "loss": 0.1827, "loss/crossentropy": 2.7543540000915527, "loss/hidden": 0.0, "loss/logits": 0.18266991525888443, "loss/reg": 0.6928218603134155, "step": 159 }, { "epoch": 0.0016, "grad_norm": 0.34440305829048157, "grad_norm_var": 0.0003858333419344088, "learning_rate": 5e-05, "loss": 0.1717, "loss/crossentropy": 2.666127622127533, "loss/hidden": 0.0, "loss/logits": 0.17170501500368118, "loss/reg": 0.6924263834953308, "step": 160 }, { "epoch": 0.00161, "grad_norm": 0.42731061577796936, "grad_norm_var": 0.0004974906509257693, "learning_rate": 5e-05, "loss": 0.1895, "loss/crossentropy": 2.9741435050964355, "loss/hidden": 0.0, "loss/logits": 0.18945498019456863, "loss/reg": 0.6919137239456177, "step": 161 }, { "epoch": 0.00162, "grad_norm": 0.35118570923805237, "grad_norm_var": 0.0005512991441295986, "learning_rate": 5e-05, "loss": 0.1662, "loss/crossentropy": 2.7802159786224365, "loss/hidden": 0.0, "loss/logits": 0.16624341905117035, "loss/reg": 0.6914498805999756, "step": 162 }, { "epoch": 0.00163, "grad_norm": 0.3795294463634491, "grad_norm_var": 0.0004910126787373533, "learning_rate": 5e-05, "loss": 0.1799, "loss/crossentropy": 2.7658523321151733, "loss/hidden": 0.0, "loss/logits": 0.1798836812376976, "loss/reg": 0.6908239722251892, "step": 163 }, { "epoch": 0.00164, "grad_norm": 0.35127007961273193, "grad_norm_var": 0.0005467572640604453, "learning_rate": 5e-05, "loss": 0.1687, "loss/crossentropy": 2.6695101261138916, "loss/hidden": 0.0, "loss/logits": 0.16873782873153687, "loss/reg": 0.6900719404220581, "step": 164 }, { "epoch": 0.00165, "grad_norm": 0.3678815960884094, "grad_norm_var": 0.0005126012400185471, "learning_rate": 5e-05, "loss": 0.1625, "loss/crossentropy": 2.7602951526641846, "loss/hidden": 0.0, "loss/logits": 0.16252683475613594, "loss/reg": 0.6894409656524658, "step": 165 }, { "epoch": 0.00166, "grad_norm": 0.41981780529022217, "grad_norm_var": 0.0006238825228471375, "learning_rate": 5e-05, "loss": 0.1741, "loss/crossentropy": 2.6286023259162903, "loss/hidden": 0.0, "loss/logits": 0.17411665618419647, "loss/reg": 0.688869297504425, "step": 166 }, { "epoch": 0.00167, "grad_norm": 0.3656597137451172, "grad_norm_var": 0.0006348717691261959, "learning_rate": 5e-05, "loss": 0.1735, "loss/crossentropy": 2.630964517593384, "loss/hidden": 0.0, "loss/logits": 0.17346174269914627, "loss/reg": 0.6885258555412292, "step": 167 }, { "epoch": 0.00168, "grad_norm": 0.38391461968421936, "grad_norm_var": 0.0006352920630523766, "learning_rate": 5e-05, "loss": 0.1865, "loss/crossentropy": 2.7943974137306213, "loss/hidden": 0.0, "loss/logits": 0.18646075949072838, "loss/reg": 0.6878592371940613, "step": 168 }, { "epoch": 0.00169, "grad_norm": 0.4038885235786438, "grad_norm_var": 0.00065722904435123, "learning_rate": 5e-05, "loss": 0.1926, "loss/crossentropy": 2.734152913093567, "loss/hidden": 0.0, "loss/logits": 0.19264819100499153, "loss/reg": 0.6875417232513428, "step": 169 }, { "epoch": 0.0017, "grad_norm": 0.3950437903404236, "grad_norm_var": 0.0006366381143967074, "learning_rate": 5e-05, "loss": 0.1827, "loss/crossentropy": 2.8189674615859985, "loss/hidden": 0.0, "loss/logits": 0.18270733579993248, "loss/reg": 0.687127411365509, "step": 170 }, { "epoch": 0.00171, "grad_norm": 0.3725055456161499, "grad_norm_var": 0.0006289571226276973, "learning_rate": 5e-05, "loss": 0.1728, "loss/crossentropy": 2.69544118642807, "loss/hidden": 0.0, "loss/logits": 0.17280659452080727, "loss/reg": 0.6868485808372498, "step": 171 }, { "epoch": 0.00172, "grad_norm": 0.3815220594406128, "grad_norm_var": 0.0006295923546466871, "learning_rate": 5e-05, "loss": 0.176, "loss/crossentropy": 2.7692622542381287, "loss/hidden": 0.0, "loss/logits": 0.17604272067546844, "loss/reg": 0.6865261793136597, "step": 172 }, { "epoch": 0.00173, "grad_norm": 0.3643653392791748, "grad_norm_var": 0.0006160716024297094, "learning_rate": 5e-05, "loss": 0.1662, "loss/crossentropy": 2.8946388959884644, "loss/hidden": 0.0, "loss/logits": 0.16618424281477928, "loss/reg": 0.6858268976211548, "step": 173 }, { "epoch": 0.00174, "grad_norm": 0.5440117716789246, "grad_norm_var": 0.0022378559009343492, "learning_rate": 5e-05, "loss": 0.1697, "loss/crossentropy": 2.894260048866272, "loss/hidden": 0.0, "loss/logits": 0.1696738563477993, "loss/reg": 0.6854670643806458, "step": 174 }, { "epoch": 0.00175, "grad_norm": 0.39967331290245056, "grad_norm_var": 0.002243518711865026, "learning_rate": 5e-05, "loss": 0.185, "loss/crossentropy": 2.930911064147949, "loss/hidden": 0.0, "loss/logits": 0.18498113378882408, "loss/reg": 0.6849290728569031, "step": 175 }, { "epoch": 0.00176, "grad_norm": 0.3877590596675873, "grad_norm_var": 0.0020930863780073733, "learning_rate": 5e-05, "loss": 0.1779, "loss/crossentropy": 2.7704232335090637, "loss/hidden": 0.0, "loss/logits": 0.17786122485995293, "loss/reg": 0.6845774054527283, "step": 176 }, { "epoch": 0.00177, "grad_norm": 0.3919845223426819, "grad_norm_var": 0.0020116346618934207, "learning_rate": 5e-05, "loss": 0.176, "loss/crossentropy": 2.9073404669761658, "loss/hidden": 0.0, "loss/logits": 0.176034078001976, "loss/reg": 0.6841259598731995, "step": 177 }, { "epoch": 0.00178, "grad_norm": 0.4280073642730713, "grad_norm_var": 0.0019701003079437267, "learning_rate": 5e-05, "loss": 0.1716, "loss/crossentropy": 2.872975468635559, "loss/hidden": 0.0, "loss/logits": 0.17160987108945847, "loss/reg": 0.6835975646972656, "step": 178 }, { "epoch": 0.00179, "grad_norm": 0.4368959963321686, "grad_norm_var": 0.002049402735067322, "learning_rate": 5e-05, "loss": 0.1714, "loss/crossentropy": 2.8378754258155823, "loss/hidden": 0.0, "loss/logits": 0.17144308984279633, "loss/reg": 0.683133602142334, "step": 179 }, { "epoch": 0.0018, "grad_norm": 0.4063540995121002, "grad_norm_var": 0.0018838065485936326, "learning_rate": 5e-05, "loss": 0.1773, "loss/crossentropy": 2.7291943430900574, "loss/hidden": 0.0, "loss/logits": 0.17731739208102226, "loss/reg": 0.6829569339752197, "step": 180 }, { "epoch": 0.00181, "grad_norm": 0.4153757095336914, "grad_norm_var": 0.0018018895995528046, "learning_rate": 5e-05, "loss": 0.191, "loss/crossentropy": 2.6606562733650208, "loss/hidden": 0.0, "loss/logits": 0.19101158529520035, "loss/reg": 0.68250972032547, "step": 181 }, { "epoch": 0.00182, "grad_norm": 0.4478014409542084, "grad_norm_var": 0.0019022069501489843, "learning_rate": 5e-05, "loss": 0.1815, "loss/crossentropy": 2.9209048748016357, "loss/hidden": 0.0, "loss/logits": 0.18151485547423363, "loss/reg": 0.6823130249977112, "step": 182 }, { "epoch": 0.00183, "grad_norm": 0.3665817677974701, "grad_norm_var": 0.0018970796223750383, "learning_rate": 5e-05, "loss": 0.1725, "loss/crossentropy": 2.709440767765045, "loss/hidden": 0.0, "loss/logits": 0.17252622544765472, "loss/reg": 0.6818681955337524, "step": 183 }, { "epoch": 0.00184, "grad_norm": 0.35465773940086365, "grad_norm_var": 0.0020439680833119274, "learning_rate": 5e-05, "loss": 0.1718, "loss/crossentropy": 2.7819241881370544, "loss/hidden": 0.0, "loss/logits": 0.17179736122488976, "loss/reg": 0.6816492080688477, "step": 184 }, { "epoch": 0.00185, "grad_norm": 0.3441713750362396, "grad_norm_var": 0.002283876890223254, "learning_rate": 5e-05, "loss": 0.1696, "loss/crossentropy": 2.7352775931358337, "loss/hidden": 0.0, "loss/logits": 0.1695723831653595, "loss/reg": 0.6816331148147583, "step": 185 }, { "epoch": 0.00186, "grad_norm": 0.38126233220100403, "grad_norm_var": 0.0023090706802151714, "learning_rate": 5e-05, "loss": 0.181, "loss/crossentropy": 2.6879822611808777, "loss/hidden": 0.0, "loss/logits": 0.18103832751512527, "loss/reg": 0.6814532279968262, "step": 186 }, { "epoch": 0.00187, "grad_norm": 0.3787057101726532, "grad_norm_var": 0.0022875592365704024, "learning_rate": 5e-05, "loss": 0.1718, "loss/crossentropy": 2.8167298436164856, "loss/hidden": 0.0, "loss/logits": 0.1718391291797161, "loss/reg": 0.6813297271728516, "step": 187 }, { "epoch": 0.00188, "grad_norm": 0.36861035227775574, "grad_norm_var": 0.0023329239228645827, "learning_rate": 5e-05, "loss": 0.1726, "loss/crossentropy": 2.7486817240715027, "loss/hidden": 0.0, "loss/logits": 0.17257895320653915, "loss/reg": 0.681362509727478, "step": 188 }, { "epoch": 0.00189, "grad_norm": 0.36589452624320984, "grad_norm_var": 0.0023255977975845444, "learning_rate": 5e-05, "loss": 0.1691, "loss/crossentropy": 2.7861050367355347, "loss/hidden": 0.0, "loss/logits": 0.16913071274757385, "loss/reg": 0.6810048818588257, "step": 189 }, { "epoch": 0.0019, "grad_norm": 0.3718152642250061, "grad_norm_var": 0.0008978484496580415, "learning_rate": 5e-05, "loss": 0.1754, "loss/crossentropy": 2.9670495986938477, "loss/hidden": 0.0, "loss/logits": 0.175355713814497, "loss/reg": 0.6807791590690613, "step": 190 }, { "epoch": 0.00191, "grad_norm": 0.35371074080467224, "grad_norm_var": 0.0009727279362613306, "learning_rate": 5e-05, "loss": 0.1648, "loss/crossentropy": 2.7362327575683594, "loss/hidden": 0.0, "loss/logits": 0.16481352970004082, "loss/reg": 0.6803683638572693, "step": 191 }, { "epoch": 0.00192, "grad_norm": 0.3793174624443054, "grad_norm_var": 0.000976861155079271, "learning_rate": 5e-05, "loss": 0.1733, "loss/crossentropy": 2.7576581835746765, "loss/hidden": 0.0, "loss/logits": 0.17326407879590988, "loss/reg": 0.6801426410675049, "step": 192 }, { "epoch": 0.00193, "grad_norm": 0.3451940715312958, "grad_norm_var": 0.001082265390337141, "learning_rate": 5e-05, "loss": 0.1751, "loss/crossentropy": 2.7474693655967712, "loss/hidden": 0.0, "loss/logits": 0.17509106546640396, "loss/reg": 0.6794537901878357, "step": 193 }, { "epoch": 0.00194, "grad_norm": 0.38012632727622986, "grad_norm_var": 0.0009447454267972888, "learning_rate": 5e-05, "loss": 0.1737, "loss/crossentropy": 2.9815833568573, "loss/hidden": 0.0, "loss/logits": 0.17372199892997742, "loss/reg": 0.6790013909339905, "step": 194 }, { "epoch": 0.00195, "grad_norm": 0.37810343503952026, "grad_norm_var": 0.000722844262981311, "learning_rate": 5e-05, "loss": 0.1865, "loss/crossentropy": 2.6913029551506042, "loss/hidden": 0.0, "loss/logits": 0.1864505223929882, "loss/reg": 0.6783783435821533, "step": 195 }, { "epoch": 0.00196, "grad_norm": 0.3766270577907562, "grad_norm_var": 0.0006631349171885385, "learning_rate": 5e-05, "loss": 0.1654, "loss/crossentropy": 2.788192331790924, "loss/hidden": 0.0, "loss/logits": 0.16537169739603996, "loss/reg": 0.6778481006622314, "step": 196 }, { "epoch": 0.00197, "grad_norm": 0.4204989969730377, "grad_norm_var": 0.0006920166259448284, "learning_rate": 5e-05, "loss": 0.1853, "loss/crossentropy": 2.8584505915641785, "loss/hidden": 0.0, "loss/logits": 0.18528714403510094, "loss/reg": 0.6771920919418335, "step": 197 }, { "epoch": 0.00198, "grad_norm": 0.3730599880218506, "grad_norm_var": 0.0003238006419232035, "learning_rate": 5e-05, "loss": 0.1673, "loss/crossentropy": 2.8100345134735107, "loss/hidden": 0.0, "loss/logits": 0.1672576330602169, "loss/reg": 0.676322340965271, "step": 198 }, { "epoch": 0.00199, "grad_norm": 0.3785562515258789, "grad_norm_var": 0.00032547505049643026, "learning_rate": 5e-05, "loss": 0.1708, "loss/crossentropy": 2.659552276134491, "loss/hidden": 0.0, "loss/logits": 0.170790895819664, "loss/reg": 0.6757908463478088, "step": 199 }, { "epoch": 0.002, "grad_norm": 0.4481094181537628, "grad_norm_var": 0.0006565274590131195, "learning_rate": 5e-05, "loss": 0.185, "loss/crossentropy": 2.7589192390441895, "loss/hidden": 0.0, "loss/logits": 0.18503189086914062, "loss/reg": 0.6753812432289124, "step": 200 }, { "epoch": 0.00201, "grad_norm": 0.4667153060436249, "grad_norm_var": 0.0010466850460476907, "learning_rate": 5e-05, "loss": 0.1652, "loss/crossentropy": 3.162939667701721, "loss/hidden": 0.0, "loss/logits": 0.165226511657238, "loss/reg": 0.6747428774833679, "step": 201 }, { "epoch": 0.00202, "grad_norm": 0.3977454602718353, "grad_norm_var": 0.0010545850707442406, "learning_rate": 5e-05, "loss": 0.1852, "loss/crossentropy": 2.798895478248596, "loss/hidden": 0.0, "loss/logits": 0.18524322658777237, "loss/reg": 0.6746957898139954, "step": 202 }, { "epoch": 0.00203, "grad_norm": 0.35303735733032227, "grad_norm_var": 0.00112218089236773, "learning_rate": 5e-05, "loss": 0.1671, "loss/crossentropy": 2.879501461982727, "loss/hidden": 0.0, "loss/logits": 0.1670902520418167, "loss/reg": 0.6744116544723511, "step": 203 }, { "epoch": 0.00204, "grad_norm": 0.36885592341423035, "grad_norm_var": 0.0011216539077677722, "learning_rate": 5e-05, "loss": 0.1712, "loss/crossentropy": 2.7386632561683655, "loss/hidden": 0.0, "loss/logits": 0.17121050879359245, "loss/reg": 0.6739590764045715, "step": 204 }, { "epoch": 0.00205, "grad_norm": 0.3499481678009033, "grad_norm_var": 0.0011778186905237243, "learning_rate": 5e-05, "loss": 0.1628, "loss/crossentropy": 2.7857924699783325, "loss/hidden": 0.0, "loss/logits": 0.16278192773461342, "loss/reg": 0.6734333634376526, "step": 205 }, { "epoch": 0.00206, "grad_norm": 0.3964221477508545, "grad_norm_var": 0.0011762140398453172, "learning_rate": 5e-05, "loss": 0.1846, "loss/crossentropy": 2.79485422372818, "loss/hidden": 0.0, "loss/logits": 0.18456053361296654, "loss/reg": 0.6729143857955933, "step": 206 }, { "epoch": 0.00207, "grad_norm": 0.40056177973747253, "grad_norm_var": 0.001115591345996884, "learning_rate": 5e-05, "loss": 0.1793, "loss/crossentropy": 2.7311152815818787, "loss/hidden": 0.0, "loss/logits": 0.1793210655450821, "loss/reg": 0.6723442077636719, "step": 207 }, { "epoch": 0.00208, "grad_norm": 0.3572394847869873, "grad_norm_var": 0.0011725128955176781, "learning_rate": 5e-05, "loss": 0.1622, "loss/crossentropy": 2.694546699523926, "loss/hidden": 0.0, "loss/logits": 0.16219522804021835, "loss/reg": 0.671970784664154, "step": 208 }, { "epoch": 0.00209, "grad_norm": 0.35748156905174255, "grad_norm_var": 0.0011135800302239595, "learning_rate": 5e-05, "loss": 0.1661, "loss/crossentropy": 2.6286928057670593, "loss/hidden": 0.0, "loss/logits": 0.16605138033628464, "loss/reg": 0.6714334487915039, "step": 209 }, { "epoch": 0.0021, "grad_norm": 0.5893498659133911, "grad_norm_var": 0.003638400957963824, "learning_rate": 5e-05, "loss": 0.1775, "loss/crossentropy": 2.65495902299881, "loss/hidden": 0.0, "loss/logits": 0.17752325907349586, "loss/reg": 0.6708847880363464, "step": 210 }, { "epoch": 0.00211, "grad_norm": 0.402476966381073, "grad_norm_var": 0.0036018699571415493, "learning_rate": 5e-05, "loss": 0.1673, "loss/crossentropy": 2.718632161617279, "loss/hidden": 0.0, "loss/logits": 0.16728456690907478, "loss/reg": 0.6703056693077087, "step": 211 }, { "epoch": 0.00212, "grad_norm": 0.40408509969711304, "grad_norm_var": 0.00355502710923468, "learning_rate": 5e-05, "loss": 0.1687, "loss/crossentropy": 2.6840004920959473, "loss/hidden": 0.0, "loss/logits": 0.1687101610004902, "loss/reg": 0.6696141958236694, "step": 212 }, { "epoch": 0.00213, "grad_norm": 0.46999391913414, "grad_norm_var": 0.003816959221830507, "learning_rate": 5e-05, "loss": 0.1774, "loss/crossentropy": 2.933770179748535, "loss/hidden": 0.0, "loss/logits": 0.17744172737002373, "loss/reg": 0.6691727638244629, "step": 213 }, { "epoch": 0.00214, "grad_norm": 0.4497075378894806, "grad_norm_var": 0.0038362346290080986, "learning_rate": 5e-05, "loss": 0.1797, "loss/crossentropy": 2.884861409664154, "loss/hidden": 0.0, "loss/logits": 0.17972879484295845, "loss/reg": 0.6685946583747864, "step": 214 }, { "epoch": 0.00215, "grad_norm": 0.4725947380065918, "grad_norm_var": 0.003970946655874267, "learning_rate": 5e-05, "loss": 0.1824, "loss/crossentropy": 2.7148566842079163, "loss/hidden": 0.0, "loss/logits": 0.1823594570159912, "loss/reg": 0.6678350567817688, "step": 215 }, { "epoch": 0.00216, "grad_norm": 0.3531753420829773, "grad_norm_var": 0.004150197714351977, "learning_rate": 5e-05, "loss": 0.1607, "loss/crossentropy": 2.8040355443954468, "loss/hidden": 0.0, "loss/logits": 0.1607324182987213, "loss/reg": 0.6670687198638916, "step": 216 }, { "epoch": 0.00217, "grad_norm": 0.3904282748699188, "grad_norm_var": 0.003955728405321291, "learning_rate": 5e-05, "loss": 0.1762, "loss/crossentropy": 2.583940625190735, "loss/hidden": 0.0, "loss/logits": 0.1762019246816635, "loss/reg": 0.6662251353263855, "step": 217 }, { "epoch": 0.00218, "grad_norm": 0.41466209292411804, "grad_norm_var": 0.003952584516121797, "learning_rate": 5e-05, "loss": 0.1931, "loss/crossentropy": 2.6736027598381042, "loss/hidden": 0.0, "loss/logits": 0.1931401677429676, "loss/reg": 0.6657095551490784, "step": 218 }, { "epoch": 0.00219, "grad_norm": 0.3585696220397949, "grad_norm_var": 0.003913861864300646, "learning_rate": 5e-05, "loss": 0.1673, "loss/crossentropy": 2.7951349020004272, "loss/hidden": 0.0, "loss/logits": 0.1673244759440422, "loss/reg": 0.6648316383361816, "step": 219 }, { "epoch": 0.0022, "grad_norm": 0.3895896375179291, "grad_norm_var": 0.0038312112496423152, "learning_rate": 5e-05, "loss": 0.1818, "loss/crossentropy": 2.711538791656494, "loss/hidden": 0.0, "loss/logits": 0.18181969970464706, "loss/reg": 0.6639692187309265, "step": 220 }, { "epoch": 0.00221, "grad_norm": 0.3833865821361542, "grad_norm_var": 0.0036343906659896063, "learning_rate": 5e-05, "loss": 0.1758, "loss/crossentropy": 2.7108349204063416, "loss/hidden": 0.0, "loss/logits": 0.17578613013029099, "loss/reg": 0.6633884310722351, "step": 221 }, { "epoch": 0.00222, "grad_norm": 0.4483087360858917, "grad_norm_var": 0.0036958672434330374, "learning_rate": 5e-05, "loss": 0.176, "loss/crossentropy": 2.7732558846473694, "loss/hidden": 0.0, "loss/logits": 0.17597418650984764, "loss/reg": 0.6626039743423462, "step": 222 }, { "epoch": 0.00223, "grad_norm": 0.38295701146125793, "grad_norm_var": 0.0037493649851021104, "learning_rate": 5e-05, "loss": 0.1759, "loss/crossentropy": 2.8829389214515686, "loss/hidden": 0.0, "loss/logits": 0.17588848248124123, "loss/reg": 0.6619814038276672, "step": 223 }, { "epoch": 0.00224, "grad_norm": 0.37519922852516174, "grad_norm_var": 0.003633602965619135, "learning_rate": 5e-05, "loss": 0.1724, "loss/crossentropy": 2.652667462825775, "loss/hidden": 0.0, "loss/logits": 0.17242146655917168, "loss/reg": 0.6614968776702881, "step": 224 }, { "epoch": 0.00225, "grad_norm": 0.4597223401069641, "grad_norm_var": 0.0035011540204966255, "learning_rate": 5e-05, "loss": 0.1908, "loss/crossentropy": 2.876725196838379, "loss/hidden": 0.0, "loss/logits": 0.19080688059329987, "loss/reg": 0.6610702872276306, "step": 225 }, { "epoch": 0.00226, "grad_norm": 0.4324163496494293, "grad_norm_var": 0.0015285134686143565, "learning_rate": 5e-05, "loss": 0.1681, "loss/crossentropy": 2.7941476106643677, "loss/hidden": 0.0, "loss/logits": 0.1680818684399128, "loss/reg": 0.6605817079544067, "step": 226 }, { "epoch": 0.00227, "grad_norm": 0.434567928314209, "grad_norm_var": 0.0015533947140733862, "learning_rate": 5e-05, "loss": 0.1727, "loss/crossentropy": 2.755871832370758, "loss/hidden": 0.0, "loss/logits": 0.17265953496098518, "loss/reg": 0.6598991751670837, "step": 227 }, { "epoch": 0.00228, "grad_norm": 0.43576470017433167, "grad_norm_var": 0.0015754632935476706, "learning_rate": 5e-05, "loss": 0.1877, "loss/crossentropy": 2.6969935297966003, "loss/hidden": 0.0, "loss/logits": 0.18768608570098877, "loss/reg": 0.659035861492157, "step": 228 }, { "epoch": 0.00229, "grad_norm": 0.40100330114364624, "grad_norm_var": 0.0013734190770464969, "learning_rate": 5e-05, "loss": 0.1827, "loss/crossentropy": 2.622776746749878, "loss/hidden": 0.0, "loss/logits": 0.18265309557318687, "loss/reg": 0.6584252119064331, "step": 229 }, { "epoch": 0.0023, "grad_norm": 0.39311620593070984, "grad_norm_var": 0.0012843668657693374, "learning_rate": 5e-05, "loss": 0.1815, "loss/crossentropy": 2.8122403621673584, "loss/hidden": 0.0, "loss/logits": 0.18154963850975037, "loss/reg": 0.658037543296814, "step": 230 }, { "epoch": 0.00231, "grad_norm": 0.36856982111930847, "grad_norm_var": 0.0010625624756416153, "learning_rate": 5e-05, "loss": 0.175, "loss/crossentropy": 2.760325074195862, "loss/hidden": 0.0, "loss/logits": 0.17502610757946968, "loss/reg": 0.6573387980461121, "step": 231 }, { "epoch": 0.00232, "grad_norm": 0.37491267919540405, "grad_norm_var": 0.0009524987891143533, "learning_rate": 5e-05, "loss": 0.1697, "loss/crossentropy": 2.7856059074401855, "loss/hidden": 0.0, "loss/logits": 0.16968399658799171, "loss/reg": 0.6570595502853394, "step": 232 }, { "epoch": 0.00233, "grad_norm": 0.36535993218421936, "grad_norm_var": 0.0010327874020304292, "learning_rate": 5e-05, "loss": 0.1716, "loss/crossentropy": 2.596544563770294, "loss/hidden": 0.0, "loss/logits": 0.1715911477804184, "loss/reg": 0.6560950875282288, "step": 233 }, { "epoch": 0.00234, "grad_norm": 0.3861640989780426, "grad_norm_var": 0.0010321337616675864, "learning_rate": 5e-05, "loss": 0.1769, "loss/crossentropy": 2.9150387048721313, "loss/hidden": 0.0, "loss/logits": 0.1769140511751175, "loss/reg": 0.6556986570358276, "step": 234 }, { "epoch": 0.00235, "grad_norm": 0.4403046667575836, "grad_norm_var": 0.0010052419875462792, "learning_rate": 5e-05, "loss": 0.1952, "loss/crossentropy": 2.6827760338783264, "loss/hidden": 0.0, "loss/logits": 0.1951572559773922, "loss/reg": 0.6551769971847534, "step": 235 }, { "epoch": 0.00236, "grad_norm": 0.4245564639568329, "grad_norm_var": 0.0010123350605352114, "learning_rate": 5e-05, "loss": 0.1655, "loss/crossentropy": 2.9303322434425354, "loss/hidden": 0.0, "loss/logits": 0.16554568335413933, "loss/reg": 0.654656708240509, "step": 236 }, { "epoch": 0.00237, "grad_norm": 0.388759970664978, "grad_norm_var": 0.0009974765523548812, "learning_rate": 5e-05, "loss": 0.1606, "loss/crossentropy": 2.832023561000824, "loss/hidden": 0.0, "loss/logits": 0.16056211292743683, "loss/reg": 0.6540954113006592, "step": 237 }, { "epoch": 0.00238, "grad_norm": 0.41993674635887146, "grad_norm_var": 0.0008914441801375461, "learning_rate": 5e-05, "loss": 0.1869, "loss/crossentropy": 2.7374658584594727, "loss/hidden": 0.0, "loss/logits": 0.18690869584679604, "loss/reg": 0.6537821292877197, "step": 238 }, { "epoch": 0.00239, "grad_norm": 0.39730304479599, "grad_norm_var": 0.0008617474116029052, "learning_rate": 5e-05, "loss": 0.1766, "loss/crossentropy": 2.638366162776947, "loss/hidden": 0.0, "loss/logits": 0.1766064204275608, "loss/reg": 0.653330385684967, "step": 239 }, { "epoch": 0.0024, "grad_norm": 0.3524472415447235, "grad_norm_var": 0.000987852143074619, "learning_rate": 5e-05, "loss": 0.1704, "loss/crossentropy": 2.790610194206238, "loss/hidden": 0.0, "loss/logits": 0.1703929863870144, "loss/reg": 0.6529704332351685, "step": 240 }, { "epoch": 0.00241, "grad_norm": 0.4325718581676483, "grad_norm_var": 0.0008346728942040138, "learning_rate": 5e-05, "loss": 0.1743, "loss/crossentropy": 2.7924094200134277, "loss/hidden": 0.0, "loss/logits": 0.17433909326791763, "loss/reg": 0.6523910164833069, "step": 241 }, { "epoch": 0.00242, "grad_norm": 0.4209471046924591, "grad_norm_var": 0.0007978865097764242, "learning_rate": 5e-05, "loss": 0.1851, "loss/crossentropy": 2.7974682450294495, "loss/hidden": 0.0, "loss/logits": 0.18506118655204773, "loss/reg": 0.6519649624824524, "step": 242 }, { "epoch": 0.00243, "grad_norm": 0.3542281985282898, "grad_norm_var": 0.000855293855762614, "learning_rate": 5e-05, "loss": 0.17, "loss/crossentropy": 2.750208020210266, "loss/hidden": 0.0, "loss/logits": 0.16999417543411255, "loss/reg": 0.65160071849823, "step": 243 }, { "epoch": 0.00244, "grad_norm": 0.3767947554588318, "grad_norm_var": 0.0007697802694895728, "learning_rate": 5e-05, "loss": 0.1734, "loss/crossentropy": 2.709768235683441, "loss/hidden": 0.0, "loss/logits": 0.17342101782560349, "loss/reg": 0.6510489583015442, "step": 244 }, { "epoch": 0.00245, "grad_norm": 0.3745100200176239, "grad_norm_var": 0.0007873591972725043, "learning_rate": 5e-05, "loss": 0.182, "loss/crossentropy": 2.6971641778945923, "loss/hidden": 0.0, "loss/logits": 0.1819848120212555, "loss/reg": 0.6506956219673157, "step": 245 }, { "epoch": 0.00246, "grad_norm": 0.35764631628990173, "grad_norm_var": 0.0008602639155582422, "learning_rate": 5e-05, "loss": 0.1655, "loss/crossentropy": 2.7022847533226013, "loss/hidden": 0.0, "loss/logits": 0.16549457237124443, "loss/reg": 0.650374174118042, "step": 246 }, { "epoch": 0.00247, "grad_norm": 0.3772222697734833, "grad_norm_var": 0.0008405794206748991, "learning_rate": 5e-05, "loss": 0.176, "loss/crossentropy": 2.8548359274864197, "loss/hidden": 0.0, "loss/logits": 0.17599471285939217, "loss/reg": 0.6496487259864807, "step": 247 }, { "epoch": 0.00248, "grad_norm": 0.3940555453300476, "grad_norm_var": 0.0008243891814430786, "learning_rate": 5e-05, "loss": 0.1865, "loss/crossentropy": 2.7054933309555054, "loss/hidden": 0.0, "loss/logits": 0.18652822077274323, "loss/reg": 0.6493033766746521, "step": 248 }, { "epoch": 0.00249, "grad_norm": 0.3884677588939667, "grad_norm_var": 0.000777453215041742, "learning_rate": 5e-05, "loss": 0.1815, "loss/crossentropy": 2.7695940732955933, "loss/hidden": 0.0, "loss/logits": 0.18154451251029968, "loss/reg": 0.6485955119132996, "step": 249 }, { "epoch": 0.0025, "grad_norm": 0.3557443618774414, "grad_norm_var": 0.0008624861411241932, "learning_rate": 5e-05, "loss": 0.1662, "loss/crossentropy": 2.7343984246253967, "loss/hidden": 0.0, "loss/logits": 0.16622266545891762, "loss/reg": 0.6481377482414246, "step": 250 }, { "epoch": 0.00251, "grad_norm": 0.3897189795970917, "grad_norm_var": 0.0006896577417839112, "learning_rate": 5e-05, "loss": 0.1905, "loss/crossentropy": 2.7266823649406433, "loss/hidden": 0.0, "loss/logits": 0.19054722413420677, "loss/reg": 0.6476472020149231, "step": 251 }, { "epoch": 0.00252, "grad_norm": 0.3550770878791809, "grad_norm_var": 0.0006509244563425498, "learning_rate": 5e-05, "loss": 0.1652, "loss/crossentropy": 2.761192739009857, "loss/hidden": 0.0, "loss/logits": 0.16522908583283424, "loss/reg": 0.6472364068031311, "step": 252 }, { "epoch": 0.00253, "grad_norm": 0.3814965486526489, "grad_norm_var": 0.0006490933109925323, "learning_rate": 5e-05, "loss": 0.1812, "loss/crossentropy": 2.714184880256653, "loss/hidden": 0.0, "loss/logits": 0.18124865368008614, "loss/reg": 0.6466885805130005, "step": 253 }, { "epoch": 0.00254, "grad_norm": 0.38684922456741333, "grad_norm_var": 0.0005546108749522549, "learning_rate": 5e-05, "loss": 0.1789, "loss/crossentropy": 2.7751963138580322, "loss/hidden": 0.0, "loss/logits": 0.17888568341732025, "loss/reg": 0.6462445259094238, "step": 254 }, { "epoch": 0.00255, "grad_norm": 0.402862012386322, "grad_norm_var": 0.0005686686079565906, "learning_rate": 5e-05, "loss": 0.1828, "loss/crossentropy": 2.669034719467163, "loss/hidden": 0.0, "loss/logits": 0.18275145441293716, "loss/reg": 0.6458649635314941, "step": 255 }, { "epoch": 0.00256, "grad_norm": 0.3681320548057556, "grad_norm_var": 0.0005237254329865141, "learning_rate": 5e-05, "loss": 0.1749, "loss/crossentropy": 2.9366363286972046, "loss/hidden": 0.0, "loss/logits": 0.17491309717297554, "loss/reg": 0.6452223658561707, "step": 256 }, { "epoch": 0.00257, "grad_norm": 0.34907689690589905, "grad_norm_var": 0.0003994477584627417, "learning_rate": 5e-05, "loss": 0.164, "loss/crossentropy": 2.7810651063919067, "loss/hidden": 0.0, "loss/logits": 0.16401175409555435, "loss/reg": 0.6447550654411316, "step": 257 }, { "epoch": 0.00258, "grad_norm": 0.34686240553855896, "grad_norm_var": 0.00030888582224503823, "learning_rate": 5e-05, "loss": 0.1575, "loss/crossentropy": 2.7976897954940796, "loss/hidden": 0.0, "loss/logits": 0.15751435607671738, "loss/reg": 0.6442734599113464, "step": 258 }, { "epoch": 0.00259, "grad_norm": 0.4449962377548218, "grad_norm_var": 0.0006036300942528568, "learning_rate": 5e-05, "loss": 0.1892, "loss/crossentropy": 2.7374861240386963, "loss/hidden": 0.0, "loss/logits": 0.1891784816980362, "loss/reg": 0.643750786781311, "step": 259 }, { "epoch": 0.0026, "grad_norm": 0.34992745518684387, "grad_norm_var": 0.0006534020233418018, "learning_rate": 5e-05, "loss": 0.165, "loss/crossentropy": 2.643241226673126, "loss/hidden": 0.0, "loss/logits": 0.16503603011369705, "loss/reg": 0.6430286169052124, "step": 260 }, { "epoch": 0.00261, "grad_norm": 0.3633124530315399, "grad_norm_var": 0.0006640832525579861, "learning_rate": 5e-05, "loss": 0.1757, "loss/crossentropy": 2.8266258239746094, "loss/hidden": 0.0, "loss/logits": 0.17570256814360619, "loss/reg": 0.6423153281211853, "step": 261 }, { "epoch": 0.00262, "grad_norm": 0.373819500207901, "grad_norm_var": 0.000641466716852553, "learning_rate": 5e-05, "loss": 0.1791, "loss/crossentropy": 2.549090623855591, "loss/hidden": 0.0, "loss/logits": 0.1790906935930252, "loss/reg": 0.6415820717811584, "step": 262 }, { "epoch": 0.00263, "grad_norm": 0.3755287826061249, "grad_norm_var": 0.000641533971293029, "learning_rate": 5e-05, "loss": 0.1746, "loss/crossentropy": 2.72516793012619, "loss/hidden": 0.0, "loss/logits": 0.17459767684340477, "loss/reg": 0.6409164071083069, "step": 263 }, { "epoch": 0.00264, "grad_norm": 0.3942868709564209, "grad_norm_var": 0.0006420750734748607, "learning_rate": 5e-05, "loss": 0.1827, "loss/crossentropy": 2.9032419323921204, "loss/hidden": 0.0, "loss/logits": 0.18272600322961807, "loss/reg": 0.6398616433143616, "step": 264 }, { "epoch": 0.00265, "grad_norm": 0.3677438199520111, "grad_norm_var": 0.0006362212568079043, "learning_rate": 5e-05, "loss": 0.1741, "loss/crossentropy": 2.7018914818763733, "loss/hidden": 0.0, "loss/logits": 0.1741430051624775, "loss/reg": 0.6394724249839783, "step": 265 }, { "epoch": 0.00266, "grad_norm": 0.36462029814720154, "grad_norm_var": 0.0006179549216998444, "learning_rate": 5e-05, "loss": 0.1758, "loss/crossentropy": 2.721017599105835, "loss/hidden": 0.0, "loss/logits": 0.1757526956498623, "loss/reg": 0.6391441226005554, "step": 266 }, { "epoch": 0.00267, "grad_norm": 0.34848400950431824, "grad_norm_var": 0.0006482176890186709, "learning_rate": 5e-05, "loss": 0.165, "loss/crossentropy": 2.833631992340088, "loss/hidden": 0.0, "loss/logits": 0.16502562910318375, "loss/reg": 0.6385470032691956, "step": 267 }, { "epoch": 0.00268, "grad_norm": 0.39300742745399475, "grad_norm_var": 0.0006458898076078986, "learning_rate": 5e-05, "loss": 0.1792, "loss/crossentropy": 2.710054636001587, "loss/hidden": 0.0, "loss/logits": 0.17915990576148033, "loss/reg": 0.638047456741333, "step": 268 }, { "epoch": 0.00269, "grad_norm": 0.364775151014328, "grad_norm_var": 0.0006504145762147417, "learning_rate": 5e-05, "loss": 0.1625, "loss/crossentropy": 2.716250777244568, "loss/hidden": 0.0, "loss/logits": 0.16248183324933052, "loss/reg": 0.6376389861106873, "step": 269 }, { "epoch": 0.0027, "grad_norm": 0.40892544388771057, "grad_norm_var": 0.0007168041400044789, "learning_rate": 5e-05, "loss": 0.1821, "loss/crossentropy": 2.953763723373413, "loss/hidden": 0.0, "loss/logits": 0.18213526904582977, "loss/reg": 0.6371251940727234, "step": 270 }, { "epoch": 0.00271, "grad_norm": 0.382039874792099, "grad_norm_var": 0.0006693877345664216, "learning_rate": 5e-05, "loss": 0.1828, "loss/crossentropy": 2.814274847507477, "loss/hidden": 0.0, "loss/logits": 0.18282637745141983, "loss/reg": 0.6366888284683228, "step": 271 }, { "epoch": 0.00272, "grad_norm": 0.3807635009288788, "grad_norm_var": 0.0006682624875640642, "learning_rate": 5e-05, "loss": 0.1665, "loss/crossentropy": 2.680392026901245, "loss/hidden": 0.0, "loss/logits": 0.16653598099946976, "loss/reg": 0.6363694071769714, "step": 272 }, { "epoch": 0.00273, "grad_norm": 0.3969613313674927, "grad_norm_var": 0.0006428013286217375, "learning_rate": 5e-05, "loss": 0.1861, "loss/crossentropy": 2.758915603160858, "loss/hidden": 0.0, "loss/logits": 0.18609648942947388, "loss/reg": 0.63584965467453, "step": 273 }, { "epoch": 0.00274, "grad_norm": 0.366432785987854, "grad_norm_var": 0.0005841752824937411, "learning_rate": 5e-05, "loss": 0.1648, "loss/crossentropy": 2.743411898612976, "loss/hidden": 0.0, "loss/logits": 0.1647852137684822, "loss/reg": 0.6358827352523804, "step": 274 }, { "epoch": 0.00275, "grad_norm": 0.42674562335014343, "grad_norm_var": 0.00044616485828952537, "learning_rate": 5e-05, "loss": 0.1817, "loss/crossentropy": 2.8236257433891296, "loss/hidden": 0.0, "loss/logits": 0.18169068917632103, "loss/reg": 0.6354775428771973, "step": 275 }, { "epoch": 0.00276, "grad_norm": 0.3811703324317932, "grad_norm_var": 0.00038778924331944373, "learning_rate": 5e-05, "loss": 0.18, "loss/crossentropy": 2.7405359148979187, "loss/hidden": 0.0, "loss/logits": 0.18003957346081734, "loss/reg": 0.6354997158050537, "step": 276 }, { "epoch": 0.00277, "grad_norm": 0.3675341010093689, "grad_norm_var": 0.00037920678786991605, "learning_rate": 5e-05, "loss": 0.1674, "loss/crossentropy": 2.66787451505661, "loss/hidden": 0.0, "loss/logits": 0.1673969253897667, "loss/reg": 0.6353681683540344, "step": 277 }, { "epoch": 0.00278, "grad_norm": 0.3559475541114807, "grad_norm_var": 0.0004158094934959185, "learning_rate": 5e-05, "loss": 0.1681, "loss/crossentropy": 2.7069908380508423, "loss/hidden": 0.0, "loss/logits": 0.1680920049548149, "loss/reg": 0.6356846690177917, "step": 278 }, { "epoch": 0.00279, "grad_norm": 0.3813311755657196, "grad_norm_var": 0.00041469792720990977, "learning_rate": 5e-05, "loss": 0.1775, "loss/crossentropy": 2.899284601211548, "loss/hidden": 0.0, "loss/logits": 0.17750264704227448, "loss/reg": 0.6354326605796814, "step": 279 }, { "epoch": 0.0028, "grad_norm": 0.42007914185523987, "grad_norm_var": 0.0005052422673438195, "learning_rate": 5e-05, "loss": 0.1998, "loss/crossentropy": 2.7194249033927917, "loss/hidden": 0.0, "loss/logits": 0.19977117702364922, "loss/reg": 0.6355002522468567, "step": 280 }, { "epoch": 0.00281, "grad_norm": 0.396452397108078, "grad_norm_var": 0.0005034847944099697, "learning_rate": 5e-05, "loss": 0.1776, "loss/crossentropy": 2.6941859126091003, "loss/hidden": 0.0, "loss/logits": 0.17761104926466942, "loss/reg": 0.6349981427192688, "step": 281 }, { "epoch": 0.00282, "grad_norm": 0.3804742991924286, "grad_norm_var": 0.0004793813792285939, "learning_rate": 5e-05, "loss": 0.1806, "loss/crossentropy": 2.7918182015419006, "loss/hidden": 0.0, "loss/logits": 0.18056635931134224, "loss/reg": 0.635048508644104, "step": 282 }, { "epoch": 0.00283, "grad_norm": 2.981536865234375, "grad_norm_var": 0.42116479064994294, "learning_rate": 5e-05, "loss": 0.2765, "loss/crossentropy": 3.0118654370307922, "loss/hidden": 0.0, "loss/logits": 0.276485089212656, "loss/reg": 0.6346772313117981, "step": 283 }, { "epoch": 0.00284, "grad_norm": 0.4781090319156647, "grad_norm_var": 0.4198472787195769, "learning_rate": 5e-05, "loss": 0.1666, "loss/crossentropy": 2.7697991132736206, "loss/hidden": 0.0, "loss/logits": 0.16664018109440804, "loss/reg": 0.6343128085136414, "step": 284 }, { "epoch": 0.00285, "grad_norm": 0.4624028503894806, "grad_norm_var": 0.41797553732271253, "learning_rate": 5e-05, "loss": 0.1736, "loss/crossentropy": 2.769831657409668, "loss/hidden": 0.0, "loss/logits": 0.1735757514834404, "loss/reg": 0.6336681246757507, "step": 285 }, { "epoch": 0.00286, "grad_norm": 0.5845192670822144, "grad_norm_var": 0.41635547134799455, "learning_rate": 5e-05, "loss": 0.1776, "loss/crossentropy": 2.78686660528183, "loss/hidden": 0.0, "loss/logits": 0.1775919608771801, "loss/reg": 0.6330624222755432, "step": 286 }, { "epoch": 0.00287, "grad_norm": 0.5300150513648987, "grad_norm_var": 0.4139878089488218, "learning_rate": 5e-05, "loss": 0.1704, "loss/crossentropy": 2.798245668411255, "loss/hidden": 0.0, "loss/logits": 0.17041611298918724, "loss/reg": 0.6322849988937378, "step": 287 }, { "epoch": 0.00288, "grad_norm": 0.44377729296684265, "grad_norm_var": 0.41255652635602885, "learning_rate": 5e-05, "loss": 0.1743, "loss/crossentropy": 2.8045517802238464, "loss/hidden": 0.0, "loss/logits": 0.17426270619034767, "loss/reg": 0.6315992474555969, "step": 288 }, { "epoch": 0.00289, "grad_norm": 0.5256833434104919, "grad_norm_var": 0.4103717998278528, "learning_rate": 5e-05, "loss": 0.1866, "loss/crossentropy": 2.6426761746406555, "loss/hidden": 0.0, "loss/logits": 0.18660663813352585, "loss/reg": 0.6308422684669495, "step": 289 }, { "epoch": 0.0029, "grad_norm": 0.41769924759864807, "grad_norm_var": 0.40898983200559763, "learning_rate": 5e-05, "loss": 0.1759, "loss/crossentropy": 2.710031270980835, "loss/hidden": 0.0, "loss/logits": 0.1759370006620884, "loss/reg": 0.6303769946098328, "step": 290 }, { "epoch": 0.00291, "grad_norm": 1.3105566501617432, "grad_norm_var": 0.43788334600592654, "learning_rate": 5e-05, "loss": 0.2347, "loss/crossentropy": 2.9097766280174255, "loss/hidden": 0.0, "loss/logits": 0.2347467504441738, "loss/reg": 0.6295793056488037, "step": 291 }, { "epoch": 0.00292, "grad_norm": 0.5462478399276733, "grad_norm_var": 0.433645693618187, "learning_rate": 5e-05, "loss": 0.1798, "loss/crossentropy": 2.8268609046936035, "loss/hidden": 0.0, "loss/logits": 0.17975060269236565, "loss/reg": 0.6286742687225342, "step": 292 }, { "epoch": 0.00293, "grad_norm": 0.5217469334602356, "grad_norm_var": 0.42908970134178903, "learning_rate": 5e-05, "loss": 0.1726, "loss/crossentropy": 2.9223158955574036, "loss/hidden": 0.0, "loss/logits": 0.17257456853985786, "loss/reg": 0.6275652647018433, "step": 293 }, { "epoch": 0.00294, "grad_norm": 0.5931916236877441, "grad_norm_var": 0.42264044362042863, "learning_rate": 5e-05, "loss": 0.1895, "loss/crossentropy": 2.748913884162903, "loss/hidden": 0.0, "loss/logits": 0.18950844556093216, "loss/reg": 0.6262880563735962, "step": 294 }, { "epoch": 0.00295, "grad_norm": 0.471230685710907, "grad_norm_var": 0.4194952509163778, "learning_rate": 5e-05, "loss": 0.1856, "loss/crossentropy": 2.784366488456726, "loss/hidden": 0.0, "loss/logits": 0.1855837032198906, "loss/reg": 0.6255433559417725, "step": 295 }, { "epoch": 0.00296, "grad_norm": 0.42228224873542786, "grad_norm_var": 0.41941583014433703, "learning_rate": 5e-05, "loss": 0.185, "loss/crossentropy": 2.724033832550049, "loss/hidden": 0.0, "loss/logits": 0.18501237779855728, "loss/reg": 0.6248136758804321, "step": 296 }, { "epoch": 0.00297, "grad_norm": 0.3666977882385254, "grad_norm_var": 0.4206421779632946, "learning_rate": 5e-05, "loss": 0.1604, "loss/crossentropy": 2.8014946579933167, "loss/hidden": 0.0, "loss/logits": 0.16037723049521446, "loss/reg": 0.6241582036018372, "step": 297 }, { "epoch": 0.00298, "grad_norm": 0.35462960600852966, "grad_norm_var": 0.42174971296933267, "learning_rate": 5e-05, "loss": 0.1614, "loss/crossentropy": 2.879434108734131, "loss/hidden": 0.0, "loss/logits": 0.16138103231787682, "loss/reg": 0.6234741806983948, "step": 298 }, { "epoch": 0.00299, "grad_norm": 0.430354505777359, "grad_norm_var": 0.04841827925276686, "learning_rate": 5e-05, "loss": 0.1746, "loss/crossentropy": 2.920572876930237, "loss/hidden": 0.0, "loss/logits": 0.17457816377282143, "loss/reg": 0.6230826377868652, "step": 299 }, { "epoch": 0.003, "grad_norm": 0.46609118580818176, "grad_norm_var": 0.04850836635932592, "learning_rate": 5e-05, "loss": 0.1964, "loss/crossentropy": 2.8018062114715576, "loss/hidden": 0.0, "loss/logits": 0.19644274562597275, "loss/reg": 0.6221161484718323, "step": 300 }, { "epoch": 0.00301, "grad_norm": 0.364413857460022, "grad_norm_var": 0.049964807759293454, "learning_rate": 5e-05, "loss": 0.1723, "loss/crossentropy": 2.7433618903160095, "loss/hidden": 0.0, "loss/logits": 0.17234495654702187, "loss/reg": 0.621139645576477, "step": 301 }, { "epoch": 0.00302, "grad_norm": 0.39687469601631165, "grad_norm_var": 0.05059679958010766, "learning_rate": 5e-05, "loss": 0.1879, "loss/crossentropy": 2.7207915782928467, "loss/hidden": 0.0, "loss/logits": 0.18788620457053185, "loss/reg": 0.6205485463142395, "step": 302 }, { "epoch": 0.00303, "grad_norm": 0.3822523355484009, "grad_norm_var": 0.05156892076359274, "learning_rate": 5e-05, "loss": 0.1836, "loss/crossentropy": 2.7587072253227234, "loss/hidden": 0.0, "loss/logits": 0.18357843533158302, "loss/reg": 0.6199692487716675, "step": 303 }, { "epoch": 0.00304, "grad_norm": 0.36131444573402405, "grad_norm_var": 0.05262153461827286, "learning_rate": 5e-05, "loss": 0.1665, "loss/crossentropy": 2.814681887626648, "loss/hidden": 0.0, "loss/logits": 0.1664714254438877, "loss/reg": 0.6191954016685486, "step": 304 }, { "epoch": 0.00305, "grad_norm": 0.41829943656921387, "grad_norm_var": 0.052913003893064504, "learning_rate": 5e-05, "loss": 0.1974, "loss/crossentropy": 2.635455906391144, "loss/hidden": 0.0, "loss/logits": 0.19739071652293205, "loss/reg": 0.6186562776565552, "step": 305 }, { "epoch": 0.00306, "grad_norm": 0.38273823261260986, "grad_norm_var": 0.05332172808990864, "learning_rate": 5e-05, "loss": 0.1802, "loss/crossentropy": 2.71915340423584, "loss/hidden": 0.0, "loss/logits": 0.18016913533210754, "loss/reg": 0.6179852485656738, "step": 306 }, { "epoch": 0.00307, "grad_norm": 0.5604112148284912, "grad_norm_var": 0.006100738276351691, "learning_rate": 5e-05, "loss": 0.2451, "loss/crossentropy": 2.8575262427330017, "loss/hidden": 0.0, "loss/logits": 0.2450963258743286, "loss/reg": 0.6177489161491394, "step": 307 }, { "epoch": 0.00308, "grad_norm": 0.3664649724960327, "grad_norm_var": 0.005572150731927768, "learning_rate": 5e-05, "loss": 0.1682, "loss/crossentropy": 2.8378955721855164, "loss/hidden": 0.0, "loss/logits": 0.16824688389897346, "loss/reg": 0.6171808242797852, "step": 308 }, { "epoch": 0.00309, "grad_norm": 0.4527394771575928, "grad_norm_var": 0.005013534657549442, "learning_rate": 5e-05, "loss": 0.1883, "loss/crossentropy": 2.7741609811782837, "loss/hidden": 0.0, "loss/logits": 0.18833715841174126, "loss/reg": 0.6165443658828735, "step": 309 }, { "epoch": 0.0031, "grad_norm": 0.40747499465942383, "grad_norm_var": 0.0029889062143138617, "learning_rate": 5e-05, "loss": 0.1829, "loss/crossentropy": 2.9125460982322693, "loss/hidden": 0.0, "loss/logits": 0.18286899477243423, "loss/reg": 0.6161845326423645, "step": 310 }, { "epoch": 0.00311, "grad_norm": 0.3714599907398224, "grad_norm_var": 0.0028333129211201053, "learning_rate": 5e-05, "loss": 0.1644, "loss/crossentropy": 2.749736964702606, "loss/hidden": 0.0, "loss/logits": 0.1643512099981308, "loss/reg": 0.6160363554954529, "step": 311 }, { "epoch": 0.00312, "grad_norm": 0.34232425689697266, "grad_norm_var": 0.0030649698453003066, "learning_rate": 5e-05, "loss": 0.162, "loss/crossentropy": 2.6927778124809265, "loss/hidden": 0.0, "loss/logits": 0.16197257861495018, "loss/reg": 0.6159820556640625, "step": 312 }, { "epoch": 0.00313, "grad_norm": 0.39075613021850586, "grad_norm_var": 0.0029893988200142257, "learning_rate": 5e-05, "loss": 0.1645, "loss/crossentropy": 2.7432020902633667, "loss/hidden": 0.0, "loss/logits": 0.1645125299692154, "loss/reg": 0.6157231330871582, "step": 313 }, { "epoch": 0.00314, "grad_norm": 0.3791449964046478, "grad_norm_var": 0.0028687299387803716, "learning_rate": 5e-05, "loss": 0.174, "loss/crossentropy": 2.8996816277503967, "loss/hidden": 0.0, "loss/logits": 0.174019493162632, "loss/reg": 0.6155524253845215, "step": 314 }, { "epoch": 0.00315, "grad_norm": 0.3920113444328308, "grad_norm_var": 0.002828794368874136, "learning_rate": 5e-05, "loss": 0.1819, "loss/crossentropy": 2.8300365209579468, "loss/hidden": 0.0, "loss/logits": 0.1818542405962944, "loss/reg": 0.6151819825172424, "step": 315 }, { "epoch": 0.00316, "grad_norm": 0.47432082891464233, "grad_norm_var": 0.002903163577606908, "learning_rate": 5e-05, "loss": 0.1678, "loss/crossentropy": 2.55305939912796, "loss/hidden": 0.0, "loss/logits": 0.1678045243024826, "loss/reg": 0.6147633194923401, "step": 316 }, { "epoch": 0.00317, "grad_norm": 0.4371914863586426, "grad_norm_var": 0.0028628039704878074, "learning_rate": 5e-05, "loss": 0.1917, "loss/crossentropy": 2.7877643704414368, "loss/hidden": 0.0, "loss/logits": 0.19173792004585266, "loss/reg": 0.6146110892295837, "step": 317 }, { "epoch": 0.00318, "grad_norm": 0.3844268023967743, "grad_norm_var": 0.0028896854981510954, "learning_rate": 5e-05, "loss": 0.1893, "loss/crossentropy": 2.7709254026412964, "loss/hidden": 0.0, "loss/logits": 0.18933358788490295, "loss/reg": 0.6143233180046082, "step": 318 }, { "epoch": 0.00319, "grad_norm": 0.38410118222236633, "grad_norm_var": 0.0028839320840938877, "learning_rate": 5e-05, "loss": 0.1686, "loss/crossentropy": 2.7451619505882263, "loss/hidden": 0.0, "loss/logits": 0.1685892753303051, "loss/reg": 0.6140788197517395, "step": 319 }, { "epoch": 0.0032, "grad_norm": 0.35669389367103577, "grad_norm_var": 0.0029131494828378403, "learning_rate": 5e-05, "loss": 0.1654, "loss/crossentropy": 2.8895342350006104, "loss/hidden": 0.0, "loss/logits": 0.1653740592300892, "loss/reg": 0.6138609647750854, "step": 320 }, { "epoch": 0.00321, "grad_norm": 0.38920775055885315, "grad_norm_var": 0.0029194419904121196, "learning_rate": 5e-05, "loss": 0.168, "loss/crossentropy": 2.9069936275482178, "loss/hidden": 0.0, "loss/logits": 0.1679585911333561, "loss/reg": 0.6134458780288696, "step": 321 }, { "epoch": 0.00322, "grad_norm": 0.5546866059303284, "grad_norm_var": 0.004269175059001995, "learning_rate": 5e-05, "loss": 0.1991, "loss/crossentropy": 2.705416262149811, "loss/hidden": 0.0, "loss/logits": 0.19911042973399162, "loss/reg": 0.6131358742713928, "step": 322 }, { "epoch": 0.00323, "grad_norm": 0.4151782989501953, "grad_norm_var": 0.0027757974621781755, "learning_rate": 5e-05, "loss": 0.1873, "loss/crossentropy": 2.9403539299964905, "loss/hidden": 0.0, "loss/logits": 0.18726158887147903, "loss/reg": 0.6127427816390991, "step": 323 }, { "epoch": 0.00324, "grad_norm": 0.37726715207099915, "grad_norm_var": 0.002725951965290311, "learning_rate": 5e-05, "loss": 0.1675, "loss/crossentropy": 2.741024434566498, "loss/hidden": 0.0, "loss/logits": 0.16754918545484543, "loss/reg": 0.612453043460846, "step": 324 }, { "epoch": 0.00325, "grad_norm": 0.460709810256958, "grad_norm_var": 0.0027787304444566413, "learning_rate": 5e-05, "loss": 0.1819, "loss/crossentropy": 2.9416339993476868, "loss/hidden": 0.0, "loss/logits": 0.18194128945469856, "loss/reg": 0.6119142174720764, "step": 325 }, { "epoch": 0.00326, "grad_norm": 0.5346088409423828, "grad_norm_var": 0.0037917204693333558, "learning_rate": 5e-05, "loss": 0.201, "loss/crossentropy": 2.8080431818962097, "loss/hidden": 0.0, "loss/logits": 0.20102210342884064, "loss/reg": 0.6111528277397156, "step": 326 }, { "epoch": 0.00327, "grad_norm": 0.384436696767807, "grad_norm_var": 0.0037264688090846896, "learning_rate": 5e-05, "loss": 0.1635, "loss/crossentropy": 2.8529574275016785, "loss/hidden": 0.0, "loss/logits": 0.1634748913347721, "loss/reg": 0.6106207370758057, "step": 327 }, { "epoch": 0.00328, "grad_norm": 0.3995545506477356, "grad_norm_var": 0.003368469111584634, "learning_rate": 5e-05, "loss": 0.1756, "loss/crossentropy": 2.8986655473709106, "loss/hidden": 0.0, "loss/logits": 0.17555934190750122, "loss/reg": 0.6097209453582764, "step": 328 }, { "epoch": 0.00329, "grad_norm": 0.41318589448928833, "grad_norm_var": 0.0033135208516677653, "learning_rate": 5e-05, "loss": 0.1791, "loss/crossentropy": 2.5839784741401672, "loss/hidden": 0.0, "loss/logits": 0.17906590551137924, "loss/reg": 0.609329342842102, "step": 329 }, { "epoch": 0.0033, "grad_norm": 0.43385687470436096, "grad_norm_var": 0.003194947853213063, "learning_rate": 5e-05, "loss": 0.1912, "loss/crossentropy": 2.6864006519317627, "loss/hidden": 0.0, "loss/logits": 0.1911519281566143, "loss/reg": 0.6088165044784546, "step": 330 }, { "epoch": 0.00331, "grad_norm": 0.5417336821556091, "grad_norm_var": 0.003948127358906665, "learning_rate": 5e-05, "loss": 0.2079, "loss/crossentropy": 2.978734254837036, "loss/hidden": 0.0, "loss/logits": 0.2079460583627224, "loss/reg": 0.6080896258354187, "step": 331 }, { "epoch": 0.00332, "grad_norm": 0.4092332124710083, "grad_norm_var": 0.0038614437861692274, "learning_rate": 5e-05, "loss": 0.1808, "loss/crossentropy": 2.851550281047821, "loss/hidden": 0.0, "loss/logits": 0.18075324967503548, "loss/reg": 0.6078136563301086, "step": 332 }, { "epoch": 0.00333, "grad_norm": 0.39699894189834595, "grad_norm_var": 0.0039225542176170355, "learning_rate": 5e-05, "loss": 0.1764, "loss/crossentropy": 2.7266076803207397, "loss/hidden": 0.0, "loss/logits": 0.17641152441501617, "loss/reg": 0.6075788140296936, "step": 333 }, { "epoch": 0.00334, "grad_norm": 0.4009898006916046, "grad_norm_var": 0.0038451458215533864, "learning_rate": 5e-05, "loss": 0.1755, "loss/crossentropy": 2.7337945103645325, "loss/hidden": 0.0, "loss/logits": 0.17545060440897942, "loss/reg": 0.607309103012085, "step": 334 }, { "epoch": 0.00335, "grad_norm": 0.38226065039634705, "grad_norm_var": 0.0038561986486378985, "learning_rate": 5e-05, "loss": 0.1766, "loss/crossentropy": 2.582574784755707, "loss/hidden": 0.0, "loss/logits": 0.17662141099572182, "loss/reg": 0.6068791747093201, "step": 335 }, { "epoch": 0.00336, "grad_norm": 0.3516082167625427, "grad_norm_var": 0.003906277433529448, "learning_rate": 5e-05, "loss": 0.1722, "loss/crossentropy": 2.7098079323768616, "loss/hidden": 0.0, "loss/logits": 0.1722240261733532, "loss/reg": 0.6065097451210022, "step": 336 }, { "epoch": 0.00337, "grad_norm": 0.37325382232666016, "grad_norm_var": 0.00400437380839878, "learning_rate": 5e-05, "loss": 0.1752, "loss/crossentropy": 2.7094212770462036, "loss/hidden": 0.0, "loss/logits": 0.17519311234354973, "loss/reg": 0.6060733199119568, "step": 337 }, { "epoch": 0.00338, "grad_norm": 0.3805215656757355, "grad_norm_var": 0.002931539161083722, "learning_rate": 5e-05, "loss": 0.1714, "loss/crossentropy": 2.8095386624336243, "loss/hidden": 0.0, "loss/logits": 0.17144013196229935, "loss/reg": 0.6060055494308472, "step": 338 }, { "epoch": 0.00339, "grad_norm": 0.40252378582954407, "grad_norm_var": 0.0029428706529153006, "learning_rate": 5e-05, "loss": 0.1906, "loss/crossentropy": 2.6204344034194946, "loss/hidden": 0.0, "loss/logits": 0.19056188315153122, "loss/reg": 0.6057007908821106, "step": 339 }, { "epoch": 0.0034, "grad_norm": 0.35972386598587036, "grad_norm_var": 0.0030507682525669675, "learning_rate": 5e-05, "loss": 0.1681, "loss/crossentropy": 2.7573349475860596, "loss/hidden": 0.0, "loss/logits": 0.16808072850108147, "loss/reg": 0.6051066517829895, "step": 340 }, { "epoch": 0.00341, "grad_norm": 0.6770856976509094, "grad_norm_var": 0.007322345454351027, "learning_rate": 5e-05, "loss": 0.2068, "loss/crossentropy": 2.8460573554039, "loss/hidden": 0.0, "loss/logits": 0.20682615041732788, "loss/reg": 0.6048007011413574, "step": 341 }, { "epoch": 0.00342, "grad_norm": 0.3791329264640808, "grad_norm_var": 0.006614805666506843, "learning_rate": 5e-05, "loss": 0.1793, "loss/crossentropy": 2.86782443523407, "loss/hidden": 0.0, "loss/logits": 0.1793392337858677, "loss/reg": 0.6042160391807556, "step": 342 }, { "epoch": 0.00343, "grad_norm": 0.3785547912120819, "grad_norm_var": 0.0066431970035098304, "learning_rate": 5e-05, "loss": 0.1838, "loss/crossentropy": 2.863288164138794, "loss/hidden": 0.0, "loss/logits": 0.1837516613304615, "loss/reg": 0.6035563349723816, "step": 343 }, { "epoch": 0.00344, "grad_norm": 0.44364485144615173, "grad_norm_var": 0.006659117932864241, "learning_rate": 5e-05, "loss": 0.1773, "loss/crossentropy": 2.7911864519119263, "loss/hidden": 0.0, "loss/logits": 0.17731711640954018, "loss/reg": 0.6029053926467896, "step": 344 }, { "epoch": 0.00345, "grad_norm": 0.41362714767456055, "grad_norm_var": 0.006658713359330071, "learning_rate": 5e-05, "loss": 0.1721, "loss/crossentropy": 2.7916712164878845, "loss/hidden": 0.0, "loss/logits": 0.17212076112627983, "loss/reg": 0.6022576093673706, "step": 345 }, { "epoch": 0.00346, "grad_norm": 0.4076172709465027, "grad_norm_var": 0.0066543044206645256, "learning_rate": 5e-05, "loss": 0.1757, "loss/crossentropy": 2.7311403155326843, "loss/hidden": 0.0, "loss/logits": 0.1757332980632782, "loss/reg": 0.6018686294555664, "step": 346 }, { "epoch": 0.00347, "grad_norm": 0.4019361436367035, "grad_norm_var": 0.0055816528822431825, "learning_rate": 5e-05, "loss": 0.1806, "loss/crossentropy": 2.7973012924194336, "loss/hidden": 0.0, "loss/logits": 0.18056273832917213, "loss/reg": 0.6012129187583923, "step": 347 }, { "epoch": 0.00348, "grad_norm": 0.3733069896697998, "grad_norm_var": 0.00566560886109283, "learning_rate": 5e-05, "loss": 0.1741, "loss/crossentropy": 2.8665059208869934, "loss/hidden": 0.0, "loss/logits": 0.17407039552927017, "loss/reg": 0.6003652811050415, "step": 348 }, { "epoch": 0.00349, "grad_norm": 0.3940782845020294, "grad_norm_var": 0.00567029915279762, "learning_rate": 5e-05, "loss": 0.1778, "loss/crossentropy": 2.7297754287719727, "loss/hidden": 0.0, "loss/logits": 0.17782450839877129, "loss/reg": 0.5997918248176575, "step": 349 }, { "epoch": 0.0035, "grad_norm": 0.36849528551101685, "grad_norm_var": 0.005764462263543957, "learning_rate": 5e-05, "loss": 0.1747, "loss/crossentropy": 2.6991987228393555, "loss/hidden": 0.0, "loss/logits": 0.17474957555532455, "loss/reg": 0.5992726683616638, "step": 350 }, { "epoch": 0.00351, "grad_norm": 0.37783846259117126, "grad_norm_var": 0.005779363831323384, "learning_rate": 5e-05, "loss": 0.1742, "loss/crossentropy": 2.74223130941391, "loss/hidden": 0.0, "loss/logits": 0.17424119636416435, "loss/reg": 0.598996639251709, "step": 351 }, { "epoch": 0.00352, "grad_norm": 0.34857141971588135, "grad_norm_var": 0.005801633514813468, "learning_rate": 5e-05, "loss": 0.1635, "loss/crossentropy": 2.7899149656295776, "loss/hidden": 0.0, "loss/logits": 0.16349099203944206, "loss/reg": 0.5979835391044617, "step": 352 }, { "epoch": 0.00353, "grad_norm": 0.38306719064712524, "grad_norm_var": 0.0057661213153224625, "learning_rate": 5e-05, "loss": 0.1719, "loss/crossentropy": 2.7702980637550354, "loss/hidden": 0.0, "loss/logits": 0.17191964015364647, "loss/reg": 0.5972602367401123, "step": 353 }, { "epoch": 0.00354, "grad_norm": 0.4106595814228058, "grad_norm_var": 0.005722083267414772, "learning_rate": 5e-05, "loss": 0.173, "loss/crossentropy": 2.6797468066215515, "loss/hidden": 0.0, "loss/logits": 0.1730450540781021, "loss/reg": 0.5967325568199158, "step": 354 }, { "epoch": 0.00355, "grad_norm": 0.38218018412590027, "grad_norm_var": 0.00576142442529601, "learning_rate": 5e-05, "loss": 0.1773, "loss/crossentropy": 2.85380482673645, "loss/hidden": 0.0, "loss/logits": 0.17732341215014458, "loss/reg": 0.5962488651275635, "step": 355 }, { "epoch": 0.00356, "grad_norm": 0.3581814169883728, "grad_norm_var": 0.005771135512007227, "learning_rate": 5e-05, "loss": 0.1746, "loss/crossentropy": 2.7387739419937134, "loss/hidden": 0.0, "loss/logits": 0.17462486401200294, "loss/reg": 0.595615565776825, "step": 356 }, { "epoch": 0.00357, "grad_norm": 0.439247190952301, "grad_norm_var": 0.0007138867136148641, "learning_rate": 5e-05, "loss": 0.1806, "loss/crossentropy": 2.832124352455139, "loss/hidden": 0.0, "loss/logits": 0.18056054040789604, "loss/reg": 0.5949600338935852, "step": 357 }, { "epoch": 0.00358, "grad_norm": 0.41376587748527527, "grad_norm_var": 0.0007328583032164548, "learning_rate": 5e-05, "loss": 0.1835, "loss/crossentropy": 2.7249953150749207, "loss/hidden": 0.0, "loss/logits": 0.18347394838929176, "loss/reg": 0.5940920114517212, "step": 358 }, { "epoch": 0.00359, "grad_norm": 0.3810238242149353, "grad_norm_var": 0.0007283445470831891, "learning_rate": 5e-05, "loss": 0.1784, "loss/crossentropy": 2.595168113708496, "loss/hidden": 0.0, "loss/logits": 0.17843574285507202, "loss/reg": 0.5934211611747742, "step": 359 }, { "epoch": 0.0036, "grad_norm": 0.42762747406959534, "grad_norm_var": 0.0006374531154284176, "learning_rate": 5e-05, "loss": 0.1881, "loss/crossentropy": 2.6344693303108215, "loss/hidden": 0.0, "loss/logits": 0.18808726966381073, "loss/reg": 0.5929632186889648, "step": 360 }, { "epoch": 0.00361, "grad_norm": 0.39267221093177795, "grad_norm_var": 0.0006060820745644809, "learning_rate": 5e-05, "loss": 0.177, "loss/crossentropy": 2.6138933300971985, "loss/hidden": 0.0, "loss/logits": 0.1769532896578312, "loss/reg": 0.5923884510993958, "step": 361 }, { "epoch": 0.00362, "grad_norm": 0.42071565985679626, "grad_norm_var": 0.0006453603710681895, "learning_rate": 5e-05, "loss": 0.1988, "loss/crossentropy": 2.749572992324829, "loss/hidden": 0.0, "loss/logits": 0.19881777465343475, "loss/reg": 0.5920059084892273, "step": 362 }, { "epoch": 0.00363, "grad_norm": 0.418607622385025, "grad_norm_var": 0.0006846282599237955, "learning_rate": 5e-05, "loss": 0.1772, "loss/crossentropy": 2.780629813671112, "loss/hidden": 0.0, "loss/logits": 0.17724807187914848, "loss/reg": 0.5918452739715576, "step": 363 }, { "epoch": 0.00364, "grad_norm": 0.47056329250335693, "grad_norm_var": 0.0010187810039602341, "learning_rate": 5e-05, "loss": 0.1772, "loss/crossentropy": 2.7792168855667114, "loss/hidden": 0.0, "loss/logits": 0.1771654710173607, "loss/reg": 0.59168541431427, "step": 364 }, { "epoch": 0.00365, "grad_norm": 0.5536573529243469, "grad_norm_var": 0.0025012713306054478, "learning_rate": 5e-05, "loss": 0.1952, "loss/crossentropy": 2.9163432121276855, "loss/hidden": 0.0, "loss/logits": 0.19520263373851776, "loss/reg": 0.59149569272995, "step": 365 }, { "epoch": 0.00366, "grad_norm": 0.41469600796699524, "grad_norm_var": 0.0023840585347143024, "learning_rate": 5e-05, "loss": 0.1777, "loss/crossentropy": 2.857950270175934, "loss/hidden": 0.0, "loss/logits": 0.1776857189834118, "loss/reg": 0.5910859704017639, "step": 366 }, { "epoch": 0.00367, "grad_norm": 0.41129082441329956, "grad_norm_var": 0.0023013289890903453, "learning_rate": 5e-05, "loss": 0.1836, "loss/crossentropy": 2.797708034515381, "loss/hidden": 0.0, "loss/logits": 0.18357457593083382, "loss/reg": 0.5906261205673218, "step": 367 }, { "epoch": 0.00368, "grad_norm": 0.4086815416812897, "grad_norm_var": 0.0020015004518984746, "learning_rate": 5e-05, "loss": 0.1814, "loss/crossentropy": 2.834747314453125, "loss/hidden": 0.0, "loss/logits": 0.18137776851654053, "loss/reg": 0.5902115106582642, "step": 368 }, { "epoch": 0.00369, "grad_norm": 0.3696390390396118, "grad_norm_var": 0.002075162070222828, "learning_rate": 5e-05, "loss": 0.1618, "loss/crossentropy": 2.6722304224967957, "loss/hidden": 0.0, "loss/logits": 0.16181737929582596, "loss/reg": 0.5896137952804565, "step": 369 }, { "epoch": 0.0037, "grad_norm": 0.43068623542785645, "grad_norm_var": 0.0020830966483702086, "learning_rate": 5e-05, "loss": 0.1687, "loss/crossentropy": 2.736994683742523, "loss/hidden": 0.0, "loss/logits": 0.1687219738960266, "loss/reg": 0.5891397595405579, "step": 370 }, { "epoch": 0.00371, "grad_norm": 0.4438025951385498, "grad_norm_var": 0.002023433457028612, "learning_rate": 5e-05, "loss": 0.19, "loss/crossentropy": 2.639078915119171, "loss/hidden": 0.0, "loss/logits": 0.189987700432539, "loss/reg": 0.5885957479476929, "step": 371 }, { "epoch": 0.00372, "grad_norm": 0.4177490472793579, "grad_norm_var": 0.001736914015593257, "learning_rate": 5e-05, "loss": 0.1889, "loss/crossentropy": 2.729760766029358, "loss/hidden": 0.0, "loss/logits": 0.1889122985303402, "loss/reg": 0.5881770253181458, "step": 372 }, { "epoch": 0.00373, "grad_norm": 0.37580448389053345, "grad_norm_var": 0.0018755844645033807, "learning_rate": 5e-05, "loss": 0.1742, "loss/crossentropy": 2.7882957458496094, "loss/hidden": 0.0, "loss/logits": 0.17420567572116852, "loss/reg": 0.5878958702087402, "step": 373 }, { "epoch": 0.00374, "grad_norm": 0.4087849259376526, "grad_norm_var": 0.001882561374074602, "learning_rate": 5e-05, "loss": 0.1743, "loss/crossentropy": 2.8258581161499023, "loss/hidden": 0.0, "loss/logits": 0.17425675690174103, "loss/reg": 0.5874165296554565, "step": 374 }, { "epoch": 0.00375, "grad_norm": 0.40947815775871277, "grad_norm_var": 0.0017791266827080284, "learning_rate": 5e-05, "loss": 0.1741, "loss/crossentropy": 2.918753445148468, "loss/hidden": 0.0, "loss/logits": 0.1740703210234642, "loss/reg": 0.5873636603355408, "step": 375 }, { "epoch": 0.00376, "grad_norm": 0.3950541019439697, "grad_norm_var": 0.00182709563577886, "learning_rate": 5e-05, "loss": 0.1712, "loss/crossentropy": 2.81976717710495, "loss/hidden": 0.0, "loss/logits": 0.17119481414556503, "loss/reg": 0.5872699618339539, "step": 376 }, { "epoch": 0.00377, "grad_norm": 0.3760331869125366, "grad_norm_var": 0.0019080611827182125, "learning_rate": 5e-05, "loss": 0.1724, "loss/crossentropy": 2.7792267203330994, "loss/hidden": 0.0, "loss/logits": 0.17243267223238945, "loss/reg": 0.5868967175483704, "step": 377 }, { "epoch": 0.00378, "grad_norm": 0.3866977095603943, "grad_norm_var": 0.0019786280597973755, "learning_rate": 5e-05, "loss": 0.1809, "loss/crossentropy": 2.789883553981781, "loss/hidden": 0.0, "loss/logits": 0.18089993298053741, "loss/reg": 0.5866216421127319, "step": 378 }, { "epoch": 0.00379, "grad_norm": 0.4040381908416748, "grad_norm_var": 0.0019911061590992874, "learning_rate": 5e-05, "loss": 0.1805, "loss/crossentropy": 2.6780543327331543, "loss/hidden": 0.0, "loss/logits": 0.1804923713207245, "loss/reg": 0.5862380862236023, "step": 379 }, { "epoch": 0.0038, "grad_norm": 0.39698526263237, "grad_norm_var": 0.0018068417785806516, "learning_rate": 5e-05, "loss": 0.1807, "loss/crossentropy": 2.82189279794693, "loss/hidden": 0.0, "loss/logits": 0.1807085946202278, "loss/reg": 0.585616946220398, "step": 380 }, { "epoch": 0.00381, "grad_norm": 0.7630665302276611, "grad_norm_var": 0.008483518016259477, "learning_rate": 5e-05, "loss": 0.2001, "loss/crossentropy": 2.799209713935852, "loss/hidden": 0.0, "loss/logits": 0.20006826519966125, "loss/reg": 0.585263192653656, "step": 381 }, { "epoch": 0.00382, "grad_norm": 0.43940269947052, "grad_norm_var": 0.00848515452019868, "learning_rate": 5e-05, "loss": 0.1726, "loss/crossentropy": 2.8344491124153137, "loss/hidden": 0.0, "loss/logits": 0.17257989197969437, "loss/reg": 0.5849599242210388, "step": 382 }, { "epoch": 0.00383, "grad_norm": 0.43284982442855835, "grad_norm_var": 0.00846811413541693, "learning_rate": 5e-05, "loss": 0.1786, "loss/crossentropy": 2.753903329372406, "loss/hidden": 0.0, "loss/logits": 0.17858872190117836, "loss/reg": 0.5849275588989258, "step": 383 }, { "epoch": 0.00384, "grad_norm": 0.3986302316188812, "grad_norm_var": 0.0085012192718563, "learning_rate": 5e-05, "loss": 0.1703, "loss/crossentropy": 2.9002522826194763, "loss/hidden": 0.0, "loss/logits": 0.17030159011483192, "loss/reg": 0.5849871635437012, "step": 384 }, { "epoch": 0.00385, "grad_norm": 0.472619891166687, "grad_norm_var": 0.008362091105227102, "learning_rate": 5e-05, "loss": 0.1849, "loss/crossentropy": 2.9094314575195312, "loss/hidden": 0.0, "loss/logits": 0.18491265177726746, "loss/reg": 0.5849130153656006, "step": 385 }, { "epoch": 0.00386, "grad_norm": 0.4059242308139801, "grad_norm_var": 0.008412939539572154, "learning_rate": 5e-05, "loss": 0.1696, "loss/crossentropy": 2.7713544368743896, "loss/hidden": 0.0, "loss/logits": 0.16957605630159378, "loss/reg": 0.5850569605827332, "step": 386 }, { "epoch": 0.00387, "grad_norm": 0.40131813287734985, "grad_norm_var": 0.00846417332788434, "learning_rate": 5e-05, "loss": 0.1728, "loss/crossentropy": 2.792680263519287, "loss/hidden": 0.0, "loss/logits": 0.17280464619398117, "loss/reg": 0.5853747725486755, "step": 387 }, { "epoch": 0.00388, "grad_norm": 0.44553592801094055, "grad_norm_var": 0.008466014151907652, "learning_rate": 5e-05, "loss": 0.1877, "loss/crossentropy": 2.7707930207252502, "loss/hidden": 0.0, "loss/logits": 0.18773751333355904, "loss/reg": 0.5852359533309937, "step": 388 }, { "epoch": 0.00389, "grad_norm": 0.42672857642173767, "grad_norm_var": 0.008246437505886783, "learning_rate": 5e-05, "loss": 0.1804, "loss/crossentropy": 2.724213123321533, "loss/hidden": 0.0, "loss/logits": 0.18039826676249504, "loss/reg": 0.5849812030792236, "step": 389 }, { "epoch": 0.0039, "grad_norm": 0.39746537804603577, "grad_norm_var": 0.008294308380923765, "learning_rate": 5e-05, "loss": 0.1792, "loss/crossentropy": 2.846687376499176, "loss/hidden": 0.0, "loss/logits": 0.1792096234858036, "loss/reg": 0.5848593711853027, "step": 390 }, { "epoch": 0.00391, "grad_norm": 0.3957357704639435, "grad_norm_var": 0.008351939992373191, "learning_rate": 5e-05, "loss": 0.1742, "loss/crossentropy": 2.8164305090904236, "loss/hidden": 0.0, "loss/logits": 0.17417019233107567, "loss/reg": 0.5846148729324341, "step": 391 }, { "epoch": 0.00392, "grad_norm": 0.4590193033218384, "grad_norm_var": 0.008278656658152583, "learning_rate": 5e-05, "loss": 0.1892, "loss/crossentropy": 2.707228720188141, "loss/hidden": 0.0, "loss/logits": 0.18924356251955032, "loss/reg": 0.5849184989929199, "step": 392 }, { "epoch": 0.00393, "grad_norm": 0.4552902281284332, "grad_norm_var": 0.008020350004020571, "learning_rate": 5e-05, "loss": 0.1831, "loss/crossentropy": 2.7451746463775635, "loss/hidden": 0.0, "loss/logits": 0.18314184993505478, "loss/reg": 0.5845480561256409, "step": 393 }, { "epoch": 0.00394, "grad_norm": 0.47150692343711853, "grad_norm_var": 0.007837956883828101, "learning_rate": 5e-05, "loss": 0.1842, "loss/crossentropy": 2.791899621486664, "loss/hidden": 0.0, "loss/logits": 0.1841963566839695, "loss/reg": 0.5846589803695679, "step": 394 }, { "epoch": 0.00395, "grad_norm": 0.37800633907318115, "grad_norm_var": 0.008032489644381786, "learning_rate": 5e-05, "loss": 0.1824, "loss/crossentropy": 2.957661509513855, "loss/hidden": 0.0, "loss/logits": 0.18240166082978249, "loss/reg": 0.5845913887023926, "step": 395 }, { "epoch": 0.00396, "grad_norm": 0.3868597745895386, "grad_norm_var": 0.008105415283185483, "learning_rate": 5e-05, "loss": 0.1707, "loss/crossentropy": 2.716295599937439, "loss/hidden": 0.0, "loss/logits": 0.1706732027232647, "loss/reg": 0.58450847864151, "step": 396 }, { "epoch": 0.00397, "grad_norm": 0.41772395372390747, "grad_norm_var": 0.0009423328059000369, "learning_rate": 5e-05, "loss": 0.1751, "loss/crossentropy": 2.832162916660309, "loss/hidden": 0.0, "loss/logits": 0.17506500706076622, "loss/reg": 0.5844741463661194, "step": 397 }, { "epoch": 0.00398, "grad_norm": 0.4163476526737213, "grad_norm_var": 0.0009283243375692233, "learning_rate": 5e-05, "loss": 0.1769, "loss/crossentropy": 2.770324468612671, "loss/hidden": 0.0, "loss/logits": 0.176910649985075, "loss/reg": 0.5840878486633301, "step": 398 }, { "epoch": 0.00399, "grad_norm": 0.3898262083530426, "grad_norm_var": 0.000985202299642533, "learning_rate": 5e-05, "loss": 0.1708, "loss/crossentropy": 2.6994412541389465, "loss/hidden": 0.0, "loss/logits": 0.1707712933421135, "loss/reg": 0.5841794013977051, "step": 399 }, { "epoch": 0.004, "grad_norm": 0.3953019976615906, "grad_norm_var": 0.000995337231657667, "learning_rate": 5e-05, "loss": 0.1717, "loss/crossentropy": 2.9121304154396057, "loss/hidden": 0.0, "loss/logits": 0.17168370634317398, "loss/reg": 0.5838119983673096, "step": 400 }, { "epoch": 0.00401, "grad_norm": 0.37097325921058655, "grad_norm_var": 0.0009238811484828228, "learning_rate": 5e-05, "loss": 0.1654, "loss/crossentropy": 2.779486835002899, "loss/hidden": 0.0, "loss/logits": 0.16539718210697174, "loss/reg": 0.5838875770568848, "step": 401 }, { "epoch": 0.00402, "grad_norm": 0.3963313102722168, "grad_norm_var": 0.0009391277261223944, "learning_rate": 5e-05, "loss": 0.1795, "loss/crossentropy": 2.5993003845214844, "loss/hidden": 0.0, "loss/logits": 0.1794535517692566, "loss/reg": 0.5834932327270508, "step": 402 }, { "epoch": 0.00403, "grad_norm": 0.44347628951072693, "grad_norm_var": 0.0009859603666994823, "learning_rate": 5e-05, "loss": 0.1994, "loss/crossentropy": 2.6690316796302795, "loss/hidden": 0.0, "loss/logits": 0.19936401024460793, "loss/reg": 0.5833864212036133, "step": 403 }, { "epoch": 0.00404, "grad_norm": 0.40031078457832336, "grad_norm_var": 0.0009319700705569052, "learning_rate": 5e-05, "loss": 0.1857, "loss/crossentropy": 2.8152228593826294, "loss/hidden": 0.0, "loss/logits": 0.18572049587965012, "loss/reg": 0.5832197666168213, "step": 404 }, { "epoch": 0.00405, "grad_norm": 0.40114834904670715, "grad_norm_var": 0.000924530110886564, "learning_rate": 5e-05, "loss": 0.1748, "loss/crossentropy": 2.7632827162742615, "loss/hidden": 0.0, "loss/logits": 0.17476283386349678, "loss/reg": 0.5828325748443604, "step": 405 }, { "epoch": 0.00406, "grad_norm": 0.4073428213596344, "grad_norm_var": 0.0009128585412395622, "learning_rate": 5e-05, "loss": 0.1763, "loss/crossentropy": 2.790923058986664, "loss/hidden": 0.0, "loss/logits": 0.17633340135216713, "loss/reg": 0.5823079943656921, "step": 406 }, { "epoch": 0.00407, "grad_norm": 0.4071415066719055, "grad_norm_var": 0.0008969013824560872, "learning_rate": 5e-05, "loss": 0.1806, "loss/crossentropy": 2.788071632385254, "loss/hidden": 0.0, "loss/logits": 0.18063032627105713, "loss/reg": 0.5823886394500732, "step": 407 }, { "epoch": 0.00408, "grad_norm": 0.5077223181724548, "grad_norm_var": 0.0013486116025118104, "learning_rate": 5e-05, "loss": 0.1949, "loss/crossentropy": 2.7864105701446533, "loss/hidden": 0.0, "loss/logits": 0.19490304216742516, "loss/reg": 0.5818777084350586, "step": 408 }, { "epoch": 0.00409, "grad_norm": 0.563755989074707, "grad_norm_var": 0.00266179494699698, "learning_rate": 5e-05, "loss": 0.1802, "loss/crossentropy": 2.7841888666152954, "loss/hidden": 0.0, "loss/logits": 0.18019061908125877, "loss/reg": 0.5819647312164307, "step": 409 }, { "epoch": 0.0041, "grad_norm": 0.4199267029762268, "grad_norm_var": 0.0024883634860263276, "learning_rate": 5e-05, "loss": 0.1837, "loss/crossentropy": 2.7937299013137817, "loss/hidden": 0.0, "loss/logits": 0.18373947218060493, "loss/reg": 0.5819553732872009, "step": 410 }, { "epoch": 0.00411, "grad_norm": 0.3726835250854492, "grad_norm_var": 0.002519147756132034, "learning_rate": 5e-05, "loss": 0.1641, "loss/crossentropy": 2.9953745007514954, "loss/hidden": 0.0, "loss/logits": 0.16414344683289528, "loss/reg": 0.5821545124053955, "step": 411 }, { "epoch": 0.00412, "grad_norm": 0.37909170985221863, "grad_norm_var": 0.002555746768842804, "learning_rate": 5e-05, "loss": 0.1698, "loss/crossentropy": 2.8532305359840393, "loss/hidden": 0.0, "loss/logits": 0.16976147145032883, "loss/reg": 0.5821978449821472, "step": 412 }, { "epoch": 0.00413, "grad_norm": 0.4100850522518158, "grad_norm_var": 0.002559745280501687, "learning_rate": 5e-05, "loss": 0.1926, "loss/crossentropy": 2.5914472341537476, "loss/hidden": 0.0, "loss/logits": 0.1925639696419239, "loss/reg": 0.5821495652198792, "step": 413 }, { "epoch": 0.00414, "grad_norm": 0.3752373158931732, "grad_norm_var": 0.002672192520969274, "learning_rate": 5e-05, "loss": 0.1754, "loss/crossentropy": 2.9987975358963013, "loss/hidden": 0.0, "loss/logits": 0.1754269115626812, "loss/reg": 0.5823691487312317, "step": 414 }, { "epoch": 0.00415, "grad_norm": 0.3841235935688019, "grad_norm_var": 0.002693382744506941, "learning_rate": 5e-05, "loss": 0.1758, "loss/crossentropy": 2.763660728931427, "loss/hidden": 0.0, "loss/logits": 0.1757623367011547, "loss/reg": 0.582161545753479, "step": 415 }, { "epoch": 0.00416, "grad_norm": 0.3570231795310974, "grad_norm_var": 0.00288379169742277, "learning_rate": 5e-05, "loss": 0.168, "loss/crossentropy": 2.758460223674774, "loss/hidden": 0.0, "loss/logits": 0.16795291006565094, "loss/reg": 0.5821666121482849, "step": 416 }, { "epoch": 0.00417, "grad_norm": 0.3734234571456909, "grad_norm_var": 0.002870674459150694, "learning_rate": 5e-05, "loss": 0.1774, "loss/crossentropy": 2.7249505519866943, "loss/hidden": 0.0, "loss/logits": 0.1773550808429718, "loss/reg": 0.5819055438041687, "step": 417 }, { "epoch": 0.00418, "grad_norm": 0.36981824040412903, "grad_norm_var": 0.0029715060864126527, "learning_rate": 5e-05, "loss": 0.1736, "loss/crossentropy": 2.8472041487693787, "loss/hidden": 0.0, "loss/logits": 0.17362413555383682, "loss/reg": 0.582258939743042, "step": 418 }, { "epoch": 0.00419, "grad_norm": 0.40299826860427856, "grad_norm_var": 0.002897389264898405, "learning_rate": 5e-05, "loss": 0.1756, "loss/crossentropy": 2.7846495509147644, "loss/hidden": 0.0, "loss/logits": 0.17561296746134758, "loss/reg": 0.5819419026374817, "step": 419 }, { "epoch": 0.0042, "grad_norm": 0.3982432186603546, "grad_norm_var": 0.0028998422079659155, "learning_rate": 5e-05, "loss": 0.1774, "loss/crossentropy": 2.8036774396896362, "loss/hidden": 0.0, "loss/logits": 0.17744813859462738, "loss/reg": 0.5817914605140686, "step": 420 }, { "epoch": 0.00421, "grad_norm": 0.396598219871521, "grad_norm_var": 0.0029053599081703243, "learning_rate": 5e-05, "loss": 0.1925, "loss/crossentropy": 2.659131944179535, "loss/hidden": 0.0, "loss/logits": 0.19252929836511612, "loss/reg": 0.5821592807769775, "step": 421 }, { "epoch": 0.00422, "grad_norm": 0.3717289865016937, "grad_norm_var": 0.002986925603063251, "learning_rate": 5e-05, "loss": 0.1704, "loss/crossentropy": 2.873345971107483, "loss/hidden": 0.0, "loss/logits": 0.1703544557094574, "loss/reg": 0.5818156003952026, "step": 422 }, { "epoch": 0.00423, "grad_norm": 0.40224766731262207, "grad_norm_var": 0.0029874166579480057, "learning_rate": 5e-05, "loss": 0.1825, "loss/crossentropy": 2.7183436155319214, "loss/hidden": 0.0, "loss/logits": 0.18251808360219002, "loss/reg": 0.5813149213790894, "step": 423 }, { "epoch": 0.00424, "grad_norm": 0.3594338297843933, "grad_norm_var": 0.0023365710890668745, "learning_rate": 5e-05, "loss": 0.1705, "loss/crossentropy": 2.755337953567505, "loss/hidden": 0.0, "loss/logits": 0.17050115019083023, "loss/reg": 0.5807434320449829, "step": 424 }, { "epoch": 0.00425, "grad_norm": 0.4195038378238678, "grad_norm_var": 0.000411062438941201, "learning_rate": 5e-05, "loss": 0.1815, "loss/crossentropy": 2.7850858569145203, "loss/hidden": 0.0, "loss/logits": 0.1814507134258747, "loss/reg": 0.580720841884613, "step": 425 }, { "epoch": 0.00426, "grad_norm": 0.39549335837364197, "grad_norm_var": 0.00034114024216596407, "learning_rate": 5e-05, "loss": 0.1831, "loss/crossentropy": 2.7257344126701355, "loss/hidden": 0.0, "loss/logits": 0.18313366174697876, "loss/reg": 0.5806552767753601, "step": 426 }, { "epoch": 0.00427, "grad_norm": 0.38917210698127747, "grad_norm_var": 0.00032999221643899014, "learning_rate": 5e-05, "loss": 0.1804, "loss/crossentropy": 2.70923113822937, "loss/hidden": 0.0, "loss/logits": 0.18044820055365562, "loss/reg": 0.5801239609718323, "step": 427 }, { "epoch": 0.00428, "grad_norm": 0.3752542734146118, "grad_norm_var": 0.00033471019929868227, "learning_rate": 5e-05, "loss": 0.1885, "loss/crossentropy": 2.737777829170227, "loss/hidden": 0.0, "loss/logits": 0.18848220631480217, "loss/reg": 0.5796414017677307, "step": 428 }, { "epoch": 0.00429, "grad_norm": 0.3592444062232971, "grad_norm_var": 0.00033484942441988077, "learning_rate": 5e-05, "loss": 0.1746, "loss/crossentropy": 2.847250282764435, "loss/hidden": 0.0, "loss/logits": 0.1746002770960331, "loss/reg": 0.5792865753173828, "step": 429 }, { "epoch": 0.0043, "grad_norm": 0.5438507795333862, "grad_norm_var": 0.001935067170886736, "learning_rate": 5e-05, "loss": 0.1786, "loss/crossentropy": 2.7627553939819336, "loss/hidden": 0.0, "loss/logits": 0.17859209701418877, "loss/reg": 0.5787414908409119, "step": 430 }, { "epoch": 0.00431, "grad_norm": 0.4011813700199127, "grad_norm_var": 0.0019316205614038535, "learning_rate": 5e-05, "loss": 0.1739, "loss/crossentropy": 2.7840686440467834, "loss/hidden": 0.0, "loss/logits": 0.17386120185256004, "loss/reg": 0.5786591172218323, "step": 431 }, { "epoch": 0.00432, "grad_norm": 0.36942005157470703, "grad_norm_var": 0.0018789475137448997, "learning_rate": 5e-05, "loss": 0.1735, "loss/crossentropy": 2.7540953755378723, "loss/hidden": 0.0, "loss/logits": 0.17347709089517593, "loss/reg": 0.5786066651344299, "step": 432 }, { "epoch": 0.00433, "grad_norm": 0.36542290449142456, "grad_norm_var": 0.0019064721419176404, "learning_rate": 5e-05, "loss": 0.1631, "loss/crossentropy": 2.794360876083374, "loss/hidden": 0.0, "loss/logits": 0.16313519701361656, "loss/reg": 0.5781553983688354, "step": 433 }, { "epoch": 0.00434, "grad_norm": 0.5787592530250549, "grad_norm_var": 0.003934136579948344, "learning_rate": 5e-05, "loss": 0.193, "loss/crossentropy": 2.953792989253998, "loss/hidden": 0.0, "loss/logits": 0.19302739575505257, "loss/reg": 0.57806396484375, "step": 434 }, { "epoch": 0.00435, "grad_norm": 0.4253801107406616, "grad_norm_var": 0.003950416307799391, "learning_rate": 5e-05, "loss": 0.1833, "loss/crossentropy": 2.776081919670105, "loss/hidden": 0.0, "loss/logits": 0.18330252915620804, "loss/reg": 0.5776569843292236, "step": 435 }, { "epoch": 0.00436, "grad_norm": 0.42276060581207275, "grad_norm_var": 0.003951404670663339, "learning_rate": 5e-05, "loss": 0.1824, "loss/crossentropy": 2.806955397129059, "loss/hidden": 0.0, "loss/logits": 0.18240658938884735, "loss/reg": 0.5772163271903992, "step": 436 }, { "epoch": 0.00437, "grad_norm": 0.403897762298584, "grad_norm_var": 0.003940751373217266, "learning_rate": 5e-05, "loss": 0.1804, "loss/crossentropy": 2.8416900038719177, "loss/hidden": 0.0, "loss/logits": 0.18038833513855934, "loss/reg": 0.5768018364906311, "step": 437 }, { "epoch": 0.00438, "grad_norm": 0.576501727104187, "grad_norm_var": 0.005477755146007354, "learning_rate": 5e-05, "loss": 0.204, "loss/crossentropy": 2.7251105904579163, "loss/hidden": 0.0, "loss/logits": 0.20401697978377342, "loss/reg": 0.5766705870628357, "step": 438 }, { "epoch": 0.00439, "grad_norm": 0.4603712558746338, "grad_norm_var": 0.005518618715385637, "learning_rate": 5e-05, "loss": 0.1889, "loss/crossentropy": 2.7764219641685486, "loss/hidden": 0.0, "loss/logits": 0.18892782926559448, "loss/reg": 0.5762342810630798, "step": 439 }, { "epoch": 0.0044, "grad_norm": 0.4142100512981415, "grad_norm_var": 0.0052064468857804655, "learning_rate": 5e-05, "loss": 0.1744, "loss/crossentropy": 2.768882632255554, "loss/hidden": 0.0, "loss/logits": 0.174424946308136, "loss/reg": 0.5760279893875122, "step": 440 }, { "epoch": 0.00441, "grad_norm": 0.4051181972026825, "grad_norm_var": 0.005241962005983114, "learning_rate": 5e-05, "loss": 0.1785, "loss/crossentropy": 2.680642604827881, "loss/hidden": 0.0, "loss/logits": 0.1785084791481495, "loss/reg": 0.5755354166030884, "step": 441 }, { "epoch": 0.00442, "grad_norm": 0.38737961649894714, "grad_norm_var": 0.00528381522509556, "learning_rate": 5e-05, "loss": 0.1846, "loss/crossentropy": 2.8040342926979065, "loss/hidden": 0.0, "loss/logits": 0.18461211025714874, "loss/reg": 0.5751298069953918, "step": 442 }, { "epoch": 0.00443, "grad_norm": 0.4075280427932739, "grad_norm_var": 0.005205266926479564, "learning_rate": 5e-05, "loss": 0.1809, "loss/crossentropy": 2.789302349090576, "loss/hidden": 0.0, "loss/logits": 0.18090900778770447, "loss/reg": 0.5749806761741638, "step": 443 }, { "epoch": 0.00444, "grad_norm": 0.38606885075569153, "grad_norm_var": 0.005132169152184037, "learning_rate": 5e-05, "loss": 0.1783, "loss/crossentropy": 2.7104042172431946, "loss/hidden": 0.0, "loss/logits": 0.1783064529299736, "loss/reg": 0.5747793316841125, "step": 444 }, { "epoch": 0.00445, "grad_norm": 0.4039571285247803, "grad_norm_var": 0.004825201756458593, "learning_rate": 5e-05, "loss": 0.193, "loss/crossentropy": 2.647743821144104, "loss/hidden": 0.0, "loss/logits": 0.19295665621757507, "loss/reg": 0.5744392275810242, "step": 445 }, { "epoch": 0.00446, "grad_norm": 0.3933171331882477, "grad_norm_var": 0.004046437993535908, "learning_rate": 5e-05, "loss": 0.1906, "loss/crossentropy": 2.695836305618286, "loss/hidden": 0.0, "loss/logits": 0.19062871485948563, "loss/reg": 0.5743327736854553, "step": 446 }, { "epoch": 0.00447, "grad_norm": 0.3783518373966217, "grad_norm_var": 0.004151757026561261, "learning_rate": 5e-05, "loss": 0.1758, "loss/crossentropy": 2.738230884075165, "loss/hidden": 0.0, "loss/logits": 0.17578332498669624, "loss/reg": 0.5737802386283875, "step": 447 }, { "epoch": 0.00448, "grad_norm": 0.4196593463420868, "grad_norm_var": 0.00394622430680673, "learning_rate": 5e-05, "loss": 0.1849, "loss/crossentropy": 2.7796876430511475, "loss/hidden": 0.0, "loss/logits": 0.1849343292415142, "loss/reg": 0.5736260414123535, "step": 448 }, { "epoch": 0.00449, "grad_norm": 0.4242750406265259, "grad_norm_var": 0.0036811315635943273, "learning_rate": 5e-05, "loss": 0.1912, "loss/crossentropy": 2.7677462100982666, "loss/hidden": 0.0, "loss/logits": 0.1912018582224846, "loss/reg": 0.5737444162368774, "step": 449 }, { "epoch": 0.0045, "grad_norm": 0.4159826338291168, "grad_norm_var": 0.0021187643442514077, "learning_rate": 5e-05, "loss": 0.1837, "loss/crossentropy": 2.924024760723114, "loss/hidden": 0.0, "loss/logits": 0.18373457342386246, "loss/reg": 0.5738208293914795, "step": 450 }, { "epoch": 0.00451, "grad_norm": 0.4384543299674988, "grad_norm_var": 0.0021383080222619664, "learning_rate": 5e-05, "loss": 0.1858, "loss/crossentropy": 2.828491747379303, "loss/hidden": 0.0, "loss/logits": 0.18577688932418823, "loss/reg": 0.5741365551948547, "step": 451 }, { "epoch": 0.00452, "grad_norm": 0.36623266339302063, "grad_norm_var": 0.0023256149774210128, "learning_rate": 5e-05, "loss": 0.1706, "loss/crossentropy": 2.591509759426117, "loss/hidden": 0.0, "loss/logits": 0.17055655643343925, "loss/reg": 0.5744441151618958, "step": 452 }, { "epoch": 0.00453, "grad_norm": 0.3640042841434479, "grad_norm_var": 0.002497869200450313, "learning_rate": 5e-05, "loss": 0.1712, "loss/crossentropy": 2.8003897666931152, "loss/hidden": 0.0, "loss/logits": 0.1711767576634884, "loss/reg": 0.5742901563644409, "step": 453 }, { "epoch": 0.00454, "grad_norm": 0.3884834945201874, "grad_norm_var": 0.0006608076805941072, "learning_rate": 5e-05, "loss": 0.1786, "loss/crossentropy": 2.855776846408844, "loss/hidden": 0.0, "loss/logits": 0.17857619374990463, "loss/reg": 0.5738571882247925, "step": 454 }, { "epoch": 0.00455, "grad_norm": 0.39492306113243103, "grad_norm_var": 0.0004308201461168763, "learning_rate": 5e-05, "loss": 0.1731, "loss/crossentropy": 2.7390910387039185, "loss/hidden": 0.0, "loss/logits": 0.17305242642760277, "loss/reg": 0.5741905570030212, "step": 455 }, { "epoch": 0.00456, "grad_norm": 0.4077781140804291, "grad_norm_var": 0.00042057323504788674, "learning_rate": 5e-05, "loss": 0.1762, "loss/crossentropy": 2.894853889942169, "loss/hidden": 0.0, "loss/logits": 0.17618122324347496, "loss/reg": 0.5738011598587036, "step": 456 }, { "epoch": 0.00457, "grad_norm": 0.4236481189727783, "grad_norm_var": 0.0004575329852199143, "learning_rate": 5e-05, "loss": 0.1923, "loss/crossentropy": 2.8220651149749756, "loss/hidden": 0.0, "loss/logits": 0.19226129725575447, "loss/reg": 0.5731528997421265, "step": 457 }, { "epoch": 0.00458, "grad_norm": 0.37275391817092896, "grad_norm_var": 0.0004955186745842462, "learning_rate": 5e-05, "loss": 0.1715, "loss/crossentropy": 2.798386514186859, "loss/hidden": 0.0, "loss/logits": 0.17149720713496208, "loss/reg": 0.57298344373703, "step": 458 }, { "epoch": 0.00459, "grad_norm": 0.3688414990901947, "grad_norm_var": 0.0005455269613291791, "learning_rate": 5e-05, "loss": 0.1741, "loss/crossentropy": 2.805033504962921, "loss/hidden": 0.0, "loss/logits": 0.1740814931690693, "loss/reg": 0.5725979208946228, "step": 459 }, { "epoch": 0.0046, "grad_norm": 0.38976651430130005, "grad_norm_var": 0.000541154555367133, "learning_rate": 5e-05, "loss": 0.1688, "loss/crossentropy": 2.7494282126426697, "loss/hidden": 0.0, "loss/logits": 0.16880641877651215, "loss/reg": 0.5721160769462585, "step": 460 }, { "epoch": 0.00461, "grad_norm": 0.3831474781036377, "grad_norm_var": 0.00054864385134518, "learning_rate": 5e-05, "loss": 0.1792, "loss/crossentropy": 2.7287977933883667, "loss/hidden": 0.0, "loss/logits": 0.17921587824821472, "loss/reg": 0.5715569257736206, "step": 461 }, { "epoch": 0.00462, "grad_norm": 0.36860400438308716, "grad_norm_var": 0.0005943412689863537, "learning_rate": 5e-05, "loss": 0.1697, "loss/crossentropy": 2.7832899689674377, "loss/hidden": 0.0, "loss/logits": 0.16973211243748665, "loss/reg": 0.5710586905479431, "step": 462 }, { "epoch": 0.00463, "grad_norm": 0.44674229621887207, "grad_norm_var": 0.0007434618207279531, "learning_rate": 5e-05, "loss": 0.1849, "loss/crossentropy": 2.6440680027008057, "loss/hidden": 0.0, "loss/logits": 0.18494433909654617, "loss/reg": 0.5704825520515442, "step": 463 }, { "epoch": 0.00464, "grad_norm": 0.3807585835456848, "grad_norm_var": 0.0007274162210646436, "learning_rate": 5e-05, "loss": 0.1809, "loss/crossentropy": 2.8867653608322144, "loss/hidden": 0.0, "loss/logits": 0.18089880049228668, "loss/reg": 0.5697844624519348, "step": 464 }, { "epoch": 0.00465, "grad_norm": 0.5407595634460449, "grad_norm_var": 0.0020161607329257534, "learning_rate": 5e-05, "loss": 0.1879, "loss/crossentropy": 2.953329920768738, "loss/hidden": 0.0, "loss/logits": 0.1879200041294098, "loss/reg": 0.5693488121032715, "step": 465 }, { "epoch": 0.00466, "grad_norm": 0.3990873694419861, "grad_norm_var": 0.002005160916023489, "learning_rate": 5e-05, "loss": 0.1774, "loss/crossentropy": 2.7261569499969482, "loss/hidden": 0.0, "loss/logits": 0.17743229493498802, "loss/reg": 0.5683730244636536, "step": 466 }, { "epoch": 0.00467, "grad_norm": 0.5567159652709961, "grad_norm_var": 0.003452137605292253, "learning_rate": 5e-05, "loss": 0.1781, "loss/crossentropy": 2.9425289630889893, "loss/hidden": 0.0, "loss/logits": 0.17811905220150948, "loss/reg": 0.5678460597991943, "step": 467 }, { "epoch": 0.00468, "grad_norm": 0.3896469473838806, "grad_norm_var": 0.0033512772240770834, "learning_rate": 5e-05, "loss": 0.1667, "loss/crossentropy": 2.7407366037368774, "loss/hidden": 0.0, "loss/logits": 0.16665565222501755, "loss/reg": 0.5669754147529602, "step": 468 }, { "epoch": 0.00469, "grad_norm": 0.4358081519603729, "grad_norm_var": 0.003223787268375455, "learning_rate": 5e-05, "loss": 0.1805, "loss/crossentropy": 2.8806650042533875, "loss/hidden": 0.0, "loss/logits": 0.1804743930697441, "loss/reg": 0.5666906237602234, "step": 469 }, { "epoch": 0.0047, "grad_norm": 0.4263615906238556, "grad_norm_var": 0.0031771834961121237, "learning_rate": 5e-05, "loss": 0.1724, "loss/crossentropy": 2.828365921974182, "loss/hidden": 0.0, "loss/logits": 0.1723964773118496, "loss/reg": 0.565990149974823, "step": 470 }, { "epoch": 0.00471, "grad_norm": 0.4618017077445984, "grad_norm_var": 0.0032524306965235894, "learning_rate": 5e-05, "loss": 0.1878, "loss/crossentropy": 2.679270625114441, "loss/hidden": 0.0, "loss/logits": 0.18778762221336365, "loss/reg": 0.565496027469635, "step": 471 }, { "epoch": 0.00472, "grad_norm": 0.38135483860969543, "grad_norm_var": 0.0033462215580461288, "learning_rate": 5e-05, "loss": 0.171, "loss/crossentropy": 2.6836743354797363, "loss/hidden": 0.0, "loss/logits": 0.17096831649541855, "loss/reg": 0.5650753974914551, "step": 472 }, { "epoch": 0.00473, "grad_norm": 0.37833553552627563, "grad_norm_var": 0.0034546972469660393, "learning_rate": 5e-05, "loss": 0.1729, "loss/crossentropy": 2.788007378578186, "loss/hidden": 0.0, "loss/logits": 0.1729329563677311, "loss/reg": 0.5645444989204407, "step": 473 }, { "epoch": 0.00474, "grad_norm": 0.4066854417324066, "grad_norm_var": 0.0033240787387435076, "learning_rate": 5e-05, "loss": 0.1792, "loss/crossentropy": 2.7353034019470215, "loss/hidden": 0.0, "loss/logits": 0.17924946174025536, "loss/reg": 0.5641481876373291, "step": 474 }, { "epoch": 0.00475, "grad_norm": 0.4183666408061981, "grad_norm_var": 0.0031418613226838255, "learning_rate": 5e-05, "loss": 0.202, "loss/crossentropy": 2.758584201335907, "loss/hidden": 0.0, "loss/logits": 0.20200031250715256, "loss/reg": 0.5638066530227661, "step": 475 }, { "epoch": 0.00476, "grad_norm": 0.3630387783050537, "grad_norm_var": 0.003304039972764594, "learning_rate": 5e-05, "loss": 0.178, "loss/crossentropy": 2.6706173419952393, "loss/hidden": 0.0, "loss/logits": 0.1780076026916504, "loss/reg": 0.5629574656486511, "step": 476 }, { "epoch": 0.00477, "grad_norm": 0.37611472606658936, "grad_norm_var": 0.003342696718253235, "learning_rate": 5e-05, "loss": 0.1811, "loss/crossentropy": 2.6658287048339844, "loss/hidden": 0.0, "loss/logits": 0.18105322867631912, "loss/reg": 0.5628162026405334, "step": 477 }, { "epoch": 0.00478, "grad_norm": 0.7620260715484619, "grad_norm_var": 0.0102870795647101, "learning_rate": 5e-05, "loss": 0.1895, "loss/crossentropy": 2.9203463792800903, "loss/hidden": 0.0, "loss/logits": 0.1895483024418354, "loss/reg": 0.562698245048523, "step": 478 }, { "epoch": 0.00479, "grad_norm": 0.4148297905921936, "grad_norm_var": 0.010344275088247907, "learning_rate": 5e-05, "loss": 0.1758, "loss/crossentropy": 2.8112802505493164, "loss/hidden": 0.0, "loss/logits": 0.1758042462170124, "loss/reg": 0.5627515912055969, "step": 479 }, { "epoch": 0.0048, "grad_norm": 0.3925195038318634, "grad_norm_var": 0.01025495604763204, "learning_rate": 5e-05, "loss": 0.1779, "loss/crossentropy": 2.7661134004592896, "loss/hidden": 0.0, "loss/logits": 0.17794544249773026, "loss/reg": 0.5628776550292969, "step": 480 }, { "epoch": 0.00481, "grad_norm": 0.3882812261581421, "grad_norm_var": 0.009740198250122001, "learning_rate": 5e-05, "loss": 0.1736, "loss/crossentropy": 2.8221606612205505, "loss/hidden": 0.0, "loss/logits": 0.17361542955040932, "loss/reg": 0.5624843239784241, "step": 481 }, { "epoch": 0.00482, "grad_norm": 0.41976743936538696, "grad_norm_var": 0.009669459339438295, "learning_rate": 5e-05, "loss": 0.1788, "loss/crossentropy": 2.7733758687973022, "loss/hidden": 0.0, "loss/logits": 0.17884841933846474, "loss/reg": 0.562309741973877, "step": 482 }, { "epoch": 0.00483, "grad_norm": 0.38955554366111755, "grad_norm_var": 0.008719294531001683, "learning_rate": 5e-05, "loss": 0.166, "loss/crossentropy": 2.795892119407654, "loss/hidden": 0.0, "loss/logits": 0.16601001098752022, "loss/reg": 0.5619499087333679, "step": 483 }, { "epoch": 0.00484, "grad_norm": 0.36897599697113037, "grad_norm_var": 0.008844211651655517, "learning_rate": 5e-05, "loss": 0.1657, "loss/crossentropy": 2.8755455017089844, "loss/hidden": 0.0, "loss/logits": 0.16571567207574844, "loss/reg": 0.5614863038063049, "step": 484 }, { "epoch": 0.00485, "grad_norm": 0.38269054889678955, "grad_norm_var": 0.008936846350090673, "learning_rate": 5e-05, "loss": 0.1764, "loss/crossentropy": 2.785321533679962, "loss/hidden": 0.0, "loss/logits": 0.176405381411314, "loss/reg": 0.561246931552887, "step": 485 }, { "epoch": 0.00486, "grad_norm": 0.3768419027328491, "grad_norm_var": 0.009052523346335361, "learning_rate": 5e-05, "loss": 0.1664, "loss/crossentropy": 2.76261168718338, "loss/hidden": 0.0, "loss/logits": 0.1664263717830181, "loss/reg": 0.560944676399231, "step": 486 }, { "epoch": 0.00487, "grad_norm": 0.38061046600341797, "grad_norm_var": 0.008985738598483665, "learning_rate": 5e-05, "loss": 0.1794, "loss/crossentropy": 2.771348774433136, "loss/hidden": 0.0, "loss/logits": 0.17944207042455673, "loss/reg": 0.5604755878448486, "step": 487 }, { "epoch": 0.00488, "grad_norm": 0.4170921742916107, "grad_norm_var": 0.008917156562034522, "learning_rate": 5e-05, "loss": 0.1822, "loss/crossentropy": 2.80866539478302, "loss/hidden": 0.0, "loss/logits": 0.18220976367592812, "loss/reg": 0.5599964261054993, "step": 488 }, { "epoch": 0.00489, "grad_norm": 0.3931379020214081, "grad_norm_var": 0.00885901465884101, "learning_rate": 5e-05, "loss": 0.1812, "loss/crossentropy": 2.758543074131012, "loss/hidden": 0.0, "loss/logits": 0.18124109134078026, "loss/reg": 0.559814989566803, "step": 489 }, { "epoch": 0.0049, "grad_norm": 0.37608602643013, "grad_norm_var": 0.008954143829608545, "learning_rate": 5e-05, "loss": 0.1692, "loss/crossentropy": 2.7592194080352783, "loss/hidden": 0.0, "loss/logits": 0.16919198632240295, "loss/reg": 0.559546172618866, "step": 490 }, { "epoch": 0.00491, "grad_norm": 0.43985411524772644, "grad_norm_var": 0.008996239148172795, "learning_rate": 5e-05, "loss": 0.1807, "loss/crossentropy": 2.764371871948242, "loss/hidden": 0.0, "loss/logits": 0.18073590844869614, "loss/reg": 0.5591524839401245, "step": 491 }, { "epoch": 0.00492, "grad_norm": 0.41151654720306396, "grad_norm_var": 0.008806683601152191, "learning_rate": 5e-05, "loss": 0.1826, "loss/crossentropy": 2.7623562812805176, "loss/hidden": 0.0, "loss/logits": 0.18255820870399475, "loss/reg": 0.558942437171936, "step": 492 }, { "epoch": 0.00493, "grad_norm": 0.40937989950180054, "grad_norm_var": 0.008689541522826557, "learning_rate": 5e-05, "loss": 0.1896, "loss/crossentropy": 2.69577294588089, "loss/hidden": 0.0, "loss/logits": 0.18956592679023743, "loss/reg": 0.5589540600776672, "step": 493 }, { "epoch": 0.00494, "grad_norm": 0.3867286145687103, "grad_norm_var": 0.0003875831525608183, "learning_rate": 5e-05, "loss": 0.1801, "loss/crossentropy": 2.7451700568199158, "loss/hidden": 0.0, "loss/logits": 0.1801215335726738, "loss/reg": 0.5587448477745056, "step": 494 }, { "epoch": 0.00495, "grad_norm": 0.3616257309913635, "grad_norm_var": 0.00043618572384384046, "learning_rate": 5e-05, "loss": 0.1697, "loss/crossentropy": 2.7521727681159973, "loss/hidden": 0.0, "loss/logits": 0.1697472371160984, "loss/reg": 0.5585340857505798, "step": 495 }, { "epoch": 0.00496, "grad_norm": 0.3907410204410553, "grad_norm_var": 0.00043659611188001657, "learning_rate": 5e-05, "loss": 0.1794, "loss/crossentropy": 2.6787723302841187, "loss/hidden": 0.0, "loss/logits": 0.17943834513425827, "loss/reg": 0.5581747889518738, "step": 496 }, { "epoch": 0.00497, "grad_norm": 0.4027113914489746, "grad_norm_var": 0.0004399439948514883, "learning_rate": 5e-05, "loss": 0.1791, "loss/crossentropy": 2.8013442754745483, "loss/hidden": 0.0, "loss/logits": 0.17905466258525848, "loss/reg": 0.5585915446281433, "step": 497 }, { "epoch": 0.00498, "grad_norm": 0.3661331832408905, "grad_norm_var": 0.00043694638113240815, "learning_rate": 5e-05, "loss": 0.1788, "loss/crossentropy": 2.74288672208786, "loss/hidden": 0.0, "loss/logits": 0.17878345400094986, "loss/reg": 0.5589151382446289, "step": 498 }, { "epoch": 0.00499, "grad_norm": 0.3563874363899231, "grad_norm_var": 0.0005114511181728454, "learning_rate": 5e-05, "loss": 0.1752, "loss/crossentropy": 2.7776819467544556, "loss/hidden": 0.0, "loss/logits": 0.17516036331653595, "loss/reg": 0.5589350461959839, "step": 499 }, { "epoch": 0.005, "grad_norm": 0.4111122190952301, "grad_norm_var": 0.0005111437558825927, "learning_rate": 5e-05, "loss": 0.1978, "loss/crossentropy": 2.7738694548606873, "loss/hidden": 0.0, "loss/logits": 0.1978212594985962, "loss/reg": 0.559177041053772, "step": 500 }, { "epoch": 0.00501, "grad_norm": 0.3646180033683777, "grad_norm_var": 0.0005525817665442024, "learning_rate": 5e-05, "loss": 0.1713, "loss/crossentropy": 2.795500636100769, "loss/hidden": 0.0, "loss/logits": 0.17134494706988335, "loss/reg": 0.559291660785675, "step": 501 }, { "epoch": 0.00502, "grad_norm": 0.3951560854911804, "grad_norm_var": 0.0005407157299892162, "learning_rate": 5e-05, "loss": 0.1846, "loss/crossentropy": 2.851865530014038, "loss/hidden": 0.0, "loss/logits": 0.18456299602985382, "loss/reg": 0.5589885115623474, "step": 502 }, { "epoch": 0.00503, "grad_norm": 0.3487588167190552, "grad_norm_var": 0.0006500759070847906, "learning_rate": 5e-05, "loss": 0.1651, "loss/crossentropy": 2.7636953592300415, "loss/hidden": 0.0, "loss/logits": 0.16514211148023605, "loss/reg": 0.5592247843742371, "step": 503 }, { "epoch": 0.00504, "grad_norm": 0.34490668773651123, "grad_norm_var": 0.0007096021809185893, "learning_rate": 5e-05, "loss": 0.1649, "loss/crossentropy": 2.7371578216552734, "loss/hidden": 0.0, "loss/logits": 0.16494091972708702, "loss/reg": 0.5589237809181213, "step": 504 }, { "epoch": 0.00505, "grad_norm": 0.368867427110672, "grad_norm_var": 0.0007198515639204299, "learning_rate": 5e-05, "loss": 0.1762, "loss/crossentropy": 2.749998986721039, "loss/hidden": 0.0, "loss/logits": 0.1762167476117611, "loss/reg": 0.5585927963256836, "step": 505 }, { "epoch": 0.00506, "grad_norm": 0.38909733295440674, "grad_norm_var": 0.0007177240002733824, "learning_rate": 5e-05, "loss": 0.1778, "loss/crossentropy": 2.8114724159240723, "loss/hidden": 0.0, "loss/logits": 0.17781662940979004, "loss/reg": 0.5580586791038513, "step": 506 }, { "epoch": 0.00507, "grad_norm": 0.48178917169570923, "grad_norm_var": 0.0011386765733061707, "learning_rate": 5e-05, "loss": 0.2158, "loss/crossentropy": 2.827729344367981, "loss/hidden": 0.0, "loss/logits": 0.21578003093600273, "loss/reg": 0.5576668381690979, "step": 507 }, { "epoch": 0.00508, "grad_norm": 0.3689277768135071, "grad_norm_var": 0.0011119452313765542, "learning_rate": 5e-05, "loss": 0.1813, "loss/crossentropy": 2.731115460395813, "loss/hidden": 0.0, "loss/logits": 0.1813487969338894, "loss/reg": 0.5570773482322693, "step": 508 }, { "epoch": 0.00509, "grad_norm": 0.3963679373264313, "grad_norm_var": 0.001078813752766535, "learning_rate": 5e-05, "loss": 0.1756, "loss/crossentropy": 2.90335613489151, "loss/hidden": 0.0, "loss/logits": 0.1756189428269863, "loss/reg": 0.5566422343254089, "step": 509 }, { "epoch": 0.0051, "grad_norm": 0.3818899393081665, "grad_norm_var": 0.001078110574975677, "learning_rate": 5e-05, "loss": 0.1758, "loss/crossentropy": 2.6921775341033936, "loss/hidden": 0.0, "loss/logits": 0.17579015344381332, "loss/reg": 0.5562159419059753, "step": 510 }, { "epoch": 0.00511, "grad_norm": 0.4105952978134155, "grad_norm_var": 0.0010879833648030654, "learning_rate": 5e-05, "loss": 0.1876, "loss/crossentropy": 2.76626718044281, "loss/hidden": 0.0, "loss/logits": 0.18759262561798096, "loss/reg": 0.5560811758041382, "step": 511 }, { "epoch": 0.00512, "grad_norm": 0.40013277530670166, "grad_norm_var": 0.001099271844169832, "learning_rate": 5e-05, "loss": 0.1859, "loss/crossentropy": 2.7505248188972473, "loss/hidden": 0.0, "loss/logits": 0.18590008467435837, "loss/reg": 0.5556550025939941, "step": 512 }, { "epoch": 0.00513, "grad_norm": 0.3795160949230194, "grad_norm_var": 0.0010834282857788721, "learning_rate": 5e-05, "loss": 0.1776, "loss/crossentropy": 2.6976094245910645, "loss/hidden": 0.0, "loss/logits": 0.17762970924377441, "loss/reg": 0.5554805994033813, "step": 513 }, { "epoch": 0.00514, "grad_norm": 0.34423166513442993, "grad_norm_var": 0.0011692797982479981, "learning_rate": 5e-05, "loss": 0.1702, "loss/crossentropy": 2.765163004398346, "loss/hidden": 0.0, "loss/logits": 0.17023758962750435, "loss/reg": 0.5553342700004578, "step": 514 }, { "epoch": 0.00515, "grad_norm": 0.4963194727897644, "grad_norm_var": 0.0018798250462047712, "learning_rate": 5e-05, "loss": 0.1693, "loss/crossentropy": 3.0134795904159546, "loss/hidden": 0.0, "loss/logits": 0.1692982017993927, "loss/reg": 0.5548786520957947, "step": 515 }, { "epoch": 0.00516, "grad_norm": 0.45683982968330383, "grad_norm_var": 0.002123121039505079, "learning_rate": 5e-05, "loss": 0.1887, "loss/crossentropy": 2.725669205188751, "loss/hidden": 0.0, "loss/logits": 0.18869562819600105, "loss/reg": 0.5545822381973267, "step": 516 }, { "epoch": 0.00517, "grad_norm": 0.3576827347278595, "grad_norm_var": 0.0021546846477613437, "learning_rate": 5e-05, "loss": 0.1743, "loss/crossentropy": 2.6858309507369995, "loss/hidden": 0.0, "loss/logits": 0.17429396510124207, "loss/reg": 0.5538730621337891, "step": 517 }, { "epoch": 0.00518, "grad_norm": 0.37338805198669434, "grad_norm_var": 0.002184042818861409, "learning_rate": 5e-05, "loss": 0.1657, "loss/crossentropy": 2.6914477348327637, "loss/hidden": 0.0, "loss/logits": 0.16566015407443047, "loss/reg": 0.5536062717437744, "step": 518 }, { "epoch": 0.00519, "grad_norm": 0.3741968274116516, "grad_norm_var": 0.002072033986956075, "learning_rate": 5e-05, "loss": 0.1678, "loss/crossentropy": 2.605296790599823, "loss/hidden": 0.0, "loss/logits": 0.16782359406352043, "loss/reg": 0.552930474281311, "step": 519 }, { "epoch": 0.0052, "grad_norm": 0.4026655852794647, "grad_norm_var": 0.0018924758759032655, "learning_rate": 5e-05, "loss": 0.1669, "loss/crossentropy": 2.7116551995277405, "loss/hidden": 0.0, "loss/logits": 0.16693567857146263, "loss/reg": 0.5522305369377136, "step": 520 }, { "epoch": 0.00521, "grad_norm": 0.37725111842155457, "grad_norm_var": 0.0018632900526013517, "learning_rate": 5e-05, "loss": 0.1758, "loss/crossentropy": 2.618664503097534, "loss/hidden": 0.0, "loss/logits": 0.17579519376158714, "loss/reg": 0.5519070625305176, "step": 521 }, { "epoch": 0.00522, "grad_norm": 0.4184610843658447, "grad_norm_var": 0.0018767224642930902, "learning_rate": 5e-05, "loss": 0.1732, "loss/crossentropy": 2.879607856273651, "loss/hidden": 0.0, "loss/logits": 0.17317330464720726, "loss/reg": 0.5518190860748291, "step": 522 }, { "epoch": 0.00523, "grad_norm": 0.41720226407051086, "grad_norm_var": 0.0014440065576243222, "learning_rate": 5e-05, "loss": 0.1836, "loss/crossentropy": 2.977618157863617, "loss/hidden": 0.0, "loss/logits": 0.18360265716910362, "loss/reg": 0.5513535737991333, "step": 523 }, { "epoch": 0.00524, "grad_norm": 0.4283323884010315, "grad_norm_var": 0.0014403980049692345, "learning_rate": 5e-05, "loss": 0.1672, "loss/crossentropy": 2.6983245611190796, "loss/hidden": 0.0, "loss/logits": 0.16721409559249878, "loss/reg": 0.5508003830909729, "step": 524 }, { "epoch": 0.00525, "grad_norm": 0.39922916889190674, "grad_norm_var": 0.0014391646512415571, "learning_rate": 5e-05, "loss": 0.178, "loss/crossentropy": 2.8782798051834106, "loss/hidden": 0.0, "loss/logits": 0.1780015379190445, "loss/reg": 0.55071622133255, "step": 525 }, { "epoch": 0.00526, "grad_norm": 0.37440040707588196, "grad_norm_var": 0.0014618745832183403, "learning_rate": 5e-05, "loss": 0.1733, "loss/crossentropy": 2.8488889932632446, "loss/hidden": 0.0, "loss/logits": 0.1732909493148327, "loss/reg": 0.5507029891014099, "step": 526 }, { "epoch": 0.00527, "grad_norm": 0.382803350687027, "grad_norm_var": 0.001473306245922131, "learning_rate": 5e-05, "loss": 0.166, "loss/crossentropy": 2.831286072731018, "loss/hidden": 0.0, "loss/logits": 0.16601553559303284, "loss/reg": 0.5502115488052368, "step": 527 }, { "epoch": 0.00528, "grad_norm": 0.3767335116863251, "grad_norm_var": 0.0015037297523115853, "learning_rate": 5e-05, "loss": 0.1784, "loss/crossentropy": 2.878285050392151, "loss/hidden": 0.0, "loss/logits": 0.1784132830798626, "loss/reg": 0.5501455664634705, "step": 528 }, { "epoch": 0.00529, "grad_norm": 0.37205299735069275, "grad_norm_var": 0.0015250598616080506, "learning_rate": 5e-05, "loss": 0.1778, "loss/crossentropy": 2.7280725240707397, "loss/hidden": 0.0, "loss/logits": 0.17779358476400375, "loss/reg": 0.5502381324768066, "step": 529 }, { "epoch": 0.0053, "grad_norm": 0.38356900215148926, "grad_norm_var": 0.0013450739123886243, "learning_rate": 5e-05, "loss": 0.1761, "loss/crossentropy": 2.754570782184601, "loss/hidden": 0.0, "loss/logits": 0.17608623951673508, "loss/reg": 0.5498242378234863, "step": 530 }, { "epoch": 0.00531, "grad_norm": 0.4249264597892761, "grad_norm_var": 0.0007414839613229381, "learning_rate": 5e-05, "loss": 0.1887, "loss/crossentropy": 2.629789888858795, "loss/hidden": 0.0, "loss/logits": 0.18867794051766396, "loss/reg": 0.550035297870636, "step": 531 }, { "epoch": 0.00532, "grad_norm": 0.3683103621006012, "grad_norm_var": 0.0005011770026748648, "learning_rate": 5e-05, "loss": 0.173, "loss/crossentropy": 2.8216918110847473, "loss/hidden": 0.0, "loss/logits": 0.17304040119051933, "loss/reg": 0.5499457716941833, "step": 532 }, { "epoch": 0.00533, "grad_norm": 0.4460945725440979, "grad_norm_var": 0.0006152335964912344, "learning_rate": 5e-05, "loss": 0.1762, "loss/crossentropy": 2.864534854888916, "loss/hidden": 0.0, "loss/logits": 0.17616653442382812, "loss/reg": 0.5496830344200134, "step": 533 }, { "epoch": 0.00534, "grad_norm": 0.4277942180633545, "grad_norm_var": 0.0006436326744770682, "learning_rate": 5e-05, "loss": 0.1791, "loss/crossentropy": 2.739046812057495, "loss/hidden": 0.0, "loss/logits": 0.17914197221398354, "loss/reg": 0.5494042038917542, "step": 534 }, { "epoch": 0.00535, "grad_norm": 0.44801026582717896, "grad_norm_var": 0.0007461882713404459, "learning_rate": 5e-05, "loss": 0.1771, "loss/crossentropy": 2.8221105337142944, "loss/hidden": 0.0, "loss/logits": 0.1770734190940857, "loss/reg": 0.5490994453430176, "step": 535 }, { "epoch": 0.00536, "grad_norm": 0.3951584994792938, "grad_norm_var": 0.0007500350607376898, "learning_rate": 5e-05, "loss": 0.1763, "loss/crossentropy": 2.984010100364685, "loss/hidden": 0.0, "loss/logits": 0.17627639323472977, "loss/reg": 0.5489474534988403, "step": 536 }, { "epoch": 0.00537, "grad_norm": 0.44170045852661133, "grad_norm_var": 0.0007924955076504999, "learning_rate": 5e-05, "loss": 0.1882, "loss/crossentropy": 2.7797727584838867, "loss/hidden": 0.0, "loss/logits": 0.18819041550159454, "loss/reg": 0.548499584197998, "step": 537 }, { "epoch": 0.00538, "grad_norm": 0.4363236725330353, "grad_norm_var": 0.0008408090080865183, "learning_rate": 5e-05, "loss": 0.1822, "loss/crossentropy": 2.7309396862983704, "loss/hidden": 0.0, "loss/logits": 0.18217438086867332, "loss/reg": 0.5480769276618958, "step": 538 }, { "epoch": 0.00539, "grad_norm": 0.43851688504219055, "grad_norm_var": 0.0008963077132380537, "learning_rate": 5e-05, "loss": 0.1834, "loss/crossentropy": 2.694208800792694, "loss/hidden": 0.0, "loss/logits": 0.18335244432091713, "loss/reg": 0.5475513339042664, "step": 539 }, { "epoch": 0.0054, "grad_norm": 0.4670868217945099, "grad_norm_var": 0.00109008641831011, "learning_rate": 5e-05, "loss": 0.183, "loss/crossentropy": 2.782553732395172, "loss/hidden": 0.0, "loss/logits": 0.18300291150808334, "loss/reg": 0.5469756126403809, "step": 540 }, { "epoch": 0.00541, "grad_norm": 0.41480833292007446, "grad_norm_var": 0.0010799339677187824, "learning_rate": 5e-05, "loss": 0.1789, "loss/crossentropy": 2.8658891320228577, "loss/hidden": 0.0, "loss/logits": 0.1788952723145485, "loss/reg": 0.5468266606330872, "step": 541 }, { "epoch": 0.00542, "grad_norm": 0.39333972334861755, "grad_norm_var": 0.0010064117893725114, "learning_rate": 5e-05, "loss": 0.1816, "loss/crossentropy": 2.8483704328536987, "loss/hidden": 0.0, "loss/logits": 0.18157437071204185, "loss/reg": 0.5463190078735352, "step": 542 }, { "epoch": 0.00543, "grad_norm": 0.38449928164482117, "grad_norm_var": 0.0009996329266353224, "learning_rate": 5e-05, "loss": 0.1802, "loss/crossentropy": 2.89563649892807, "loss/hidden": 0.0, "loss/logits": 0.18023038655519485, "loss/reg": 0.5457070469856262, "step": 543 }, { "epoch": 0.00544, "grad_norm": 0.4127125144004822, "grad_norm_var": 0.0009032852477402376, "learning_rate": 5e-05, "loss": 0.1779, "loss/crossentropy": 2.717514157295227, "loss/hidden": 0.0, "loss/logits": 0.1778719685971737, "loss/reg": 0.5450869798660278, "step": 544 }, { "epoch": 0.00545, "grad_norm": 0.42192932963371277, "grad_norm_var": 0.0007669634152076406, "learning_rate": 5e-05, "loss": 0.1792, "loss/crossentropy": 2.822247326374054, "loss/hidden": 0.0, "loss/logits": 0.1792241632938385, "loss/reg": 0.5444886684417725, "step": 545 }, { "epoch": 0.00546, "grad_norm": 0.38074740767478943, "grad_norm_var": 0.0007808089398953795, "learning_rate": 5e-05, "loss": 0.1809, "loss/crossentropy": 2.7772040963172913, "loss/hidden": 0.0, "loss/logits": 0.18086504191160202, "loss/reg": 0.5441609621047974, "step": 546 }, { "epoch": 0.00547, "grad_norm": 0.4026728570461273, "grad_norm_var": 0.0007937971567138972, "learning_rate": 5e-05, "loss": 0.1869, "loss/crossentropy": 2.632952332496643, "loss/hidden": 0.0, "loss/logits": 0.18688174337148666, "loss/reg": 0.5435310006141663, "step": 547 }, { "epoch": 0.00548, "grad_norm": 0.3834827244281769, "grad_norm_var": 0.0007087122314574017, "learning_rate": 5e-05, "loss": 0.1668, "loss/crossentropy": 2.8526415824890137, "loss/hidden": 0.0, "loss/logits": 0.1667657606303692, "loss/reg": 0.5432454347610474, "step": 548 }, { "epoch": 0.00549, "grad_norm": 0.35930392146110535, "grad_norm_var": 0.0008593622049050157, "learning_rate": 5e-05, "loss": 0.1657, "loss/crossentropy": 2.8654907941818237, "loss/hidden": 0.0, "loss/logits": 0.16574327275156975, "loss/reg": 0.5429373383522034, "step": 549 }, { "epoch": 0.0055, "grad_norm": 0.37501123547554016, "grad_norm_var": 0.0009294104105828538, "learning_rate": 5e-05, "loss": 0.1733, "loss/crossentropy": 2.6172654032707214, "loss/hidden": 0.0, "loss/logits": 0.1732707992196083, "loss/reg": 0.5424562692642212, "step": 550 }, { "epoch": 0.00551, "grad_norm": 0.35893934965133667, "grad_norm_var": 0.00097036191428946, "learning_rate": 5e-05, "loss": 0.1644, "loss/crossentropy": 2.825605630874634, "loss/hidden": 0.0, "loss/logits": 0.16437287256121635, "loss/reg": 0.542082667350769, "step": 551 }, { "epoch": 0.00552, "grad_norm": 0.3661603033542633, "grad_norm_var": 0.0010576424960035376, "learning_rate": 5e-05, "loss": 0.1678, "loss/crossentropy": 2.8828023076057434, "loss/hidden": 0.0, "loss/logits": 0.16778088733553886, "loss/reg": 0.5416398644447327, "step": 552 }, { "epoch": 0.00553, "grad_norm": 0.3865301012992859, "grad_norm_var": 0.000958246275512605, "learning_rate": 5e-05, "loss": 0.176, "loss/crossentropy": 2.712374687194824, "loss/hidden": 0.0, "loss/logits": 0.17604910582304, "loss/reg": 0.5409717559814453, "step": 553 }, { "epoch": 0.00554, "grad_norm": 0.35436421632766724, "grad_norm_var": 0.000968888453220181, "learning_rate": 5e-05, "loss": 0.1688, "loss/crossentropy": 2.6958473324775696, "loss/hidden": 0.0, "loss/logits": 0.16875110194087029, "loss/reg": 0.5407248735427856, "step": 554 }, { "epoch": 0.00555, "grad_norm": 0.37209799885749817, "grad_norm_var": 0.0008482144647558365, "learning_rate": 5e-05, "loss": 0.1807, "loss/crossentropy": 2.5784429907798767, "loss/hidden": 0.0, "loss/logits": 0.18067141622304916, "loss/reg": 0.5404811501502991, "step": 555 }, { "epoch": 0.00556, "grad_norm": 0.39256706833839417, "grad_norm_var": 0.00042543603395958824, "learning_rate": 5e-05, "loss": 0.173, "loss/crossentropy": 2.858420431613922, "loss/hidden": 0.0, "loss/logits": 0.17301730066537857, "loss/reg": 0.5401744246482849, "step": 556 }, { "epoch": 0.00557, "grad_norm": 0.38052526116371155, "grad_norm_var": 0.0003623997720090107, "learning_rate": 5e-05, "loss": 0.179, "loss/crossentropy": 2.7944374680519104, "loss/hidden": 0.0, "loss/logits": 0.17900896444916725, "loss/reg": 0.539861261844635, "step": 557 }, { "epoch": 0.00558, "grad_norm": 0.41661277413368225, "grad_norm_var": 0.0004289413341459269, "learning_rate": 5e-05, "loss": 0.1855, "loss/crossentropy": 2.848187267780304, "loss/hidden": 0.0, "loss/logits": 0.1854848824441433, "loss/reg": 0.5397405624389648, "step": 558 }, { "epoch": 0.00559, "grad_norm": 0.38366085290908813, "grad_norm_var": 0.00042895849436531687, "learning_rate": 5e-05, "loss": 0.1844, "loss/crossentropy": 2.9321094155311584, "loss/hidden": 0.0, "loss/logits": 0.1844218708574772, "loss/reg": 0.5393415689468384, "step": 559 }, { "epoch": 0.0056, "grad_norm": 0.3829214572906494, "grad_norm_var": 0.0003712012348421371, "learning_rate": 5e-05, "loss": 0.1735, "loss/crossentropy": 2.7179561853408813, "loss/hidden": 0.0, "loss/logits": 0.1734691746532917, "loss/reg": 0.5390350222587585, "step": 560 }, { "epoch": 0.00561, "grad_norm": 0.37189996242523193, "grad_norm_var": 0.0002635871650183456, "learning_rate": 5e-05, "loss": 0.1756, "loss/crossentropy": 2.602396249771118, "loss/hidden": 0.0, "loss/logits": 0.1755872517824173, "loss/reg": 0.5390979051589966, "step": 561 }, { "epoch": 0.00562, "grad_norm": 0.6090081334114075, "grad_norm_var": 0.0035665512027914384, "learning_rate": 5e-05, "loss": 0.1906, "loss/crossentropy": 2.9679704904556274, "loss/hidden": 0.0, "loss/logits": 0.19063398614525795, "loss/reg": 0.5383903980255127, "step": 562 }, { "epoch": 0.00563, "grad_norm": 0.38416802883148193, "grad_norm_var": 0.0035652834241256906, "learning_rate": 5e-05, "loss": 0.1709, "loss/crossentropy": 2.7394378185272217, "loss/hidden": 0.0, "loss/logits": 0.17094270884990692, "loss/reg": 0.5380064845085144, "step": 563 }, { "epoch": 0.00564, "grad_norm": 0.3854963183403015, "grad_norm_var": 0.003563161971314547, "learning_rate": 5e-05, "loss": 0.1674, "loss/crossentropy": 2.719312608242035, "loss/hidden": 0.0, "loss/logits": 0.1674134097993374, "loss/reg": 0.5377374887466431, "step": 564 }, { "epoch": 0.00565, "grad_norm": 0.37196803092956543, "grad_norm_var": 0.00351720988929937, "learning_rate": 5e-05, "loss": 0.1745, "loss/crossentropy": 2.745683968067169, "loss/hidden": 0.0, "loss/logits": 0.17449527606368065, "loss/reg": 0.5376200079917908, "step": 565 }, { "epoch": 0.00566, "grad_norm": 0.35309478640556335, "grad_norm_var": 0.0036005151693585093, "learning_rate": 5e-05, "loss": 0.1617, "loss/crossentropy": 2.7752469778060913, "loss/hidden": 0.0, "loss/logits": 0.16166164726018906, "loss/reg": 0.5375326871871948, "step": 566 }, { "epoch": 0.00567, "grad_norm": 0.5402910709381104, "grad_norm_var": 0.004859629380434294, "learning_rate": 5e-05, "loss": 0.206, "loss/crossentropy": 2.7998775839805603, "loss/hidden": 0.0, "loss/logits": 0.20604126900434494, "loss/reg": 0.537605345249176, "step": 567 }, { "epoch": 0.00568, "grad_norm": 0.7805297374725342, "grad_norm_var": 0.013544015903013332, "learning_rate": 5e-05, "loss": 0.1961, "loss/crossentropy": 2.9325369000434875, "loss/hidden": 0.0, "loss/logits": 0.19606028869748116, "loss/reg": 0.5372644066810608, "step": 568 }, { "epoch": 0.00569, "grad_norm": 0.39921799302101135, "grad_norm_var": 0.013482046654875882, "learning_rate": 5e-05, "loss": 0.1758, "loss/crossentropy": 2.8327025771141052, "loss/hidden": 0.0, "loss/logits": 0.1757877767086029, "loss/reg": 0.5372633934020996, "step": 569 }, { "epoch": 0.0057, "grad_norm": 0.4382418692111969, "grad_norm_var": 0.013076977820547267, "learning_rate": 5e-05, "loss": 0.2058, "loss/crossentropy": 2.844972252845764, "loss/hidden": 0.0, "loss/logits": 0.20578501746058464, "loss/reg": 0.5372618436813354, "step": 570 }, { "epoch": 0.00571, "grad_norm": 0.4133698642253876, "grad_norm_var": 0.012836502354090598, "learning_rate": 5e-05, "loss": 0.1864, "loss/crossentropy": 2.917839288711548, "loss/hidden": 0.0, "loss/logits": 0.18642428889870644, "loss/reg": 0.5374752879142761, "step": 571 }, { "epoch": 0.00572, "grad_norm": 0.41047024726867676, "grad_norm_var": 0.012748743018850976, "learning_rate": 5e-05, "loss": 0.1779, "loss/crossentropy": 2.821923851966858, "loss/hidden": 0.0, "loss/logits": 0.17791394144296646, "loss/reg": 0.5372009873390198, "step": 572 }, { "epoch": 0.00573, "grad_norm": 0.402464359998703, "grad_norm_var": 0.012608236077279249, "learning_rate": 5e-05, "loss": 0.1818, "loss/crossentropy": 2.8350918292999268, "loss/hidden": 0.0, "loss/logits": 0.1818305253982544, "loss/reg": 0.5367937088012695, "step": 573 }, { "epoch": 0.00574, "grad_norm": 0.45223113894462585, "grad_norm_var": 0.012575445405596615, "learning_rate": 5e-05, "loss": 0.1699, "loss/crossentropy": 2.7930484414100647, "loss/hidden": 0.0, "loss/logits": 0.16991710662841797, "loss/reg": 0.5364768505096436, "step": 574 }, { "epoch": 0.00575, "grad_norm": 0.43352261185646057, "grad_norm_var": 0.0123400575930528, "learning_rate": 5e-05, "loss": 0.1782, "loss/crossentropy": 2.6940504908561707, "loss/hidden": 0.0, "loss/logits": 0.17819049209356308, "loss/reg": 0.5362961888313293, "step": 575 }, { "epoch": 0.00576, "grad_norm": 0.4019607901573181, "grad_norm_var": 0.012203711012744553, "learning_rate": 5e-05, "loss": 0.1778, "loss/crossentropy": 2.824812114238739, "loss/hidden": 0.0, "loss/logits": 0.17784994468092918, "loss/reg": 0.5359688401222229, "step": 576 }, { "epoch": 0.00577, "grad_norm": 0.3904159665107727, "grad_norm_var": 0.01204035888789203, "learning_rate": 5e-05, "loss": 0.1701, "loss/crossentropy": 2.8570236563682556, "loss/hidden": 0.0, "loss/logits": 0.17014898359775543, "loss/reg": 0.535413920879364, "step": 577 }, { "epoch": 0.00578, "grad_norm": 0.42199060320854187, "grad_norm_var": 0.010209071215430932, "learning_rate": 5e-05, "loss": 0.1795, "loss/crossentropy": 2.7174891233444214, "loss/hidden": 0.0, "loss/logits": 0.17948715761303902, "loss/reg": 0.5351020693778992, "step": 578 }, { "epoch": 0.00579, "grad_norm": 0.40046098828315735, "grad_norm_var": 0.01011259683339543, "learning_rate": 5e-05, "loss": 0.1733, "loss/crossentropy": 2.7127727270126343, "loss/hidden": 0.0, "loss/logits": 0.1733325570821762, "loss/reg": 0.5349224805831909, "step": 579 }, { "epoch": 0.0058, "grad_norm": 0.40743646025657654, "grad_norm_var": 0.009991334725998734, "learning_rate": 5e-05, "loss": 0.1885, "loss/crossentropy": 2.891405701637268, "loss/hidden": 0.0, "loss/logits": 0.18850185722112656, "loss/reg": 0.5345614552497864, "step": 580 }, { "epoch": 0.00581, "grad_norm": 0.395015150308609, "grad_norm_var": 0.009819763398026723, "learning_rate": 5e-05, "loss": 0.1801, "loss/crossentropy": 2.671793520450592, "loss/hidden": 0.0, "loss/logits": 0.18011539056897163, "loss/reg": 0.5344705581665039, "step": 581 }, { "epoch": 0.00582, "grad_norm": 0.3927249610424042, "grad_norm_var": 0.009458477967162345, "learning_rate": 5e-05, "loss": 0.1898, "loss/crossentropy": 2.6277888417243958, "loss/hidden": 0.0, "loss/logits": 0.18980396538972855, "loss/reg": 0.5338695645332336, "step": 582 }, { "epoch": 0.00583, "grad_norm": 0.3756638765335083, "grad_norm_var": 0.009006289093083627, "learning_rate": 5e-05, "loss": 0.1808, "loss/crossentropy": 2.697905421257019, "loss/hidden": 0.0, "loss/logits": 0.18083222210407257, "loss/reg": 0.5335188508033752, "step": 583 }, { "epoch": 0.00584, "grad_norm": 0.3705867826938629, "grad_norm_var": 0.0004720043535742846, "learning_rate": 5e-05, "loss": 0.1709, "loss/crossentropy": 2.8062366247177124, "loss/hidden": 0.0, "loss/logits": 0.17092613503336906, "loss/reg": 0.5328809022903442, "step": 584 }, { "epoch": 0.00585, "grad_norm": 0.4000037908554077, "grad_norm_var": 0.00047126837341423535, "learning_rate": 5e-05, "loss": 0.173, "loss/crossentropy": 2.7631296515464783, "loss/hidden": 0.0, "loss/logits": 0.1729564294219017, "loss/reg": 0.5326610803604126, "step": 585 }, { "epoch": 0.00586, "grad_norm": 0.37959474325180054, "grad_norm_var": 0.00043927783173227655, "learning_rate": 5e-05, "loss": 0.1847, "loss/crossentropy": 2.6262499690055847, "loss/hidden": 0.0, "loss/logits": 0.1847260780632496, "loss/reg": 0.5325880646705627, "step": 586 }, { "epoch": 0.00587, "grad_norm": 0.3690916895866394, "grad_norm_var": 0.0005005591271069698, "learning_rate": 5e-05, "loss": 0.1808, "loss/crossentropy": 2.7358859181404114, "loss/hidden": 0.0, "loss/logits": 0.18084875866770744, "loss/reg": 0.5325286984443665, "step": 587 }, { "epoch": 0.00588, "grad_norm": 0.40073201060295105, "grad_norm_var": 0.0004931862253077994, "learning_rate": 5e-05, "loss": 0.1812, "loss/crossentropy": 2.7996283769607544, "loss/hidden": 0.0, "loss/logits": 0.1812371350824833, "loss/reg": 0.5322884321212769, "step": 588 }, { "epoch": 0.00589, "grad_norm": 0.3917679786682129, "grad_norm_var": 0.0004962782838882777, "learning_rate": 5e-05, "loss": 0.1819, "loss/crossentropy": 2.7606716752052307, "loss/hidden": 0.0, "loss/logits": 0.18188262358307838, "loss/reg": 0.5318838357925415, "step": 589 }, { "epoch": 0.0059, "grad_norm": 0.37616971135139465, "grad_norm_var": 0.0003175098597832382, "learning_rate": 5e-05, "loss": 0.1707, "loss/crossentropy": 2.7626240253448486, "loss/hidden": 0.0, "loss/logits": 0.1707039400935173, "loss/reg": 0.5315775275230408, "step": 590 }, { "epoch": 0.00591, "grad_norm": 0.37914201617240906, "grad_norm_var": 0.0002171916153030732, "learning_rate": 5e-05, "loss": 0.1725, "loss/crossentropy": 2.970440983772278, "loss/hidden": 0.0, "loss/logits": 0.17250791192054749, "loss/reg": 0.5312857031822205, "step": 591 }, { "epoch": 0.00592, "grad_norm": 0.7160173654556274, "grad_norm_var": 0.006849122844446109, "learning_rate": 5e-05, "loss": 0.2227, "loss/crossentropy": 2.9245062470436096, "loss/hidden": 0.0, "loss/logits": 0.22272298485040665, "loss/reg": 0.5312067270278931, "step": 592 }, { "epoch": 0.00593, "grad_norm": 0.4377254247665405, "grad_norm_var": 0.006862788302224128, "learning_rate": 5e-05, "loss": 0.1839, "loss/crossentropy": 2.9227344393730164, "loss/hidden": 0.0, "loss/logits": 0.18386131152510643, "loss/reg": 0.5308799147605896, "step": 593 }, { "epoch": 0.00594, "grad_norm": 0.45590725541114807, "grad_norm_var": 0.006973611243057246, "learning_rate": 5e-05, "loss": 0.1804, "loss/crossentropy": 2.737419009208679, "loss/hidden": 0.0, "loss/logits": 0.18036724999547005, "loss/reg": 0.5307196974754333, "step": 594 }, { "epoch": 0.00595, "grad_norm": 0.43346816301345825, "grad_norm_var": 0.006975506244857602, "learning_rate": 5e-05, "loss": 0.1783, "loss/crossentropy": 2.69352525472641, "loss/hidden": 0.0, "loss/logits": 0.17834031954407692, "loss/reg": 0.5304762721061707, "step": 595 }, { "epoch": 0.00596, "grad_norm": 0.49834227561950684, "grad_norm_var": 0.007369226603172315, "learning_rate": 5e-05, "loss": 0.1946, "loss/crossentropy": 2.7087314128875732, "loss/hidden": 0.0, "loss/logits": 0.19464265927672386, "loss/reg": 0.5300901532173157, "step": 596 }, { "epoch": 0.00597, "grad_norm": 0.36550673842430115, "grad_norm_var": 0.007534725485214718, "learning_rate": 5e-05, "loss": 0.176, "loss/crossentropy": 2.77804034948349, "loss/hidden": 0.0, "loss/logits": 0.17604774609208107, "loss/reg": 0.52995365858078, "step": 597 }, { "epoch": 0.00598, "grad_norm": 0.3578112721443176, "grad_norm_var": 0.007744410721280144, "learning_rate": 5e-05, "loss": 0.1776, "loss/crossentropy": 2.7086586356163025, "loss/hidden": 0.0, "loss/logits": 0.17759665846824646, "loss/reg": 0.5296736359596252, "step": 598 }, { "epoch": 0.00599, "grad_norm": 0.3988676369190216, "grad_norm_var": 0.007643304035022599, "learning_rate": 5e-05, "loss": 0.1836, "loss/crossentropy": 2.874056398868561, "loss/hidden": 0.0, "loss/logits": 0.18355432152748108, "loss/reg": 0.5293606519699097, "step": 599 }, { "epoch": 0.006, "grad_norm": 0.5932928919792175, "grad_norm_var": 0.009255973698311222, "learning_rate": 5e-05, "loss": 0.1853, "loss/crossentropy": 2.7041032314300537, "loss/hidden": 0.0, "loss/logits": 0.18525120243430138, "loss/reg": 0.5293173789978027, "step": 600 }, { "epoch": 0.00601, "grad_norm": 0.40957698225975037, "grad_norm_var": 0.009217554775980649, "learning_rate": 5e-05, "loss": 0.1773, "loss/crossentropy": 2.801423966884613, "loss/hidden": 0.0, "loss/logits": 0.17732501029968262, "loss/reg": 0.5291053652763367, "step": 601 }, { "epoch": 0.00602, "grad_norm": 0.4099724292755127, "grad_norm_var": 0.009050055855905337, "learning_rate": 5e-05, "loss": 0.1783, "loss/crossentropy": 2.62606680393219, "loss/hidden": 0.0, "loss/logits": 0.17833668738603592, "loss/reg": 0.5285364389419556, "step": 602 }, { "epoch": 0.00603, "grad_norm": 0.4101642966270447, "grad_norm_var": 0.008783124895312754, "learning_rate": 5e-05, "loss": 0.1873, "loss/crossentropy": 2.65740829706192, "loss/hidden": 0.0, "loss/logits": 0.18734794855117798, "loss/reg": 0.5283911824226379, "step": 603 }, { "epoch": 0.00604, "grad_norm": 0.4322654604911804, "grad_norm_var": 0.008681626234312019, "learning_rate": 5e-05, "loss": 0.2062, "loss/crossentropy": 2.877937972545624, "loss/hidden": 0.0, "loss/logits": 0.20618272945284843, "loss/reg": 0.527982771396637, "step": 604 }, { "epoch": 0.00605, "grad_norm": 0.3815467357635498, "grad_norm_var": 0.008756102431631147, "learning_rate": 5e-05, "loss": 0.1697, "loss/crossentropy": 2.7979888916015625, "loss/hidden": 0.0, "loss/logits": 0.16965517029166222, "loss/reg": 0.5275786519050598, "step": 605 }, { "epoch": 0.00606, "grad_norm": 0.4141106605529785, "grad_norm_var": 0.008518179748961604, "learning_rate": 5e-05, "loss": 0.1787, "loss/crossentropy": 2.827108085155487, "loss/hidden": 0.0, "loss/logits": 0.1786588616669178, "loss/reg": 0.5275526642799377, "step": 606 }, { "epoch": 0.00607, "grad_norm": 0.3933965265750885, "grad_norm_var": 0.00840883143751778, "learning_rate": 5e-05, "loss": 0.1721, "loss/crossentropy": 2.7290459275245667, "loss/hidden": 0.0, "loss/logits": 0.17209718376398087, "loss/reg": 0.5274314284324646, "step": 607 }, { "epoch": 0.00608, "grad_norm": 0.48277348279953003, "grad_norm_var": 0.003357203138046864, "learning_rate": 5e-05, "loss": 0.1855, "loss/crossentropy": 2.811889350414276, "loss/hidden": 0.0, "loss/logits": 0.18549248203635216, "loss/reg": 0.5269420742988586, "step": 608 }, { "epoch": 0.00609, "grad_norm": 0.4523940682411194, "grad_norm_var": 0.003386405154396614, "learning_rate": 5e-05, "loss": 0.2047, "loss/crossentropy": 2.755116641521454, "loss/hidden": 0.0, "loss/logits": 0.20469101145863533, "loss/reg": 0.526553750038147, "step": 609 }, { "epoch": 0.0061, "grad_norm": 0.38295847177505493, "grad_norm_var": 0.0034727258554186234, "learning_rate": 5e-05, "loss": 0.1705, "loss/crossentropy": 2.636898934841156, "loss/hidden": 0.0, "loss/logits": 0.17049512267112732, "loss/reg": 0.5265843868255615, "step": 610 }, { "epoch": 0.00611, "grad_norm": 0.5146827101707458, "grad_norm_var": 0.003965530055703681, "learning_rate": 5e-05, "loss": 0.2114, "loss/crossentropy": 2.779209792613983, "loss/hidden": 0.0, "loss/logits": 0.21136777475476265, "loss/reg": 0.526534378528595, "step": 611 }, { "epoch": 0.00612, "grad_norm": 0.3972010910511017, "grad_norm_var": 0.003698133930842806, "learning_rate": 5e-05, "loss": 0.1681, "loss/crossentropy": 2.675155997276306, "loss/hidden": 0.0, "loss/logits": 0.1681441329419613, "loss/reg": 0.5264838933944702, "step": 612 }, { "epoch": 0.00613, "grad_norm": 0.4252116084098816, "grad_norm_var": 0.003449051623889865, "learning_rate": 5e-05, "loss": 0.1822, "loss/crossentropy": 2.799851715564728, "loss/hidden": 0.0, "loss/logits": 0.18222320079803467, "loss/reg": 0.5266112089157104, "step": 613 }, { "epoch": 0.00614, "grad_norm": 0.42539313435554504, "grad_norm_var": 0.0030974108024641846, "learning_rate": 5e-05, "loss": 0.1711, "loss/crossentropy": 2.7043932676315308, "loss/hidden": 0.0, "loss/logits": 0.17109937220811844, "loss/reg": 0.5261294841766357, "step": 614 }, { "epoch": 0.00615, "grad_norm": 0.43782854080200195, "grad_norm_var": 0.003016333415567359, "learning_rate": 5e-05, "loss": 0.1752, "loss/crossentropy": 2.900669753551483, "loss/hidden": 0.0, "loss/logits": 0.17520315572619438, "loss/reg": 0.5259124040603638, "step": 615 }, { "epoch": 0.00616, "grad_norm": 0.4858180284500122, "grad_norm_var": 0.001472407111076753, "learning_rate": 5e-05, "loss": 0.1839, "loss/crossentropy": 2.826725125312805, "loss/hidden": 0.0, "loss/logits": 0.18385881185531616, "loss/reg": 0.5255004167556763, "step": 616 }, { "epoch": 0.00617, "grad_norm": 0.4063803255558014, "grad_norm_var": 0.0014810923590587628, "learning_rate": 5e-05, "loss": 0.1846, "loss/crossentropy": 2.765011191368103, "loss/hidden": 0.0, "loss/logits": 0.18457970023155212, "loss/reg": 0.5255771279335022, "step": 617 }, { "epoch": 0.00618, "grad_norm": 0.36755988001823425, "grad_norm_var": 0.0016969131477061656, "learning_rate": 5e-05, "loss": 0.1736, "loss/crossentropy": 2.8099457025527954, "loss/hidden": 0.0, "loss/logits": 0.17360728234052658, "loss/reg": 0.5253624320030212, "step": 618 }, { "epoch": 0.00619, "grad_norm": 0.4963686168193817, "grad_norm_var": 0.001983884546546107, "learning_rate": 5e-05, "loss": 0.1856, "loss/crossentropy": 2.847084701061249, "loss/hidden": 0.0, "loss/logits": 0.18559687584638596, "loss/reg": 0.5250545144081116, "step": 619 }, { "epoch": 0.0062, "grad_norm": 0.4522532820701599, "grad_norm_var": 0.002012245048029356, "learning_rate": 5e-05, "loss": 0.174, "loss/crossentropy": 2.874119997024536, "loss/hidden": 0.0, "loss/logits": 0.17398691177368164, "loss/reg": 0.5245436429977417, "step": 620 }, { "epoch": 0.00621, "grad_norm": 0.3922837972640991, "grad_norm_var": 0.0019468741106489626, "learning_rate": 5e-05, "loss": 0.1846, "loss/crossentropy": 2.733251214027405, "loss/hidden": 0.0, "loss/logits": 0.18459589779376984, "loss/reg": 0.5241463780403137, "step": 621 }, { "epoch": 0.00622, "grad_norm": 0.37801602482795715, "grad_norm_var": 0.002118790882621424, "learning_rate": 5e-05, "loss": 0.1772, "loss/crossentropy": 2.7481032013893127, "loss/hidden": 0.0, "loss/logits": 0.17722558602690697, "loss/reg": 0.5237584710121155, "step": 622 }, { "epoch": 0.00623, "grad_norm": 0.370016872882843, "grad_norm_var": 0.0022691069694934286, "learning_rate": 5e-05, "loss": 0.173, "loss/crossentropy": 2.8038156032562256, "loss/hidden": 0.0, "loss/logits": 0.17304755002260208, "loss/reg": 0.523026168346405, "step": 623 }, { "epoch": 0.00624, "grad_norm": 0.47145068645477295, "grad_norm_var": 0.0021962339412960664, "learning_rate": 5e-05, "loss": 0.1879, "loss/crossentropy": 2.8937525153160095, "loss/hidden": 0.0, "loss/logits": 0.18792713806033134, "loss/reg": 0.5222709774971008, "step": 624 }, { "epoch": 0.00625, "grad_norm": 0.40490373969078064, "grad_norm_var": 0.002185821493403876, "learning_rate": 5e-05, "loss": 0.1802, "loss/crossentropy": 2.6153727173805237, "loss/hidden": 0.0, "loss/logits": 0.18022970482707024, "loss/reg": 0.521355152130127, "step": 625 }, { "epoch": 0.00626, "grad_norm": 0.3764815628528595, "grad_norm_var": 0.0022251993767452277, "learning_rate": 5e-05, "loss": 0.1745, "loss/crossentropy": 2.726262867450714, "loss/hidden": 0.0, "loss/logits": 0.17447687685489655, "loss/reg": 0.5206652879714966, "step": 626 }, { "epoch": 0.00627, "grad_norm": 0.3648182153701782, "grad_norm_var": 0.001839186216416048, "learning_rate": 5e-05, "loss": 0.1732, "loss/crossentropy": 2.780579686164856, "loss/hidden": 0.0, "loss/logits": 0.173160370439291, "loss/reg": 0.5202951431274414, "step": 627 }, { "epoch": 0.00628, "grad_norm": 0.4027639627456665, "grad_norm_var": 0.0018273629625155884, "learning_rate": 5e-05, "loss": 0.1881, "loss/crossentropy": 2.6956580877304077, "loss/hidden": 0.0, "loss/logits": 0.18811288848519325, "loss/reg": 0.5197395086288452, "step": 628 }, { "epoch": 0.00629, "grad_norm": 0.36433014273643494, "grad_norm_var": 0.001985032532502847, "learning_rate": 5e-05, "loss": 0.1691, "loss/crossentropy": 2.882453143596649, "loss/hidden": 0.0, "loss/logits": 0.1691296584904194, "loss/reg": 0.5192205905914307, "step": 629 }, { "epoch": 0.0063, "grad_norm": 0.4026321768760681, "grad_norm_var": 0.001977651124460685, "learning_rate": 5e-05, "loss": 0.1864, "loss/crossentropy": 2.9053996205329895, "loss/hidden": 0.0, "loss/logits": 0.1864301711320877, "loss/reg": 0.5185652375221252, "step": 630 }, { "epoch": 0.00631, "grad_norm": 0.4006529450416565, "grad_norm_var": 0.0019303966580059358, "learning_rate": 5e-05, "loss": 0.1772, "loss/crossentropy": 2.764085590839386, "loss/hidden": 0.0, "loss/logits": 0.1772325150668621, "loss/reg": 0.5183531045913696, "step": 631 }, { "epoch": 0.00632, "grad_norm": 0.38397860527038574, "grad_norm_var": 0.0015293508694572725, "learning_rate": 5e-05, "loss": 0.167, "loss/crossentropy": 2.7896572947502136, "loss/hidden": 0.0, "loss/logits": 0.16699952259659767, "loss/reg": 0.5179742574691772, "step": 632 }, { "epoch": 0.00633, "grad_norm": 0.3865090310573578, "grad_norm_var": 0.0015429031479982868, "learning_rate": 5e-05, "loss": 0.1689, "loss/crossentropy": 2.8101632595062256, "loss/hidden": 0.0, "loss/logits": 0.16890694573521614, "loss/reg": 0.5176936984062195, "step": 633 }, { "epoch": 0.00634, "grad_norm": 0.40509217977523804, "grad_norm_var": 0.0014639072860746912, "learning_rate": 5e-05, "loss": 0.1756, "loss/crossentropy": 2.7948710918426514, "loss/hidden": 0.0, "loss/logits": 0.17558567970991135, "loss/reg": 0.517310619354248, "step": 634 }, { "epoch": 0.00635, "grad_norm": 0.41124147176742554, "grad_norm_var": 0.0008602902517445406, "learning_rate": 5e-05, "loss": 0.1845, "loss/crossentropy": 2.8675787448883057, "loss/hidden": 0.0, "loss/logits": 0.18453383073210716, "loss/reg": 0.5169790387153625, "step": 635 }, { "epoch": 0.00636, "grad_norm": 0.4257376194000244, "grad_norm_var": 0.0007122974232836143, "learning_rate": 5e-05, "loss": 0.1952, "loss/crossentropy": 2.876689314842224, "loss/hidden": 0.0, "loss/logits": 0.19524602219462395, "loss/reg": 0.5165721774101257, "step": 636 }, { "epoch": 0.00637, "grad_norm": 0.4066680669784546, "grad_norm_var": 0.0007175133686616264, "learning_rate": 5e-05, "loss": 0.1799, "loss/crossentropy": 2.8337889313697815, "loss/hidden": 0.0, "loss/logits": 0.1798693686723709, "loss/reg": 0.5163912773132324, "step": 637 }, { "epoch": 0.00638, "grad_norm": 0.40588733553886414, "grad_norm_var": 0.0006947513160394399, "learning_rate": 5e-05, "loss": 0.1708, "loss/crossentropy": 2.655221939086914, "loss/hidden": 0.0, "loss/logits": 0.17079980671405792, "loss/reg": 0.5159400701522827, "step": 638 }, { "epoch": 0.00639, "grad_norm": 0.4100181460380554, "grad_norm_var": 0.0006404545546728498, "learning_rate": 5e-05, "loss": 0.1822, "loss/crossentropy": 2.772250473499298, "loss/hidden": 0.0, "loss/logits": 0.182156041264534, "loss/reg": 0.515633225440979, "step": 639 }, { "epoch": 0.0064, "grad_norm": 0.4184816777706146, "grad_norm_var": 0.00032141447723561535, "learning_rate": 5e-05, "loss": 0.1871, "loss/crossentropy": 2.6707375049591064, "loss/hidden": 0.0, "loss/logits": 0.1871240958571434, "loss/reg": 0.5148553848266602, "step": 640 }, { "epoch": 0.00641, "grad_norm": 0.37024372816085815, "grad_norm_var": 0.0003652267912876798, "learning_rate": 5e-05, "loss": 0.1797, "loss/crossentropy": 2.794716000556946, "loss/hidden": 0.0, "loss/logits": 0.17974266782402992, "loss/reg": 0.5144128799438477, "step": 641 }, { "epoch": 0.00642, "grad_norm": 0.46003326773643494, "grad_norm_var": 0.000584414889056121, "learning_rate": 5e-05, "loss": 0.1874, "loss/crossentropy": 2.736919343471527, "loss/hidden": 0.0, "loss/logits": 0.18742602318525314, "loss/reg": 0.5135025382041931, "step": 642 }, { "epoch": 0.00643, "grad_norm": 0.3949570655822754, "grad_norm_var": 0.0004950140805240033, "learning_rate": 5e-05, "loss": 0.188, "loss/crossentropy": 2.776914596557617, "loss/hidden": 0.0, "loss/logits": 0.1879568062722683, "loss/reg": 0.5124860405921936, "step": 643 }, { "epoch": 0.00644, "grad_norm": 0.3681175410747528, "grad_norm_var": 0.0005714822500744621, "learning_rate": 5e-05, "loss": 0.1693, "loss/crossentropy": 2.8671674132347107, "loss/hidden": 0.0, "loss/logits": 0.169349767267704, "loss/reg": 0.5119767785072327, "step": 644 }, { "epoch": 0.00645, "grad_norm": 0.4091375470161438, "grad_norm_var": 0.00047841608215064925, "learning_rate": 5e-05, "loss": 0.1848, "loss/crossentropy": 2.6508522033691406, "loss/hidden": 0.0, "loss/logits": 0.18475625663995743, "loss/reg": 0.5114136338233948, "step": 645 }, { "epoch": 0.00646, "grad_norm": 0.3707975447177887, "grad_norm_var": 0.0005463388025109002, "learning_rate": 5e-05, "loss": 0.1713, "loss/crossentropy": 2.7380632758140564, "loss/hidden": 0.0, "loss/logits": 0.17127932608127594, "loss/reg": 0.5108417868614197, "step": 646 }, { "epoch": 0.00647, "grad_norm": 0.37662187218666077, "grad_norm_var": 0.0005858578414773737, "learning_rate": 5e-05, "loss": 0.1865, "loss/crossentropy": 2.802596867084503, "loss/hidden": 0.0, "loss/logits": 0.18647762387990952, "loss/reg": 0.5103307366371155, "step": 647 }, { "epoch": 0.00648, "grad_norm": 0.3773002326488495, "grad_norm_var": 0.0006031076778356089, "learning_rate": 5e-05, "loss": 0.1794, "loss/crossentropy": 2.769145131111145, "loss/hidden": 0.0, "loss/logits": 0.17941123619675636, "loss/reg": 0.5098391175270081, "step": 648 }, { "epoch": 0.00649, "grad_norm": 0.4668806791305542, "grad_norm_var": 0.0008643741585044477, "learning_rate": 5e-05, "loss": 0.1863, "loss/crossentropy": 2.744249641895294, "loss/hidden": 0.0, "loss/logits": 0.18628650158643723, "loss/reg": 0.5096877813339233, "step": 649 }, { "epoch": 0.0065, "grad_norm": 0.39535969495773315, "grad_norm_var": 0.0008699488232316181, "learning_rate": 5e-05, "loss": 0.1804, "loss/crossentropy": 2.7585464119911194, "loss/hidden": 0.0, "loss/logits": 0.18037841096520424, "loss/reg": 0.5094480514526367, "step": 650 }, { "epoch": 0.00651, "grad_norm": 0.4207727611064911, "grad_norm_var": 0.0008845527225496141, "learning_rate": 5e-05, "loss": 0.1941, "loss/crossentropy": 2.793500542640686, "loss/hidden": 0.0, "loss/logits": 0.1941496804356575, "loss/reg": 0.5095275640487671, "step": 651 }, { "epoch": 0.00652, "grad_norm": 0.5624954700469971, "grad_norm_var": 0.0024350118160794427, "learning_rate": 5e-05, "loss": 0.2061, "loss/crossentropy": 2.542850375175476, "loss/hidden": 0.0, "loss/logits": 0.20611152052879333, "loss/reg": 0.509577751159668, "step": 652 }, { "epoch": 0.00653, "grad_norm": 0.40899237990379333, "grad_norm_var": 0.002433275337154047, "learning_rate": 5e-05, "loss": 0.1979, "loss/crossentropy": 2.88733571767807, "loss/hidden": 0.0, "loss/logits": 0.19786739349365234, "loss/reg": 0.509839653968811, "step": 653 }, { "epoch": 0.00654, "grad_norm": 0.39847955107688904, "grad_norm_var": 0.0024442300897151528, "learning_rate": 5e-05, "loss": 0.1772, "loss/crossentropy": 2.7209237217903137, "loss/hidden": 0.0, "loss/logits": 0.17720161378383636, "loss/reg": 0.509778618812561, "step": 654 }, { "epoch": 0.00655, "grad_norm": 0.5470329523086548, "grad_norm_var": 0.0035622848666768867, "learning_rate": 5e-05, "loss": 0.1826, "loss/crossentropy": 2.8853079676628113, "loss/hidden": 0.0, "loss/logits": 0.1826171614229679, "loss/reg": 0.5098048448562622, "step": 655 }, { "epoch": 0.00656, "grad_norm": 0.38805046677589417, "grad_norm_var": 0.003632842470516299, "learning_rate": 5e-05, "loss": 0.1809, "loss/crossentropy": 2.8198031187057495, "loss/hidden": 0.0, "loss/logits": 0.18087942153215408, "loss/reg": 0.5097930431365967, "step": 656 }, { "epoch": 0.00657, "grad_norm": 0.3979627192020416, "grad_norm_var": 0.003498063341403901, "learning_rate": 5e-05, "loss": 0.1606, "loss/crossentropy": 2.61093008518219, "loss/hidden": 0.0, "loss/logits": 0.16060923412442207, "loss/reg": 0.5096233487129211, "step": 657 }, { "epoch": 0.00658, "grad_norm": 0.4784572720527649, "grad_norm_var": 0.0036140916668723324, "learning_rate": 5e-05, "loss": 0.1771, "loss/crossentropy": 2.848282754421234, "loss/hidden": 0.0, "loss/logits": 0.17712492123246193, "loss/reg": 0.5099442005157471, "step": 658 }, { "epoch": 0.00659, "grad_norm": 0.42795342206954956, "grad_norm_var": 0.0035605743189723984, "learning_rate": 5e-05, "loss": 0.1694, "loss/crossentropy": 2.7662516832351685, "loss/hidden": 0.0, "loss/logits": 0.169414434581995, "loss/reg": 0.5102131366729736, "step": 659 }, { "epoch": 0.0066, "grad_norm": 0.39331135153770447, "grad_norm_var": 0.003410339875276025, "learning_rate": 5e-05, "loss": 0.1828, "loss/crossentropy": 2.785003662109375, "loss/hidden": 0.0, "loss/logits": 0.18282215297222137, "loss/reg": 0.5104846954345703, "step": 660 }, { "epoch": 0.00661, "grad_norm": 0.39343392848968506, "grad_norm_var": 0.003461531355837233, "learning_rate": 5e-05, "loss": 0.1864, "loss/crossentropy": 2.8266950249671936, "loss/hidden": 0.0, "loss/logits": 0.18642331287264824, "loss/reg": 0.5110275745391846, "step": 661 }, { "epoch": 0.00662, "grad_norm": 0.49109530448913574, "grad_norm_var": 0.0034927001199977306, "learning_rate": 5e-05, "loss": 0.1793, "loss/crossentropy": 2.8089317679405212, "loss/hidden": 0.0, "loss/logits": 0.1793442964553833, "loss/reg": 0.5115256905555725, "step": 662 }, { "epoch": 0.00663, "grad_norm": 0.44122421741485596, "grad_norm_var": 0.003269966030933639, "learning_rate": 5e-05, "loss": 0.1893, "loss/crossentropy": 2.8747792840003967, "loss/hidden": 0.0, "loss/logits": 0.18931875750422478, "loss/reg": 0.5118873119354248, "step": 663 }, { "epoch": 0.00664, "grad_norm": 0.3680807054042816, "grad_norm_var": 0.003348419992552846, "learning_rate": 5e-05, "loss": 0.1759, "loss/crossentropy": 2.9338263273239136, "loss/hidden": 0.0, "loss/logits": 0.1759457141160965, "loss/reg": 0.511714518070221, "step": 664 }, { "epoch": 0.00665, "grad_norm": 0.42077165842056274, "grad_norm_var": 0.0032928239121887953, "learning_rate": 5e-05, "loss": 0.1928, "loss/crossentropy": 2.7103938460350037, "loss/hidden": 0.0, "loss/logits": 0.19282248243689537, "loss/reg": 0.5116373300552368, "step": 665 }, { "epoch": 0.00666, "grad_norm": 0.37517157196998596, "grad_norm_var": 0.0034205356030741524, "learning_rate": 5e-05, "loss": 0.1624, "loss/crossentropy": 2.8938183784484863, "loss/hidden": 0.0, "loss/logits": 0.16238898038864136, "loss/reg": 0.5118199586868286, "step": 666 }, { "epoch": 0.00667, "grad_norm": 0.42928510904312134, "grad_norm_var": 0.003412230463772152, "learning_rate": 5e-05, "loss": 0.1847, "loss/crossentropy": 2.7335448265075684, "loss/hidden": 0.0, "loss/logits": 0.18473324179649353, "loss/reg": 0.5114844441413879, "step": 667 }, { "epoch": 0.00668, "grad_norm": 0.39981576800346375, "grad_norm_var": 0.002249027653200253, "learning_rate": 5e-05, "loss": 0.1802, "loss/crossentropy": 2.6496554017066956, "loss/hidden": 0.0, "loss/logits": 0.18023861572146416, "loss/reg": 0.5116505026817322, "step": 668 }, { "epoch": 0.00669, "grad_norm": 0.38968613743782043, "grad_norm_var": 0.002306952352372349, "learning_rate": 5e-05, "loss": 0.1755, "loss/crossentropy": 2.8335792422294617, "loss/hidden": 0.0, "loss/logits": 0.17547546327114105, "loss/reg": 0.5115586519241333, "step": 669 }, { "epoch": 0.0067, "grad_norm": 0.4208097755908966, "grad_norm_var": 0.002270356345835826, "learning_rate": 5e-05, "loss": 0.1718, "loss/crossentropy": 2.7906799912452698, "loss/hidden": 0.0, "loss/logits": 0.17182449251413345, "loss/reg": 0.511476457118988, "step": 670 }, { "epoch": 0.00671, "grad_norm": 0.5996054410934448, "grad_norm_var": 0.0033150937286433517, "learning_rate": 5e-05, "loss": 0.1787, "loss/crossentropy": 3.0074755549430847, "loss/hidden": 0.0, "loss/logits": 0.17866749316453934, "loss/reg": 0.5113058686256409, "step": 671 }, { "epoch": 0.00672, "grad_norm": 0.6499840617179871, "grad_norm_var": 0.006280606751953022, "learning_rate": 5e-05, "loss": 0.175, "loss/crossentropy": 2.9014773964881897, "loss/hidden": 0.0, "loss/logits": 0.1749504879117012, "loss/reg": 0.5113338828086853, "step": 672 }, { "epoch": 0.00673, "grad_norm": 0.5914822220802307, "grad_norm_var": 0.007477446396842247, "learning_rate": 5e-05, "loss": 0.184, "loss/crossentropy": 2.865480661392212, "loss/hidden": 0.0, "loss/logits": 0.18399883806705475, "loss/reg": 0.5112448930740356, "step": 673 }, { "epoch": 0.00674, "grad_norm": 0.6226277351379395, "grad_norm_var": 0.009239241748926697, "learning_rate": 5e-05, "loss": 0.186, "loss/crossentropy": 2.8043219447135925, "loss/hidden": 0.0, "loss/logits": 0.18601788952946663, "loss/reg": 0.5112152695655823, "step": 674 }, { "epoch": 0.00675, "grad_norm": 0.6738488078117371, "grad_norm_var": 0.011856248423391695, "learning_rate": 5e-05, "loss": 0.1909, "loss/crossentropy": 2.7278326749801636, "loss/hidden": 0.0, "loss/logits": 0.19090308994054794, "loss/reg": 0.5112627744674683, "step": 675 }, { "epoch": 0.00676, "grad_norm": 0.5430609583854675, "grad_norm_var": 0.011551595178041855, "learning_rate": 5e-05, "loss": 0.1679, "loss/crossentropy": 2.7318336367607117, "loss/hidden": 0.0, "loss/logits": 0.1679130382835865, "loss/reg": 0.5111681818962097, "step": 676 }, { "epoch": 0.00677, "grad_norm": 0.5723696947097778, "grad_norm_var": 0.011293596196170604, "learning_rate": 5e-05, "loss": 0.1938, "loss/crossentropy": 2.7385933995246887, "loss/hidden": 0.0, "loss/logits": 0.19382726401090622, "loss/reg": 0.5109630823135376, "step": 677 }, { "epoch": 0.00678, "grad_norm": 0.4179444909095764, "grad_norm_var": 0.011708132955936887, "learning_rate": 5e-05, "loss": 0.1677, "loss/crossentropy": 2.756455659866333, "loss/hidden": 0.0, "loss/logits": 0.16765116900205612, "loss/reg": 0.5110304951667786, "step": 678 }, { "epoch": 0.00679, "grad_norm": 0.4529385566711426, "grad_norm_var": 0.01163312962118352, "learning_rate": 5e-05, "loss": 0.1932, "loss/crossentropy": 2.847602903842926, "loss/hidden": 0.0, "loss/logits": 0.19322841987013817, "loss/reg": 0.5106305480003357, "step": 679 }, { "epoch": 0.0068, "grad_norm": 0.4780008792877197, "grad_norm_var": 0.010521296127292364, "learning_rate": 5e-05, "loss": 0.1775, "loss/crossentropy": 2.854358732700348, "loss/hidden": 0.0, "loss/logits": 0.177520539611578, "loss/reg": 0.5106386542320251, "step": 680 }, { "epoch": 0.00681, "grad_norm": 0.4141899347305298, "grad_norm_var": 0.010595582905107278, "learning_rate": 5e-05, "loss": 0.1741, "loss/crossentropy": 2.7946380376815796, "loss/hidden": 0.0, "loss/logits": 0.17412305623292923, "loss/reg": 0.5103532075881958, "step": 681 }, { "epoch": 0.00682, "grad_norm": 0.43358147144317627, "grad_norm_var": 0.009821650886730812, "learning_rate": 5e-05, "loss": 0.1744, "loss/crossentropy": 2.8096965551376343, "loss/hidden": 0.0, "loss/logits": 0.174409382045269, "loss/reg": 0.5101112723350525, "step": 682 }, { "epoch": 0.00683, "grad_norm": 0.3993518352508545, "grad_norm_var": 0.010182139511289986, "learning_rate": 5e-05, "loss": 0.1819, "loss/crossentropy": 2.9657493233680725, "loss/hidden": 0.0, "loss/logits": 0.18188364803791046, "loss/reg": 0.5098157525062561, "step": 683 }, { "epoch": 0.00684, "grad_norm": 0.39248356223106384, "grad_norm_var": 0.010287065638214528, "learning_rate": 5e-05, "loss": 0.1716, "loss/crossentropy": 2.9617007970809937, "loss/hidden": 0.0, "loss/logits": 0.17156171798706055, "loss/reg": 0.5095759034156799, "step": 684 }, { "epoch": 0.00685, "grad_norm": 0.8029955625534058, "grad_norm_var": 0.01470545816487907, "learning_rate": 5e-05, "loss": 0.2151, "loss/crossentropy": 2.9013417959213257, "loss/hidden": 0.0, "loss/logits": 0.21508875116705894, "loss/reg": 0.5091412663459778, "step": 685 }, { "epoch": 0.00686, "grad_norm": 0.4522547125816345, "grad_norm_var": 0.014313318430436702, "learning_rate": 5e-05, "loss": 0.1982, "loss/crossentropy": 2.7179940938949585, "loss/hidden": 0.0, "loss/logits": 0.1982102207839489, "loss/reg": 0.5087611675262451, "step": 686 }, { "epoch": 0.00687, "grad_norm": 0.40405595302581787, "grad_norm_var": 0.01491569889920283, "learning_rate": 5e-05, "loss": 0.1757, "loss/crossentropy": 2.7349427938461304, "loss/hidden": 0.0, "loss/logits": 0.17570801451802254, "loss/reg": 0.5087314248085022, "step": 687 }, { "epoch": 0.00688, "grad_norm": 0.4721728265285492, "grad_norm_var": 0.0137821666701137, "learning_rate": 5e-05, "loss": 0.1803, "loss/crossentropy": 2.788485586643219, "loss/hidden": 0.0, "loss/logits": 0.18031561002135277, "loss/reg": 0.5086828470230103, "step": 688 }, { "epoch": 0.00689, "grad_norm": 0.5174816846847534, "grad_norm_var": 0.013297862556061786, "learning_rate": 5e-05, "loss": 0.1863, "loss/crossentropy": 2.7682158946990967, "loss/hidden": 0.0, "loss/logits": 0.18630298599600792, "loss/reg": 0.5083693861961365, "step": 689 }, { "epoch": 0.0069, "grad_norm": 0.4024963080883026, "grad_norm_var": 0.01281779371501746, "learning_rate": 5e-05, "loss": 0.1737, "loss/crossentropy": 2.7792950868606567, "loss/hidden": 0.0, "loss/logits": 0.17373281717300415, "loss/reg": 0.508225679397583, "step": 690 }, { "epoch": 0.00691, "grad_norm": 0.46942126750946045, "grad_norm_var": 0.010400187399223108, "learning_rate": 5e-05, "loss": 0.2003, "loss/crossentropy": 2.643202543258667, "loss/hidden": 0.0, "loss/logits": 0.2002827487885952, "loss/reg": 0.5080039501190186, "step": 691 }, { "epoch": 0.00692, "grad_norm": 0.37246689200401306, "grad_norm_var": 0.010706232958532938, "learning_rate": 5e-05, "loss": 0.1831, "loss/crossentropy": 2.829901933670044, "loss/hidden": 0.0, "loss/logits": 0.18309412151575089, "loss/reg": 0.5075996518135071, "step": 692 }, { "epoch": 0.00693, "grad_norm": 0.35888025164604187, "grad_norm_var": 0.010523808613100387, "learning_rate": 5e-05, "loss": 0.1688, "loss/crossentropy": 2.7274303436279297, "loss/hidden": 0.0, "loss/logits": 0.16883475333452225, "loss/reg": 0.5074198246002197, "step": 693 }, { "epoch": 0.00694, "grad_norm": 0.4098101556301117, "grad_norm_var": 0.010565470770167333, "learning_rate": 5e-05, "loss": 0.1999, "loss/crossentropy": 2.5043751001358032, "loss/hidden": 0.0, "loss/logits": 0.19988223165273666, "loss/reg": 0.5070257782936096, "step": 694 }, { "epoch": 0.00695, "grad_norm": 0.3361075818538666, "grad_norm_var": 0.011404509218310018, "learning_rate": 5e-05, "loss": 0.1783, "loss/crossentropy": 2.73706591129303, "loss/hidden": 0.0, "loss/logits": 0.17832985147833824, "loss/reg": 0.5067664384841919, "step": 695 }, { "epoch": 0.00696, "grad_norm": 0.3633408546447754, "grad_norm_var": 0.011717614209581569, "learning_rate": 5e-05, "loss": 0.175, "loss/crossentropy": 2.77338969707489, "loss/hidden": 0.0, "loss/logits": 0.1749737299978733, "loss/reg": 0.5063682198524475, "step": 696 }, { "epoch": 0.00697, "grad_norm": 0.42576926946640015, "grad_norm_var": 0.011689900337107383, "learning_rate": 5e-05, "loss": 0.1781, "loss/crossentropy": 2.7477203011512756, "loss/hidden": 0.0, "loss/logits": 0.17808011174201965, "loss/reg": 0.5063145160675049, "step": 697 }, { "epoch": 0.00698, "grad_norm": 0.6859210133552551, "grad_norm_var": 0.015511119905048398, "learning_rate": 5e-05, "loss": 0.208, "loss/crossentropy": 2.7767802476882935, "loss/hidden": 0.0, "loss/logits": 0.20797424390912056, "loss/reg": 0.5058445930480957, "step": 698 }, { "epoch": 0.00699, "grad_norm": 0.41096484661102295, "grad_norm_var": 0.015434833764210683, "learning_rate": 5e-05, "loss": 0.1867, "loss/crossentropy": 2.955298900604248, "loss/hidden": 0.0, "loss/logits": 0.18670274317264557, "loss/reg": 0.5052971839904785, "step": 699 }, { "epoch": 0.007, "grad_norm": 0.42934897541999817, "grad_norm_var": 0.015213519891019293, "learning_rate": 5e-05, "loss": 0.1842, "loss/crossentropy": 2.9050281047821045, "loss/hidden": 0.0, "loss/logits": 0.18417277932167053, "loss/reg": 0.5051462650299072, "step": 700 }, { "epoch": 0.00701, "grad_norm": 0.4011780321598053, "grad_norm_var": 0.006772641603009537, "learning_rate": 5e-05, "loss": 0.1771, "loss/crossentropy": 2.769184708595276, "loss/hidden": 0.0, "loss/logits": 0.17708734795451164, "loss/reg": 0.5047007203102112, "step": 701 }, { "epoch": 0.00702, "grad_norm": 0.48211103677749634, "grad_norm_var": 0.006909066893630215, "learning_rate": 5e-05, "loss": 0.1735, "loss/crossentropy": 2.754790782928467, "loss/hidden": 0.0, "loss/logits": 0.1735387034714222, "loss/reg": 0.5046074986457825, "step": 702 }, { "epoch": 0.00703, "grad_norm": 0.41086065769195557, "grad_norm_var": 0.006884933077648355, "learning_rate": 5e-05, "loss": 0.1828, "loss/crossentropy": 2.80459064245224, "loss/hidden": 0.0, "loss/logits": 0.18284977227449417, "loss/reg": 0.5042373538017273, "step": 703 }, { "epoch": 0.00704, "grad_norm": 0.39511173963546753, "grad_norm_var": 0.006866646855033809, "learning_rate": 5e-05, "loss": 0.1742, "loss/crossentropy": 2.7684733867645264, "loss/hidden": 0.0, "loss/logits": 0.1741652488708496, "loss/reg": 0.5037651658058167, "step": 704 }, { "epoch": 0.00705, "grad_norm": 0.4088932275772095, "grad_norm_var": 0.006329113216238928, "learning_rate": 5e-05, "loss": 0.1797, "loss/crossentropy": 2.706860303878784, "loss/hidden": 0.0, "loss/logits": 0.1796705722808838, "loss/reg": 0.5035074353218079, "step": 705 }, { "epoch": 0.00706, "grad_norm": 0.394777774810791, "grad_norm_var": 0.006353595772177304, "learning_rate": 5e-05, "loss": 0.1749, "loss/crossentropy": 2.7835565209388733, "loss/hidden": 0.0, "loss/logits": 0.17491931095719337, "loss/reg": 0.5030983686447144, "step": 706 }, { "epoch": 0.00707, "grad_norm": 0.9192830324172974, "grad_norm_var": 0.02183536325629126, "learning_rate": 5e-05, "loss": 0.1903, "loss/crossentropy": 2.6520556211471558, "loss/hidden": 0.0, "loss/logits": 0.19029763340950012, "loss/reg": 0.502878725528717, "step": 707 }, { "epoch": 0.00708, "grad_norm": 0.4863012433052063, "grad_norm_var": 0.021463886256536356, "learning_rate": 5e-05, "loss": 0.1886, "loss/crossentropy": 2.6553282737731934, "loss/hidden": 0.0, "loss/logits": 0.1885831542313099, "loss/reg": 0.5025891065597534, "step": 708 }, { "epoch": 0.00709, "grad_norm": 0.4717464745044708, "grad_norm_var": 0.02077720910170602, "learning_rate": 5e-05, "loss": 0.1894, "loss/crossentropy": 2.9119072556495667, "loss/hidden": 0.0, "loss/logits": 0.18944115936756134, "loss/reg": 0.5021151304244995, "step": 709 }, { "epoch": 0.0071, "grad_norm": 0.5018190145492554, "grad_norm_var": 0.020635747793694748, "learning_rate": 5e-05, "loss": 0.1782, "loss/crossentropy": 2.6813968420028687, "loss/hidden": 0.0, "loss/logits": 0.17822735011577606, "loss/reg": 0.501754879951477, "step": 710 }, { "epoch": 0.00711, "grad_norm": 0.3789680600166321, "grad_norm_var": 0.019984139987440423, "learning_rate": 5e-05, "loss": 0.1738, "loss/crossentropy": 2.863506317138672, "loss/hidden": 0.0, "loss/logits": 0.17375260964035988, "loss/reg": 0.5013217926025391, "step": 711 }, { "epoch": 0.00712, "grad_norm": 0.4153907001018524, "grad_norm_var": 0.019393127986007015, "learning_rate": 5e-05, "loss": 0.1793, "loss/crossentropy": 2.7599775195121765, "loss/hidden": 0.0, "loss/logits": 0.17934707179665565, "loss/reg": 0.5010343790054321, "step": 712 }, { "epoch": 0.00713, "grad_norm": 0.4465808570384979, "grad_norm_var": 0.019280389902616095, "learning_rate": 5e-05, "loss": 0.1845, "loss/crossentropy": 2.7691873908042908, "loss/hidden": 0.0, "loss/logits": 0.18454280123114586, "loss/reg": 0.5005761981010437, "step": 713 }, { "epoch": 0.00714, "grad_norm": 0.4050392210483551, "grad_norm_var": 0.01640400566448978, "learning_rate": 5e-05, "loss": 0.174, "loss/crossentropy": 2.746528208255768, "loss/hidden": 0.0, "loss/logits": 0.17397798597812653, "loss/reg": 0.5002607107162476, "step": 714 }, { "epoch": 0.00715, "grad_norm": 0.35748758912086487, "grad_norm_var": 0.016931655415879224, "learning_rate": 5e-05, "loss": 0.1727, "loss/crossentropy": 2.686008334159851, "loss/hidden": 0.0, "loss/logits": 0.1727052442729473, "loss/reg": 0.4999920427799225, "step": 715 }, { "epoch": 0.00716, "grad_norm": 0.4125484228134155, "grad_norm_var": 0.017010242545140844, "learning_rate": 5e-05, "loss": 0.1771, "loss/crossentropy": 2.7122842669487, "loss/hidden": 0.0, "loss/logits": 0.17708629369735718, "loss/reg": 0.4993031919002533, "step": 716 }, { "epoch": 0.00717, "grad_norm": 0.3935418725013733, "grad_norm_var": 0.017069201319001717, "learning_rate": 5e-05, "loss": 0.1752, "loss/crossentropy": 2.8318325877189636, "loss/hidden": 0.0, "loss/logits": 0.17522020265460014, "loss/reg": 0.49907004833221436, "step": 717 }, { "epoch": 0.00718, "grad_norm": 0.4203684628009796, "grad_norm_var": 0.017084510205142782, "learning_rate": 5e-05, "loss": 0.191, "loss/crossentropy": 2.80024790763855, "loss/hidden": 0.0, "loss/logits": 0.19102013111114502, "loss/reg": 0.4984093904495239, "step": 718 }, { "epoch": 0.00719, "grad_norm": 0.3722877502441406, "grad_norm_var": 0.017384814636405805, "learning_rate": 5e-05, "loss": 0.1709, "loss/crossentropy": 2.852687895298004, "loss/hidden": 0.0, "loss/logits": 0.1709176041185856, "loss/reg": 0.4979517161846161, "step": 719 }, { "epoch": 0.0072, "grad_norm": 0.38861578702926636, "grad_norm_var": 0.017433917393445582, "learning_rate": 5e-05, "loss": 0.1757, "loss/crossentropy": 2.7754414677619934, "loss/hidden": 0.0, "loss/logits": 0.17570730671286583, "loss/reg": 0.4975866377353668, "step": 720 }, { "epoch": 0.00721, "grad_norm": 0.426360160112381, "grad_norm_var": 0.017361086710475513, "learning_rate": 5e-05, "loss": 0.1764, "loss/crossentropy": 2.841568171977997, "loss/hidden": 0.0, "loss/logits": 0.1763586401939392, "loss/reg": 0.4970490038394928, "step": 721 }, { "epoch": 0.00722, "grad_norm": 0.42075178027153015, "grad_norm_var": 0.017213929470745255, "learning_rate": 5e-05, "loss": 0.1901, "loss/crossentropy": 2.751410186290741, "loss/hidden": 0.0, "loss/logits": 0.1901235617697239, "loss/reg": 0.4966580271720886, "step": 722 }, { "epoch": 0.00723, "grad_norm": 0.7979768514633179, "grad_norm_var": 0.010560647611445484, "learning_rate": 5e-05, "loss": 0.1763, "loss/crossentropy": 2.993684768676758, "loss/hidden": 0.0, "loss/logits": 0.17626673355698586, "loss/reg": 0.4961376488208771, "step": 723 }, { "epoch": 0.00724, "grad_norm": 0.5370972752571106, "grad_norm_var": 0.011011888186661389, "learning_rate": 5e-05, "loss": 0.1787, "loss/crossentropy": 2.714000642299652, "loss/hidden": 0.0, "loss/logits": 0.17871497198939323, "loss/reg": 0.49577420949935913, "step": 724 }, { "epoch": 0.00725, "grad_norm": 0.4444805383682251, "grad_norm_var": 0.010967156420705578, "learning_rate": 5e-05, "loss": 0.1697, "loss/crossentropy": 2.7231724858283997, "loss/hidden": 0.0, "loss/logits": 0.1696707122027874, "loss/reg": 0.49540677666664124, "step": 725 }, { "epoch": 0.00726, "grad_norm": 0.49782150983810425, "grad_norm_var": 0.010937847762020007, "learning_rate": 5e-05, "loss": 0.1864, "loss/crossentropy": 2.7989856004714966, "loss/hidden": 0.0, "loss/logits": 0.18637549877166748, "loss/reg": 0.4949151873588562, "step": 726 }, { "epoch": 0.00727, "grad_norm": 0.5502548217773438, "grad_norm_var": 0.011270176674914926, "learning_rate": 5e-05, "loss": 0.1772, "loss/crossentropy": 2.8378891348838806, "loss/hidden": 0.0, "loss/logits": 0.1772335320711136, "loss/reg": 0.4946534037590027, "step": 727 }, { "epoch": 0.00728, "grad_norm": 0.725472629070282, "grad_norm_var": 0.015624920951629484, "learning_rate": 5e-05, "loss": 0.2085, "loss/crossentropy": 2.896777629852295, "loss/hidden": 0.0, "loss/logits": 0.20847301557660103, "loss/reg": 0.49412932991981506, "step": 728 }, { "epoch": 0.00729, "grad_norm": 0.5051751136779785, "grad_norm_var": 0.015619093317576794, "learning_rate": 5e-05, "loss": 0.1688, "loss/crossentropy": 2.7677701711654663, "loss/hidden": 0.0, "loss/logits": 0.16883263364434242, "loss/reg": 0.49356475472450256, "step": 729 }, { "epoch": 0.0073, "grad_norm": 0.45163899660110474, "grad_norm_var": 0.015298660084022453, "learning_rate": 5e-05, "loss": 0.1892, "loss/crossentropy": 2.8651182651519775, "loss/hidden": 0.0, "loss/logits": 0.1891641914844513, "loss/reg": 0.49311143159866333, "step": 730 }, { "epoch": 0.00731, "grad_norm": 0.4691639542579651, "grad_norm_var": 0.01423354172936181, "learning_rate": 5e-05, "loss": 0.1733, "loss/crossentropy": 2.8307586908340454, "loss/hidden": 0.0, "loss/logits": 0.17331324890255928, "loss/reg": 0.49266478419303894, "step": 731 }, { "epoch": 0.00732, "grad_norm": 0.4598138928413391, "grad_norm_var": 0.013895479340263691, "learning_rate": 5e-05, "loss": 0.1728, "loss/crossentropy": 2.740417778491974, "loss/hidden": 0.0, "loss/logits": 0.17279662936925888, "loss/reg": 0.49212732911109924, "step": 732 }, { "epoch": 0.00733, "grad_norm": 0.46383538842201233, "grad_norm_var": 0.01328805545675138, "learning_rate": 5e-05, "loss": 0.1807, "loss/crossentropy": 2.8887507915496826, "loss/hidden": 0.0, "loss/logits": 0.18068893253803253, "loss/reg": 0.4916749596595764, "step": 733 }, { "epoch": 0.00734, "grad_norm": 0.4099755883216858, "grad_norm_var": 0.013399186988902, "learning_rate": 5e-05, "loss": 0.1726, "loss/crossentropy": 2.6440797448158264, "loss/hidden": 0.0, "loss/logits": 0.17260344699025154, "loss/reg": 0.491207093000412, "step": 734 }, { "epoch": 0.00735, "grad_norm": 0.38732391595840454, "grad_norm_var": 0.013167210679394522, "learning_rate": 5e-05, "loss": 0.1862, "loss/crossentropy": 2.7742497324943542, "loss/hidden": 0.0, "loss/logits": 0.18615082278847694, "loss/reg": 0.4907788932323456, "step": 735 }, { "epoch": 0.00736, "grad_norm": 0.4502575993537903, "grad_norm_var": 0.012522235949792201, "learning_rate": 5e-05, "loss": 0.1843, "loss/crossentropy": 2.7469452023506165, "loss/hidden": 0.0, "loss/logits": 0.18431715667247772, "loss/reg": 0.49023789167404175, "step": 736 }, { "epoch": 0.00737, "grad_norm": 0.45339852571487427, "grad_norm_var": 0.012303033731598252, "learning_rate": 5e-05, "loss": 0.1827, "loss/crossentropy": 2.823056995868683, "loss/hidden": 0.0, "loss/logits": 0.182712834328413, "loss/reg": 0.48991167545318604, "step": 737 }, { "epoch": 0.00738, "grad_norm": 0.44522249698638916, "grad_norm_var": 0.012076908092921008, "learning_rate": 5e-05, "loss": 0.1848, "loss/crossentropy": 2.7579530477523804, "loss/hidden": 0.0, "loss/logits": 0.18482673540711403, "loss/reg": 0.48969200253486633, "step": 738 }, { "epoch": 0.00739, "grad_norm": 0.40187308192253113, "grad_norm_var": 0.006307187128519942, "learning_rate": 5e-05, "loss": 0.1872, "loss/crossentropy": 2.743358552455902, "loss/hidden": 0.0, "loss/logits": 0.1872260645031929, "loss/reg": 0.4892804026603699, "step": 739 }, { "epoch": 0.0074, "grad_norm": 0.39080098271369934, "grad_norm_var": 0.00649794666603611, "learning_rate": 5e-05, "loss": 0.1795, "loss/crossentropy": 2.6275678277015686, "loss/hidden": 0.0, "loss/logits": 0.1795000620186329, "loss/reg": 0.48879218101501465, "step": 740 }, { "epoch": 0.00741, "grad_norm": 0.43708568811416626, "grad_norm_var": 0.006525694719990667, "learning_rate": 5e-05, "loss": 0.2061, "loss/crossentropy": 2.728283941745758, "loss/hidden": 0.0, "loss/logits": 0.206060990691185, "loss/reg": 0.48847025632858276, "step": 741 }, { "epoch": 0.00742, "grad_norm": 0.4282028079032898, "grad_norm_var": 0.006558247434499705, "learning_rate": 5e-05, "loss": 0.1856, "loss/crossentropy": 2.7496354579925537, "loss/hidden": 0.0, "loss/logits": 0.18562320619821548, "loss/reg": 0.4880310893058777, "step": 742 }, { "epoch": 0.00743, "grad_norm": 0.378826379776001, "grad_norm_var": 0.006431292744934469, "learning_rate": 5e-05, "loss": 0.1714, "loss/crossentropy": 2.78373521566391, "loss/hidden": 0.0, "loss/logits": 0.17143305763602257, "loss/reg": 0.4876352846622467, "step": 743 }, { "epoch": 0.00744, "grad_norm": 0.43183639645576477, "grad_norm_var": 0.0011771047933298935, "learning_rate": 5e-05, "loss": 0.189, "loss/crossentropy": 2.8287684321403503, "loss/hidden": 0.0, "loss/logits": 0.18895183876156807, "loss/reg": 0.48708289861679077, "step": 744 }, { "epoch": 0.00745, "grad_norm": 0.3891737163066864, "grad_norm_var": 0.0009370200560070393, "learning_rate": 5e-05, "loss": 0.1719, "loss/crossentropy": 2.7831881046295166, "loss/hidden": 0.0, "loss/logits": 0.1718580201268196, "loss/reg": 0.4864675998687744, "step": 745 }, { "epoch": 0.00746, "grad_norm": 0.38805752992630005, "grad_norm_var": 0.0009895099827777623, "learning_rate": 5e-05, "loss": 0.1796, "loss/crossentropy": 2.7164615392684937, "loss/hidden": 0.0, "loss/logits": 0.1795516572892666, "loss/reg": 0.4860108196735382, "step": 746 }, { "epoch": 0.00747, "grad_norm": 0.4134718179702759, "grad_norm_var": 0.0008483841133281526, "learning_rate": 5e-05, "loss": 0.1868, "loss/crossentropy": 2.648416757583618, "loss/hidden": 0.0, "loss/logits": 0.18679198622703552, "loss/reg": 0.485711008310318, "step": 747 }, { "epoch": 0.00748, "grad_norm": 0.41030919551849365, "grad_norm_var": 0.0007425343052934037, "learning_rate": 5e-05, "loss": 0.1819, "loss/crossentropy": 2.5832308530807495, "loss/hidden": 0.0, "loss/logits": 0.1818922609090805, "loss/reg": 0.4852675795555115, "step": 748 }, { "epoch": 0.00749, "grad_norm": 0.451198935508728, "grad_norm_var": 0.0006744089018574891, "learning_rate": 5e-05, "loss": 0.1999, "loss/crossentropy": 2.7698270082473755, "loss/hidden": 0.0, "loss/logits": 0.19988799840211868, "loss/reg": 0.4848245084285736, "step": 749 }, { "epoch": 0.0075, "grad_norm": 0.39419135451316833, "grad_norm_var": 0.0007041078583662874, "learning_rate": 5e-05, "loss": 0.1709, "loss/crossentropy": 2.8577305674552917, "loss/hidden": 0.0, "loss/logits": 0.17092034220695496, "loss/reg": 0.4844954311847687, "step": 750 }, { "epoch": 0.00751, "grad_norm": 0.43690329790115356, "grad_norm_var": 0.0006701449881905241, "learning_rate": 5e-05, "loss": 0.1952, "loss/crossentropy": 2.8064569234848022, "loss/hidden": 0.0, "loss/logits": 0.19517841562628746, "loss/reg": 0.48453548550605774, "step": 751 }, { "epoch": 0.00752, "grad_norm": 0.36283037066459656, "grad_norm_var": 0.0007811720549501189, "learning_rate": 5e-05, "loss": 0.1673, "loss/crossentropy": 2.8547937273979187, "loss/hidden": 0.0, "loss/logits": 0.1673429198563099, "loss/reg": 0.48410069942474365, "step": 752 }, { "epoch": 0.00753, "grad_norm": 0.4136514961719513, "grad_norm_var": 0.0006675978619711586, "learning_rate": 5e-05, "loss": 0.1921, "loss/crossentropy": 2.589355766773224, "loss/hidden": 0.0, "loss/logits": 0.19206470623612404, "loss/reg": 0.48393598198890686, "step": 753 }, { "epoch": 0.00754, "grad_norm": 0.4114004969596863, "grad_norm_var": 0.0005840971491052079, "learning_rate": 5e-05, "loss": 0.1884, "loss/crossentropy": 2.7363895177841187, "loss/hidden": 0.0, "loss/logits": 0.1884075105190277, "loss/reg": 0.4837333858013153, "step": 754 }, { "epoch": 0.00755, "grad_norm": 0.3473058044910431, "grad_norm_var": 0.0008201455593773017, "learning_rate": 5e-05, "loss": 0.1594, "loss/crossentropy": 2.8345980048179626, "loss/hidden": 0.0, "loss/logits": 0.15941740572452545, "loss/reg": 0.48360180854797363, "step": 755 }, { "epoch": 0.00756, "grad_norm": 0.4001142978668213, "grad_norm_var": 0.000807527516491513, "learning_rate": 5e-05, "loss": 0.1846, "loss/crossentropy": 2.7102696299552917, "loss/hidden": 0.0, "loss/logits": 0.18458664789795876, "loss/reg": 0.48326364159584045, "step": 756 }, { "epoch": 0.00757, "grad_norm": 0.367316335439682, "grad_norm_var": 0.0008217480927494969, "learning_rate": 5e-05, "loss": 0.1685, "loss/crossentropy": 2.7205876111984253, "loss/hidden": 0.0, "loss/logits": 0.16851075738668442, "loss/reg": 0.4831550121307373, "step": 757 }, { "epoch": 0.00758, "grad_norm": 0.3782046139240265, "grad_norm_var": 0.000800303768090993, "learning_rate": 5e-05, "loss": 0.1803, "loss/crossentropy": 2.8809556365013123, "loss/hidden": 0.0, "loss/logits": 0.18034518137574196, "loss/reg": 0.48281270265579224, "step": 758 }, { "epoch": 0.00759, "grad_norm": 0.48549550771713257, "grad_norm_var": 0.001232712409078296, "learning_rate": 5e-05, "loss": 0.1996, "loss/crossentropy": 2.7350239157676697, "loss/hidden": 0.0, "loss/logits": 0.19958778470754623, "loss/reg": 0.48283034563064575, "step": 759 }, { "epoch": 0.0076, "grad_norm": 2.953063726425171, "grad_norm_var": 0.40751013686140913, "learning_rate": 5e-05, "loss": 0.3128, "loss/crossentropy": 2.8856847882270813, "loss/hidden": 0.0, "loss/logits": 0.31284236907958984, "loss/reg": 0.48296117782592773, "step": 760 }, { "epoch": 0.00761, "grad_norm": 0.41642895340919495, "grad_norm_var": 0.40692608103357025, "learning_rate": 5e-05, "loss": 0.1766, "loss/crossentropy": 2.7612224221229553, "loss/hidden": 0.0, "loss/logits": 0.1765710674226284, "loss/reg": 0.4825826585292816, "step": 761 }, { "epoch": 0.00762, "grad_norm": 0.48500484228134155, "grad_norm_var": 0.4052344163467061, "learning_rate": 5e-05, "loss": 0.1823, "loss/crossentropy": 2.7033079266548157, "loss/hidden": 0.0, "loss/logits": 0.18231955543160439, "loss/reg": 0.48224908113479614, "step": 762 }, { "epoch": 0.00763, "grad_norm": 0.41894474625587463, "grad_norm_var": 0.4051217517853446, "learning_rate": 5e-05, "loss": 0.179, "loss/crossentropy": 2.655378580093384, "loss/hidden": 0.0, "loss/logits": 0.17895213514566422, "loss/reg": 0.4821183681488037, "step": 763 }, { "epoch": 0.00764, "grad_norm": 0.36629027128219604, "grad_norm_var": 0.40618464695486195, "learning_rate": 5e-05, "loss": 0.1702, "loss/crossentropy": 2.72704154253006, "loss/hidden": 0.0, "loss/logits": 0.1701839119195938, "loss/reg": 0.48201867938041687, "step": 764 }, { "epoch": 0.00765, "grad_norm": 0.38619327545166016, "grad_norm_var": 0.40746130640264244, "learning_rate": 5e-05, "loss": 0.1769, "loss/crossentropy": 2.782890498638153, "loss/hidden": 0.0, "loss/logits": 0.17687128856778145, "loss/reg": 0.4819104075431824, "step": 765 }, { "epoch": 0.00766, "grad_norm": 0.39913424849510193, "grad_norm_var": 0.40735094780863096, "learning_rate": 5e-05, "loss": 0.1797, "loss/crossentropy": 2.568303644657135, "loss/hidden": 0.0, "loss/logits": 0.17972144857048988, "loss/reg": 0.4818379878997803, "step": 766 }, { "epoch": 0.00767, "grad_norm": 0.3685601055622101, "grad_norm_var": 0.4088034704275266, "learning_rate": 5e-05, "loss": 0.1698, "loss/crossentropy": 2.7961647510528564, "loss/hidden": 0.0, "loss/logits": 0.16975264623761177, "loss/reg": 0.4814355671405792, "step": 767 }, { "epoch": 0.00768, "grad_norm": 0.44236090779304504, "grad_norm_var": 0.407108029070114, "learning_rate": 5e-05, "loss": 0.1779, "loss/crossentropy": 2.9139368534088135, "loss/hidden": 0.0, "loss/logits": 0.17787499353289604, "loss/reg": 0.4809689223766327, "step": 768 }, { "epoch": 0.00769, "grad_norm": 0.39624902606010437, "grad_norm_var": 0.407478058403598, "learning_rate": 5e-05, "loss": 0.1812, "loss/crossentropy": 2.8614590764045715, "loss/hidden": 0.0, "loss/logits": 0.18120555952191353, "loss/reg": 0.48079386353492737, "step": 769 }, { "epoch": 0.0077, "grad_norm": 0.36076128482818604, "grad_norm_var": 0.4086678491844749, "learning_rate": 5e-05, "loss": 0.1712, "loss/crossentropy": 2.7885656356811523, "loss/hidden": 0.0, "loss/logits": 0.17119654640555382, "loss/reg": 0.48046737909317017, "step": 770 }, { "epoch": 0.00771, "grad_norm": 0.37886202335357666, "grad_norm_var": 0.40783217123416493, "learning_rate": 5e-05, "loss": 0.176, "loss/crossentropy": 2.9000056385993958, "loss/hidden": 0.0, "loss/logits": 0.17598726227879524, "loss/reg": 0.4801839292049408, "step": 771 }, { "epoch": 0.00772, "grad_norm": 0.3991648554801941, "grad_norm_var": 0.407852807967478, "learning_rate": 5e-05, "loss": 0.1863, "loss/crossentropy": 2.8720275163650513, "loss/hidden": 0.0, "loss/logits": 0.18625085428357124, "loss/reg": 0.4799173176288605, "step": 772 }, { "epoch": 0.00773, "grad_norm": 0.3754219114780426, "grad_norm_var": 0.407645833330487, "learning_rate": 5e-05, "loss": 0.1737, "loss/crossentropy": 2.7886128425598145, "loss/hidden": 0.0, "loss/logits": 0.17374197766184807, "loss/reg": 0.47972872853279114, "step": 773 }, { "epoch": 0.00774, "grad_norm": 0.362498015165329, "grad_norm_var": 0.4080485329793974, "learning_rate": 5e-05, "loss": 0.1662, "loss/crossentropy": 2.7082871794700623, "loss/hidden": 0.0, "loss/logits": 0.16615569218993187, "loss/reg": 0.4796118438243866, "step": 774 }, { "epoch": 0.00775, "grad_norm": 0.3762495517730713, "grad_norm_var": 0.40991104019579505, "learning_rate": 5e-05, "loss": 0.1753, "loss/crossentropy": 2.727062463760376, "loss/hidden": 0.0, "loss/logits": 0.17534175142645836, "loss/reg": 0.47948557138442993, "step": 775 }, { "epoch": 0.00776, "grad_norm": 0.37043890357017517, "grad_norm_var": 0.0011214386124233718, "learning_rate": 5e-05, "loss": 0.1741, "loss/crossentropy": 2.783520519733429, "loss/hidden": 0.0, "loss/logits": 0.17408838123083115, "loss/reg": 0.47931718826293945, "step": 776 }, { "epoch": 0.00777, "grad_norm": 0.38397738337516785, "grad_norm_var": 0.0010898217083848713, "learning_rate": 5e-05, "loss": 0.1687, "loss/crossentropy": 2.7797967195510864, "loss/hidden": 0.0, "loss/logits": 0.16867585107684135, "loss/reg": 0.47926589846611023, "step": 777 }, { "epoch": 0.00778, "grad_norm": 0.4056382477283478, "grad_norm_var": 0.0004980665850293183, "learning_rate": 5e-05, "loss": 0.1728, "loss/crossentropy": 2.706587791442871, "loss/hidden": 0.0, "loss/logits": 0.17276694998145103, "loss/reg": 0.4790947437286377, "step": 778 }, { "epoch": 0.00779, "grad_norm": 0.5380297303199768, "grad_norm_var": 0.0018928579585091717, "learning_rate": 5e-05, "loss": 0.1907, "loss/crossentropy": 2.6305949687957764, "loss/hidden": 0.0, "loss/logits": 0.19068614393472672, "loss/reg": 0.4790230691432953, "step": 779 }, { "epoch": 0.0078, "grad_norm": 0.5009308457374573, "grad_norm_var": 0.002521875056156401, "learning_rate": 5e-05, "loss": 0.1958, "loss/crossentropy": 2.9420247077941895, "loss/hidden": 0.0, "loss/logits": 0.19576625525951385, "loss/reg": 0.4788469076156616, "step": 780 }, { "epoch": 0.00781, "grad_norm": 0.4676210880279541, "grad_norm_var": 0.002756204408870829, "learning_rate": 5e-05, "loss": 0.1913, "loss/crossentropy": 2.740632951259613, "loss/hidden": 0.0, "loss/logits": 0.19129598140716553, "loss/reg": 0.4786304533481598, "step": 781 }, { "epoch": 0.00782, "grad_norm": 0.3729724586009979, "grad_norm_var": 0.0028294494798240385, "learning_rate": 5e-05, "loss": 0.1692, "loss/crossentropy": 2.81093168258667, "loss/hidden": 0.0, "loss/logits": 0.16922969371080399, "loss/reg": 0.4788009226322174, "step": 782 }, { "epoch": 0.00783, "grad_norm": 0.36452633142471313, "grad_norm_var": 0.002850728578130542, "learning_rate": 5e-05, "loss": 0.1747, "loss/crossentropy": 2.834507703781128, "loss/hidden": 0.0, "loss/logits": 0.17466987669467926, "loss/reg": 0.47844743728637695, "step": 783 }, { "epoch": 0.00784, "grad_norm": 0.4101770222187042, "grad_norm_var": 0.002759355089165405, "learning_rate": 5e-05, "loss": 0.1819, "loss/crossentropy": 2.8280447125434875, "loss/hidden": 0.0, "loss/logits": 0.181928813457489, "loss/reg": 0.47824418544769287, "step": 784 }, { "epoch": 0.00785, "grad_norm": 0.36138761043548584, "grad_norm_var": 0.0028712006433561717, "learning_rate": 5e-05, "loss": 0.164, "loss/crossentropy": 2.8210853338241577, "loss/hidden": 0.0, "loss/logits": 0.1639898717403412, "loss/reg": 0.478073388338089, "step": 785 }, { "epoch": 0.00786, "grad_norm": 0.41130200028419495, "grad_norm_var": 0.0027543583681572652, "learning_rate": 5e-05, "loss": 0.184, "loss/crossentropy": 2.877467095851898, "loss/hidden": 0.0, "loss/logits": 0.1840183362364769, "loss/reg": 0.47774508595466614, "step": 786 }, { "epoch": 0.00787, "grad_norm": 0.7104492783546448, "grad_norm_var": 0.008472853607363348, "learning_rate": 5e-05, "loss": 0.2108, "loss/crossentropy": 2.680935800075531, "loss/hidden": 0.0, "loss/logits": 0.21083545684814453, "loss/reg": 0.47754210233688354, "step": 787 }, { "epoch": 0.00788, "grad_norm": 0.38502147793769836, "grad_norm_var": 0.00853534646022371, "learning_rate": 5e-05, "loss": 0.1786, "loss/crossentropy": 2.726678252220154, "loss/hidden": 0.0, "loss/logits": 0.17860794812440872, "loss/reg": 0.47730037569999695, "step": 788 }, { "epoch": 0.00789, "grad_norm": 0.3921632170677185, "grad_norm_var": 0.008442664990774285, "learning_rate": 5e-05, "loss": 0.1748, "loss/crossentropy": 2.858565390110016, "loss/hidden": 0.0, "loss/logits": 0.17478104308247566, "loss/reg": 0.4769524931907654, "step": 789 }, { "epoch": 0.0079, "grad_norm": 0.4116172790527344, "grad_norm_var": 0.008178640580641139, "learning_rate": 5e-05, "loss": 0.1745, "loss/crossentropy": 2.7498709559440613, "loss/hidden": 0.0, "loss/logits": 0.17446143925189972, "loss/reg": 0.4767577052116394, "step": 790 }, { "epoch": 0.00791, "grad_norm": 0.41223233938217163, "grad_norm_var": 0.008006931110841184, "learning_rate": 5e-05, "loss": 0.1916, "loss/crossentropy": 2.9883262515068054, "loss/hidden": 0.0, "loss/logits": 0.1915891245007515, "loss/reg": 0.4764123260974884, "step": 791 }, { "epoch": 0.00792, "grad_norm": 0.39093345403671265, "grad_norm_var": 0.007867268669206891, "learning_rate": 5e-05, "loss": 0.169, "loss/crossentropy": 2.92649245262146, "loss/hidden": 0.0, "loss/logits": 0.16895557194948196, "loss/reg": 0.4766080379486084, "step": 792 }, { "epoch": 0.00793, "grad_norm": 0.3857032358646393, "grad_norm_var": 0.007856303785613727, "learning_rate": 5e-05, "loss": 0.1785, "loss/crossentropy": 2.751195549964905, "loss/hidden": 0.0, "loss/logits": 0.17847151309251785, "loss/reg": 0.4767628610134125, "step": 793 }, { "epoch": 0.00794, "grad_norm": 0.4190113842487335, "grad_norm_var": 0.007819505914468871, "learning_rate": 5e-05, "loss": 0.1753, "loss/crossentropy": 2.6583277583122253, "loss/hidden": 0.0, "loss/logits": 0.17533044517040253, "loss/reg": 0.47678232192993164, "step": 794 }, { "epoch": 0.00795, "grad_norm": 0.3886236846446991, "grad_norm_var": 0.0071299327820755175, "learning_rate": 5e-05, "loss": 0.1867, "loss/crossentropy": 2.6765416264533997, "loss/hidden": 0.0, "loss/logits": 0.18669021874666214, "loss/reg": 0.47689518332481384, "step": 795 }, { "epoch": 0.00796, "grad_norm": 0.3849639296531677, "grad_norm_var": 0.0067815788410599395, "learning_rate": 5e-05, "loss": 0.1817, "loss/crossentropy": 2.8617140650749207, "loss/hidden": 0.0, "loss/logits": 0.18171793967485428, "loss/reg": 0.47739872336387634, "step": 796 }, { "epoch": 0.00797, "grad_norm": 0.37762588262557983, "grad_norm_var": 0.006677883683017364, "learning_rate": 5e-05, "loss": 0.1743, "loss/crossentropy": 2.967951714992523, "loss/hidden": 0.0, "loss/logits": 0.1743115298449993, "loss/reg": 0.477798193693161, "step": 797 }, { "epoch": 0.00798, "grad_norm": 0.4047374129295349, "grad_norm_var": 0.0065791703575071615, "learning_rate": 5e-05, "loss": 0.187, "loss/crossentropy": 2.761665403842926, "loss/hidden": 0.0, "loss/logits": 0.18700791150331497, "loss/reg": 0.47754812240600586, "step": 798 }, { "epoch": 0.00799, "grad_norm": 0.5060772895812988, "grad_norm_var": 0.0069136764438900485, "learning_rate": 5e-05, "loss": 0.185, "loss/crossentropy": 2.7741934657096863, "loss/hidden": 0.0, "loss/logits": 0.1850084848701954, "loss/reg": 0.47804275155067444, "step": 799 }, { "epoch": 0.008, "grad_norm": 0.41059041023254395, "grad_norm_var": 0.006913035369568758, "learning_rate": 5e-05, "loss": 0.1663, "loss/crossentropy": 2.977358341217041, "loss/hidden": 0.0, "loss/logits": 0.16634685918688774, "loss/reg": 0.4780350923538208, "step": 800 }, { "epoch": 0.00801, "grad_norm": 0.4174109101295471, "grad_norm_var": 0.00665623232689259, "learning_rate": 5e-05, "loss": 0.1952, "loss/crossentropy": 2.5766881704330444, "loss/hidden": 0.0, "loss/logits": 0.1951538361608982, "loss/reg": 0.4783391058444977, "step": 801 }, { "epoch": 0.00802, "grad_norm": 0.44634637236595154, "grad_norm_var": 0.006666512479295857, "learning_rate": 5e-05, "loss": 0.2009, "loss/crossentropy": 2.6422817707061768, "loss/hidden": 0.0, "loss/logits": 0.2009154111146927, "loss/reg": 0.478293776512146, "step": 802 }, { "epoch": 0.00803, "grad_norm": 0.4219566285610199, "grad_norm_var": 0.0009928573043308446, "learning_rate": 5e-05, "loss": 0.1819, "loss/crossentropy": 2.7058839797973633, "loss/hidden": 0.0, "loss/logits": 0.18189909309148788, "loss/reg": 0.47866225242614746, "step": 803 }, { "epoch": 0.00804, "grad_norm": 0.3984600007534027, "grad_norm_var": 0.000959946099882399, "learning_rate": 5e-05, "loss": 0.1763, "loss/crossentropy": 2.8107792139053345, "loss/hidden": 0.0, "loss/logits": 0.176250621676445, "loss/reg": 0.478975385427475, "step": 804 }, { "epoch": 0.00805, "grad_norm": 0.4336244761943817, "grad_norm_var": 0.0009658603706985051, "learning_rate": 5e-05, "loss": 0.1978, "loss/crossentropy": 2.7950161695480347, "loss/hidden": 0.0, "loss/logits": 0.19777705147862434, "loss/reg": 0.47901153564453125, "step": 805 }, { "epoch": 0.00806, "grad_norm": 0.41313114762306213, "grad_norm_var": 0.0009657003521270339, "learning_rate": 5e-05, "loss": 0.1872, "loss/crossentropy": 2.837526857852936, "loss/hidden": 0.0, "loss/logits": 0.1872144304215908, "loss/reg": 0.47882410883903503, "step": 806 }, { "epoch": 0.00807, "grad_norm": 0.3815898597240448, "grad_norm_var": 0.0010283973491182412, "learning_rate": 5e-05, "loss": 0.1725, "loss/crossentropy": 2.856390058994293, "loss/hidden": 0.0, "loss/logits": 0.17252086475491524, "loss/reg": 0.47878149151802063, "step": 807 }, { "epoch": 0.00808, "grad_norm": 0.3954201936721802, "grad_norm_var": 0.0010174721281111076, "learning_rate": 5e-05, "loss": 0.1732, "loss/crossentropy": 2.7578430771827698, "loss/hidden": 0.0, "loss/logits": 0.1731596365571022, "loss/reg": 0.4786725640296936, "step": 808 }, { "epoch": 0.00809, "grad_norm": 0.40758198499679565, "grad_norm_var": 0.0009719040855993288, "learning_rate": 5e-05, "loss": 0.1709, "loss/crossentropy": 2.809332489967346, "loss/hidden": 0.0, "loss/logits": 0.17089753970503807, "loss/reg": 0.47836586833000183, "step": 809 }, { "epoch": 0.0081, "grad_norm": 0.4072379767894745, "grad_norm_var": 0.0009710475678422389, "learning_rate": 5e-05, "loss": 0.1819, "loss/crossentropy": 2.754000425338745, "loss/hidden": 0.0, "loss/logits": 0.18188156187534332, "loss/reg": 0.4784814715385437, "step": 810 }, { "epoch": 0.00811, "grad_norm": 0.4962310492992401, "grad_norm_var": 0.0013563321758544096, "learning_rate": 5e-05, "loss": 0.181, "loss/crossentropy": 2.919269859790802, "loss/hidden": 0.0, "loss/logits": 0.18096018210053444, "loss/reg": 0.4782232642173767, "step": 811 }, { "epoch": 0.00812, "grad_norm": 0.3869624137878418, "grad_norm_var": 0.0013475292859184732, "learning_rate": 5e-05, "loss": 0.1787, "loss/crossentropy": 2.783630609512329, "loss/hidden": 0.0, "loss/logits": 0.178705096244812, "loss/reg": 0.477827250957489, "step": 812 }, { "epoch": 0.00813, "grad_norm": 0.42502880096435547, "grad_norm_var": 0.0012260798490522382, "learning_rate": 5e-05, "loss": 0.1853, "loss/crossentropy": 2.7023040056228638, "loss/hidden": 0.0, "loss/logits": 0.18526998162269592, "loss/reg": 0.47723716497421265, "step": 813 }, { "epoch": 0.00814, "grad_norm": 0.4185982942581177, "grad_norm_var": 0.0012061396185813885, "learning_rate": 5e-05, "loss": 0.1979, "loss/crossentropy": 2.6584497690200806, "loss/hidden": 0.0, "loss/logits": 0.1978752613067627, "loss/reg": 0.4769405424594879, "step": 814 }, { "epoch": 0.00815, "grad_norm": 0.4097881019115448, "grad_norm_var": 0.0007176164916637936, "learning_rate": 5e-05, "loss": 0.1898, "loss/crossentropy": 2.7900083661079407, "loss/hidden": 0.0, "loss/logits": 0.18981165438890457, "loss/reg": 0.4762992560863495, "step": 815 }, { "epoch": 0.00816, "grad_norm": 0.38526082038879395, "grad_norm_var": 0.0007789318258663358, "learning_rate": 5e-05, "loss": 0.1891, "loss/crossentropy": 2.7710888385772705, "loss/hidden": 0.0, "loss/logits": 0.1891426146030426, "loss/reg": 0.47592562437057495, "step": 816 }, { "epoch": 0.00817, "grad_norm": 0.36478281021118164, "grad_norm_var": 0.0009371517256883603, "learning_rate": 5e-05, "loss": 0.1776, "loss/crossentropy": 2.807754695415497, "loss/hidden": 0.0, "loss/logits": 0.17761852219700813, "loss/reg": 0.4756256639957428, "step": 817 }, { "epoch": 0.00818, "grad_norm": 0.3579716682434082, "grad_norm_var": 0.0010205695328415606, "learning_rate": 5e-05, "loss": 0.1696, "loss/crossentropy": 2.8411946296691895, "loss/hidden": 0.0, "loss/logits": 0.1695512868463993, "loss/reg": 0.47549933195114136, "step": 818 }, { "epoch": 0.00819, "grad_norm": 0.4531596302986145, "grad_norm_var": 0.001145824191458807, "learning_rate": 5e-05, "loss": 0.1942, "loss/crossentropy": 2.763140857219696, "loss/hidden": 0.0, "loss/logits": 0.19417620077729225, "loss/reg": 0.47528326511383057, "step": 819 }, { "epoch": 0.0082, "grad_norm": 0.453996479511261, "grad_norm_var": 0.0012647899811277193, "learning_rate": 5e-05, "loss": 0.1984, "loss/crossentropy": 2.78369402885437, "loss/hidden": 0.0, "loss/logits": 0.198363795876503, "loss/reg": 0.47489410638809204, "step": 820 }, { "epoch": 0.00821, "grad_norm": 0.6426920890808105, "grad_norm_var": 0.004602263482167409, "learning_rate": 5e-05, "loss": 0.1954, "loss/crossentropy": 2.7893686294555664, "loss/hidden": 0.0, "loss/logits": 0.19544466957449913, "loss/reg": 0.47476086020469666, "step": 821 }, { "epoch": 0.00822, "grad_norm": 0.39247339963912964, "grad_norm_var": 0.00466152850718012, "learning_rate": 5e-05, "loss": 0.1717, "loss/crossentropy": 2.7614815831184387, "loss/hidden": 0.0, "loss/logits": 0.17173147201538086, "loss/reg": 0.4747229814529419, "step": 822 }, { "epoch": 0.00823, "grad_norm": 0.3969910740852356, "grad_norm_var": 0.004589934844416984, "learning_rate": 5e-05, "loss": 0.1719, "loss/crossentropy": 2.6747477054595947, "loss/hidden": 0.0, "loss/logits": 0.17189957946538925, "loss/reg": 0.474607914686203, "step": 823 }, { "epoch": 0.00824, "grad_norm": 0.7954747676849365, "grad_norm_var": 0.013034272179726698, "learning_rate": 5e-05, "loss": 0.2046, "loss/crossentropy": 2.695310413837433, "loss/hidden": 0.0, "loss/logits": 0.20458489656448364, "loss/reg": 0.47468921542167664, "step": 824 }, { "epoch": 0.00825, "grad_norm": 0.47642824053764343, "grad_norm_var": 0.012944443487578106, "learning_rate": 5e-05, "loss": 0.1887, "loss/crossentropy": 3.043631672859192, "loss/hidden": 0.0, "loss/logits": 0.1886681132018566, "loss/reg": 0.4746716022491455, "step": 825 }, { "epoch": 0.00826, "grad_norm": 0.43935635685920715, "grad_norm_var": 0.01280890870462863, "learning_rate": 5e-05, "loss": 0.2013, "loss/crossentropy": 2.8578373789787292, "loss/hidden": 0.0, "loss/logits": 0.20131929218769073, "loss/reg": 0.47495001554489136, "step": 826 }, { "epoch": 0.00827, "grad_norm": 0.40965956449508667, "grad_norm_var": 0.012812360990760127, "learning_rate": 5e-05, "loss": 0.1759, "loss/crossentropy": 2.8239371180534363, "loss/hidden": 0.0, "loss/logits": 0.1759297251701355, "loss/reg": 0.47481387853622437, "step": 827 }, { "epoch": 0.00828, "grad_norm": 0.6331911087036133, "grad_norm_var": 0.014514394931071536, "learning_rate": 5e-05, "loss": 0.1953, "loss/crossentropy": 2.857926607131958, "loss/hidden": 0.0, "loss/logits": 0.19533728808164597, "loss/reg": 0.4746149778366089, "step": 828 }, { "epoch": 0.00829, "grad_norm": 0.41076189279556274, "grad_norm_var": 0.014604917774157396, "learning_rate": 5e-05, "loss": 0.1712, "loss/crossentropy": 2.857885420322418, "loss/hidden": 0.0, "loss/logits": 0.17118285596370697, "loss/reg": 0.4748856723308563, "step": 829 }, { "epoch": 0.0083, "grad_norm": 0.43608206510543823, "grad_norm_var": 0.014515767253063786, "learning_rate": 5e-05, "loss": 0.2049, "loss/crossentropy": 2.7898269295692444, "loss/hidden": 0.0, "loss/logits": 0.2049257569015026, "loss/reg": 0.47489476203918457, "step": 830 }, { "epoch": 0.00831, "grad_norm": 0.471014142036438, "grad_norm_var": 0.014290116060673208, "learning_rate": 5e-05, "loss": 0.1956, "loss/crossentropy": 2.8415348529815674, "loss/hidden": 0.0, "loss/logits": 0.19558412954211235, "loss/reg": 0.47490495443344116, "step": 831 }, { "epoch": 0.00832, "grad_norm": 0.43757137656211853, "grad_norm_var": 0.013870413729055698, "learning_rate": 5e-05, "loss": 0.1922, "loss/crossentropy": 2.704486846923828, "loss/hidden": 0.0, "loss/logits": 0.1922285109758377, "loss/reg": 0.4752568006515503, "step": 832 }, { "epoch": 0.00833, "grad_norm": 0.46833378076553345, "grad_norm_var": 0.013043343995627909, "learning_rate": 5e-05, "loss": 0.1819, "loss/crossentropy": 2.6116300225257874, "loss/hidden": 0.0, "loss/logits": 0.18188169971108437, "loss/reg": 0.4752929210662842, "step": 833 }, { "epoch": 0.00834, "grad_norm": 0.3798231780529022, "grad_norm_var": 0.012718535028528998, "learning_rate": 5e-05, "loss": 0.1749, "loss/crossentropy": 2.781126916408539, "loss/hidden": 0.0, "loss/logits": 0.17486132308840752, "loss/reg": 0.4752624034881592, "step": 834 }, { "epoch": 0.00835, "grad_norm": 0.3832455277442932, "grad_norm_var": 0.013284146428286769, "learning_rate": 5e-05, "loss": 0.1686, "loss/crossentropy": 2.8407583236694336, "loss/hidden": 0.0, "loss/logits": 0.16864283755421638, "loss/reg": 0.4749819040298462, "step": 835 }, { "epoch": 0.00836, "grad_norm": 0.3851434290409088, "grad_norm_var": 0.013788810169135687, "learning_rate": 5e-05, "loss": 0.1676, "loss/crossentropy": 2.825035870075226, "loss/hidden": 0.0, "loss/logits": 0.16755815222859383, "loss/reg": 0.4750012159347534, "step": 836 }, { "epoch": 0.00837, "grad_norm": 0.4028632938861847, "grad_norm_var": 0.011937900528000834, "learning_rate": 5e-05, "loss": 0.1948, "loss/crossentropy": 2.7630125880241394, "loss/hidden": 0.0, "loss/logits": 0.19477133452892303, "loss/reg": 0.4748719334602356, "step": 837 }, { "epoch": 0.00838, "grad_norm": 0.40168240666389465, "grad_norm_var": 0.011863478609137224, "learning_rate": 5e-05, "loss": 0.1884, "loss/crossentropy": 2.730747401714325, "loss/hidden": 0.0, "loss/logits": 0.1884002424776554, "loss/reg": 0.4743792712688446, "step": 838 }, { "epoch": 0.00839, "grad_norm": 0.4155077040195465, "grad_norm_var": 0.011734342043379975, "learning_rate": 5e-05, "loss": 0.1853, "loss/crossentropy": 2.8569948077201843, "loss/hidden": 0.0, "loss/logits": 0.18532460182905197, "loss/reg": 0.4741096496582031, "step": 839 }, { "epoch": 0.0084, "grad_norm": 0.3987247049808502, "grad_norm_var": 0.003780058809897465, "learning_rate": 5e-05, "loss": 0.1729, "loss/crossentropy": 2.9688004851341248, "loss/hidden": 0.0, "loss/logits": 0.17292658239603043, "loss/reg": 0.4739665389060974, "step": 840 }, { "epoch": 0.00841, "grad_norm": 0.3780858814716339, "grad_norm_var": 0.003832593803460767, "learning_rate": 5e-05, "loss": 0.1706, "loss/crossentropy": 2.908842384815216, "loss/hidden": 0.0, "loss/logits": 0.1706022433936596, "loss/reg": 0.4737795293331146, "step": 841 }, { "epoch": 0.00842, "grad_norm": 0.45243778824806213, "grad_norm_var": 0.0038627646022784236, "learning_rate": 5e-05, "loss": 0.1859, "loss/crossentropy": 2.9083980917930603, "loss/hidden": 0.0, "loss/logits": 0.18590235337615013, "loss/reg": 0.4735065698623657, "step": 842 }, { "epoch": 0.00843, "grad_norm": 0.3737180233001709, "grad_norm_var": 0.004036223383593311, "learning_rate": 5e-05, "loss": 0.1769, "loss/crossentropy": 2.871476948261261, "loss/hidden": 0.0, "loss/logits": 0.1768825389444828, "loss/reg": 0.47348451614379883, "step": 843 }, { "epoch": 0.00844, "grad_norm": 0.38407278060913086, "grad_norm_var": 0.0010582546371500912, "learning_rate": 5e-05, "loss": 0.1714, "loss/crossentropy": 2.7732048630714417, "loss/hidden": 0.0, "loss/logits": 0.1714162901043892, "loss/reg": 0.47327345609664917, "step": 844 }, { "epoch": 0.00845, "grad_norm": 0.9496158957481384, "grad_norm_var": 0.019175097992560064, "learning_rate": 5e-05, "loss": 0.1937, "loss/crossentropy": 2.794276177883148, "loss/hidden": 0.0, "loss/logits": 0.19370241463184357, "loss/reg": 0.4728282690048218, "step": 845 }, { "epoch": 0.00846, "grad_norm": 0.43262991309165955, "grad_norm_var": 0.01917988785568429, "learning_rate": 5e-05, "loss": 0.1841, "loss/crossentropy": 2.755667746067047, "loss/hidden": 0.0, "loss/logits": 0.184083241969347, "loss/reg": 0.47277945280075073, "step": 846 }, { "epoch": 0.00847, "grad_norm": 0.4411415457725525, "grad_norm_var": 0.01913066977245711, "learning_rate": 5e-05, "loss": 0.1899, "loss/crossentropy": 2.865315854549408, "loss/hidden": 0.0, "loss/logits": 0.1899258755147457, "loss/reg": 0.4722491502761841, "step": 847 }, { "epoch": 0.00848, "grad_norm": 0.4006330072879791, "grad_norm_var": 0.019241636627569968, "learning_rate": 5e-05, "loss": 0.1852, "loss/crossentropy": 2.8685553669929504, "loss/hidden": 0.0, "loss/logits": 0.18517924845218658, "loss/reg": 0.47171303629875183, "step": 848 }, { "epoch": 0.00849, "grad_norm": 0.46808508038520813, "grad_norm_var": 0.019240716816718465, "learning_rate": 5e-05, "loss": 0.1768, "loss/crossentropy": 2.8529489636421204, "loss/hidden": 0.0, "loss/logits": 0.17683344334363937, "loss/reg": 0.4708520174026489, "step": 849 }, { "epoch": 0.0085, "grad_norm": 0.4440319538116455, "grad_norm_var": 0.01897924076048209, "learning_rate": 5e-05, "loss": 0.1958, "loss/crossentropy": 2.824026107788086, "loss/hidden": 0.0, "loss/logits": 0.1958274506032467, "loss/reg": 0.4699455499649048, "step": 850 }, { "epoch": 0.00851, "grad_norm": 0.3988456428050995, "grad_norm_var": 0.018867090281505093, "learning_rate": 5e-05, "loss": 0.196, "loss/crossentropy": 2.731741726398468, "loss/hidden": 0.0, "loss/logits": 0.19599812477827072, "loss/reg": 0.46961942315101624, "step": 851 }, { "epoch": 0.00852, "grad_norm": 0.41408345103263855, "grad_norm_var": 0.018686727859587377, "learning_rate": 5e-05, "loss": 0.1803, "loss/crossentropy": 2.7289170026779175, "loss/hidden": 0.0, "loss/logits": 0.1802983060479164, "loss/reg": 0.4692685306072235, "step": 852 }, { "epoch": 0.00853, "grad_norm": 0.4150376319885254, "grad_norm_var": 0.018623924625605602, "learning_rate": 5e-05, "loss": 0.1851, "loss/crossentropy": 2.740931987762451, "loss/hidden": 0.0, "loss/logits": 0.18505334109067917, "loss/reg": 0.4685782492160797, "step": 853 }, { "epoch": 0.00854, "grad_norm": 0.37355706095695496, "grad_norm_var": 0.018847135571751498, "learning_rate": 5e-05, "loss": 0.1725, "loss/crossentropy": 2.794736921787262, "loss/hidden": 0.0, "loss/logits": 0.17246677353978157, "loss/reg": 0.4675617814064026, "step": 854 }, { "epoch": 0.00855, "grad_norm": 0.4115447998046875, "grad_norm_var": 0.018864367817637773, "learning_rate": 5e-05, "loss": 0.1753, "loss/crossentropy": 2.8151164650917053, "loss/hidden": 0.0, "loss/logits": 0.17534137517213821, "loss/reg": 0.4669269621372223, "step": 855 }, { "epoch": 0.00856, "grad_norm": 0.3900041878223419, "grad_norm_var": 0.018924107266986328, "learning_rate": 5e-05, "loss": 0.1803, "loss/crossentropy": 2.7815582156181335, "loss/hidden": 0.0, "loss/logits": 0.180332213640213, "loss/reg": 0.4662136137485504, "step": 856 }, { "epoch": 0.00857, "grad_norm": 0.4009651839733124, "grad_norm_var": 0.018751262569167256, "learning_rate": 5e-05, "loss": 0.1748, "loss/crossentropy": 2.7821874618530273, "loss/hidden": 0.0, "loss/logits": 0.17475271224975586, "loss/reg": 0.4657014310359955, "step": 857 }, { "epoch": 0.00858, "grad_norm": 0.588543713092804, "grad_norm_var": 0.02000955628858841, "learning_rate": 5e-05, "loss": 0.2199, "loss/crossentropy": 2.9850769639015198, "loss/hidden": 0.0, "loss/logits": 0.2198828160762787, "loss/reg": 0.4653748869895935, "step": 858 }, { "epoch": 0.00859, "grad_norm": 0.4374126195907593, "grad_norm_var": 0.019569367165864313, "learning_rate": 5e-05, "loss": 0.1815, "loss/crossentropy": 2.8361648321151733, "loss/hidden": 0.0, "loss/logits": 0.1814994029700756, "loss/reg": 0.46498292684555054, "step": 859 }, { "epoch": 0.0086, "grad_norm": 0.3774106204509735, "grad_norm_var": 0.01963904260057596, "learning_rate": 5e-05, "loss": 0.1778, "loss/crossentropy": 2.730868875980377, "loss/hidden": 0.0, "loss/logits": 0.17779555544257164, "loss/reg": 0.4646517336368561, "step": 860 }, { "epoch": 0.00861, "grad_norm": 0.3936383128166199, "grad_norm_var": 0.002586838774398533, "learning_rate": 5e-05, "loss": 0.1768, "loss/crossentropy": 2.8319395780563354, "loss/hidden": 0.0, "loss/logits": 0.1767960973083973, "loss/reg": 0.4640538990497589, "step": 861 }, { "epoch": 0.00862, "grad_norm": 0.372811496257782, "grad_norm_var": 0.0027434255645148267, "learning_rate": 5e-05, "loss": 0.1722, "loss/crossentropy": 2.8459997177124023, "loss/hidden": 0.0, "loss/logits": 0.17220817133784294, "loss/reg": 0.4633285701274872, "step": 862 }, { "epoch": 0.00863, "grad_norm": 0.39379799365997314, "grad_norm_var": 0.002753114507804592, "learning_rate": 5e-05, "loss": 0.1821, "loss/crossentropy": 2.758630394935608, "loss/hidden": 0.0, "loss/logits": 0.18211441859602928, "loss/reg": 0.4628179967403412, "step": 863 }, { "epoch": 0.00864, "grad_norm": 0.40317854285240173, "grad_norm_var": 0.0027477862113775677, "learning_rate": 5e-05, "loss": 0.175, "loss/crossentropy": 2.7625985741615295, "loss/hidden": 0.0, "loss/logits": 0.17503182590007782, "loss/reg": 0.46220946311950684, "step": 864 }, { "epoch": 0.00865, "grad_norm": 0.49033674597740173, "grad_norm_var": 0.0029282658526530742, "learning_rate": 5e-05, "loss": 0.1851, "loss/crossentropy": 2.807451605796814, "loss/hidden": 0.0, "loss/logits": 0.18513855710625648, "loss/reg": 0.4618549942970276, "step": 865 }, { "epoch": 0.00866, "grad_norm": 0.3906607925891876, "grad_norm_var": 0.002928698339220459, "learning_rate": 5e-05, "loss": 0.1784, "loss/crossentropy": 2.716043531894684, "loss/hidden": 0.0, "loss/logits": 0.1783856824040413, "loss/reg": 0.46163463592529297, "step": 866 }, { "epoch": 0.00867, "grad_norm": 0.4586308002471924, "grad_norm_var": 0.0030174245796694846, "learning_rate": 5e-05, "loss": 0.1835, "loss/crossentropy": 2.727673649787903, "loss/hidden": 0.0, "loss/logits": 0.18352920189499855, "loss/reg": 0.4608604609966278, "step": 867 }, { "epoch": 0.00868, "grad_norm": 0.41789695620536804, "grad_norm_var": 0.003015591635956823, "learning_rate": 5e-05, "loss": 0.181, "loss/crossentropy": 2.842820107936859, "loss/hidden": 0.0, "loss/logits": 0.18097610399127007, "loss/reg": 0.4605502784252167, "step": 868 }, { "epoch": 0.00869, "grad_norm": 0.383337140083313, "grad_norm_var": 0.003098165879230293, "learning_rate": 5e-05, "loss": 0.1799, "loss/crossentropy": 2.878231465816498, "loss/hidden": 0.0, "loss/logits": 0.17990639805793762, "loss/reg": 0.4604140818119049, "step": 869 }, { "epoch": 0.0087, "grad_norm": 0.3863121271133423, "grad_norm_var": 0.0030332052844681953, "learning_rate": 5e-05, "loss": 0.1775, "loss/crossentropy": 2.715823709964752, "loss/hidden": 0.0, "loss/logits": 0.1775330752134323, "loss/reg": 0.4602375328540802, "step": 870 }, { "epoch": 0.00871, "grad_norm": 0.37647107243537903, "grad_norm_var": 0.003142757543149995, "learning_rate": 5e-05, "loss": 0.1874, "loss/crossentropy": 2.7884933948516846, "loss/hidden": 0.0, "loss/logits": 0.18739710003137589, "loss/reg": 0.4599546194076538, "step": 871 }, { "epoch": 0.00872, "grad_norm": 0.38756513595581055, "grad_norm_var": 0.003151693298060787, "learning_rate": 5e-05, "loss": 0.1758, "loss/crossentropy": 2.835453450679779, "loss/hidden": 0.0, "loss/logits": 0.1757812313735485, "loss/reg": 0.45953091979026794, "step": 872 }, { "epoch": 0.00873, "grad_norm": 0.41903817653656006, "grad_norm_var": 0.003135430787783733, "learning_rate": 5e-05, "loss": 0.1814, "loss/crossentropy": 2.7359917163848877, "loss/hidden": 0.0, "loss/logits": 0.1813923716545105, "loss/reg": 0.45934534072875977, "step": 873 }, { "epoch": 0.00874, "grad_norm": 0.49642714858055115, "grad_norm_var": 0.0015627070800961782, "learning_rate": 5e-05, "loss": 0.1962, "loss/crossentropy": 2.7962101101875305, "loss/hidden": 0.0, "loss/logits": 0.19618910923600197, "loss/reg": 0.4589351415634155, "step": 874 }, { "epoch": 0.00875, "grad_norm": 0.418437123298645, "grad_norm_var": 0.0015197971562474267, "learning_rate": 5e-05, "loss": 0.1903, "loss/crossentropy": 2.766363024711609, "loss/hidden": 0.0, "loss/logits": 0.19026634097099304, "loss/reg": 0.45870718359947205, "step": 875 }, { "epoch": 0.00876, "grad_norm": 0.4025008976459503, "grad_norm_var": 0.0014488746413842458, "learning_rate": 5e-05, "loss": 0.1914, "loss/crossentropy": 2.81729918718338, "loss/hidden": 0.0, "loss/logits": 0.1914457455277443, "loss/reg": 0.4581085443496704, "step": 876 }, { "epoch": 0.00877, "grad_norm": 0.4219055473804474, "grad_norm_var": 0.001429835905592928, "learning_rate": 5e-05, "loss": 0.1845, "loss/crossentropy": 2.768358588218689, "loss/hidden": 0.0, "loss/logits": 0.18446233868598938, "loss/reg": 0.4576250910758972, "step": 877 }, { "epoch": 0.00878, "grad_norm": 0.4094695746898651, "grad_norm_var": 0.0013139388952979503, "learning_rate": 5e-05, "loss": 0.181, "loss/crossentropy": 2.7483003735542297, "loss/hidden": 0.0, "loss/logits": 0.18098262697458267, "loss/reg": 0.45773014426231384, "step": 878 }, { "epoch": 0.00879, "grad_norm": 0.3711441457271576, "grad_norm_var": 0.0014130686859717746, "learning_rate": 5e-05, "loss": 0.1786, "loss/crossentropy": 2.656266748905182, "loss/hidden": 0.0, "loss/logits": 0.17857538908720016, "loss/reg": 0.457492858171463, "step": 879 }, { "epoch": 0.0088, "grad_norm": 0.3746376931667328, "grad_norm_var": 0.0015073751724968166, "learning_rate": 5e-05, "loss": 0.1688, "loss/crossentropy": 2.7513882517814636, "loss/hidden": 0.0, "loss/logits": 0.16883964091539383, "loss/reg": 0.45768028497695923, "step": 880 }, { "epoch": 0.00881, "grad_norm": 0.39095306396484375, "grad_norm_var": 0.0010972193438498127, "learning_rate": 5e-05, "loss": 0.1783, "loss/crossentropy": 2.7430336475372314, "loss/hidden": 0.0, "loss/logits": 0.17828257381916046, "loss/reg": 0.4572415053844452, "step": 881 }, { "epoch": 0.00882, "grad_norm": 0.36781200766563416, "grad_norm_var": 0.0011783669179629918, "learning_rate": 5e-05, "loss": 0.173, "loss/crossentropy": 2.6919352412223816, "loss/hidden": 0.0, "loss/logits": 0.17304621264338493, "loss/reg": 0.45687490701675415, "step": 882 }, { "epoch": 0.00883, "grad_norm": 0.3737168312072754, "grad_norm_var": 0.0010236116159424472, "learning_rate": 5e-05, "loss": 0.1782, "loss/crossentropy": 3.013018250465393, "loss/hidden": 0.0, "loss/logits": 0.17816292867064476, "loss/reg": 0.45695242285728455, "step": 883 }, { "epoch": 0.00884, "grad_norm": 0.4151737093925476, "grad_norm_var": 0.0010175228375996308, "learning_rate": 5e-05, "loss": 0.1742, "loss/crossentropy": 2.676515579223633, "loss/hidden": 0.0, "loss/logits": 0.17421120405197144, "loss/reg": 0.45693111419677734, "step": 884 }, { "epoch": 0.00885, "grad_norm": 0.36094677448272705, "grad_norm_var": 0.0010976495402016321, "learning_rate": 5e-05, "loss": 0.1775, "loss/crossentropy": 2.785749316215515, "loss/hidden": 0.0, "loss/logits": 0.177512064576149, "loss/reg": 0.4566132426261902, "step": 885 }, { "epoch": 0.00886, "grad_norm": 0.35830846428871155, "grad_norm_var": 0.001191355505636423, "learning_rate": 5e-05, "loss": 0.1632, "loss/crossentropy": 2.6745218634605408, "loss/hidden": 0.0, "loss/logits": 0.16320858150720596, "loss/reg": 0.45632797479629517, "step": 886 }, { "epoch": 0.00887, "grad_norm": 0.39323604106903076, "grad_norm_var": 0.0011640798876311937, "learning_rate": 5e-05, "loss": 0.1825, "loss/crossentropy": 2.71240770816803, "loss/hidden": 0.0, "loss/logits": 0.18253474682569504, "loss/reg": 0.4562245309352875, "step": 887 }, { "epoch": 0.00888, "grad_norm": 0.4248831868171692, "grad_norm_var": 0.001201290718041633, "learning_rate": 5e-05, "loss": 0.1911, "loss/crossentropy": 2.7176828384399414, "loss/hidden": 0.0, "loss/logits": 0.19113916158676147, "loss/reg": 0.4565688371658325, "step": 888 }, { "epoch": 0.00889, "grad_norm": 0.40107715129852295, "grad_norm_var": 0.0011756494462473288, "learning_rate": 5e-05, "loss": 0.1861, "loss/crossentropy": 2.849431097507477, "loss/hidden": 0.0, "loss/logits": 0.18608149141073227, "loss/reg": 0.4560372531414032, "step": 889 }, { "epoch": 0.0089, "grad_norm": 0.3687891960144043, "grad_norm_var": 0.000532226131293471, "learning_rate": 5e-05, "loss": 0.1778, "loss/crossentropy": 2.8518633246421814, "loss/hidden": 0.0, "loss/logits": 0.17775431275367737, "loss/reg": 0.45601704716682434, "step": 890 }, { "epoch": 0.00891, "grad_norm": 0.42580699920654297, "grad_norm_var": 0.000562766690329423, "learning_rate": 5e-05, "loss": 0.1875, "loss/crossentropy": 2.6784918308258057, "loss/hidden": 0.0, "loss/logits": 0.18748047947883606, "loss/reg": 0.4564152956008911, "step": 891 }, { "epoch": 0.00892, "grad_norm": 0.47338661551475525, "grad_norm_var": 0.000982939397889467, "learning_rate": 5e-05, "loss": 0.1708, "loss/crossentropy": 2.7470583319664, "loss/hidden": 0.0, "loss/logits": 0.17078452557325363, "loss/reg": 0.456853985786438, "step": 892 }, { "epoch": 0.00893, "grad_norm": 0.42612820863723755, "grad_norm_var": 0.0009988064598212146, "learning_rate": 5e-05, "loss": 0.189, "loss/crossentropy": 2.7244904041290283, "loss/hidden": 0.0, "loss/logits": 0.1889967955648899, "loss/reg": 0.45654603838920593, "step": 893 }, { "epoch": 0.00894, "grad_norm": 0.3851776719093323, "grad_norm_var": 0.000991953256338655, "learning_rate": 5e-05, "loss": 0.1819, "loss/crossentropy": 2.8310117721557617, "loss/hidden": 0.0, "loss/logits": 0.18191207200288773, "loss/reg": 0.45636725425720215, "step": 894 }, { "epoch": 0.00895, "grad_norm": 0.3990049362182617, "grad_norm_var": 0.0009538964612970435, "learning_rate": 5e-05, "loss": 0.1823, "loss/crossentropy": 2.739080309867859, "loss/hidden": 0.0, "loss/logits": 0.18228528648614883, "loss/reg": 0.45640242099761963, "step": 895 }, { "epoch": 0.00896, "grad_norm": 0.3810177147388458, "grad_norm_var": 0.0009381066895440308, "learning_rate": 5e-05, "loss": 0.2004, "loss/crossentropy": 2.754279911518097, "loss/hidden": 0.0, "loss/logits": 0.2003578580915928, "loss/reg": 0.4563208818435669, "step": 896 }, { "epoch": 0.00897, "grad_norm": 0.40651124715805054, "grad_norm_var": 0.0009415446363810384, "learning_rate": 5e-05, "loss": 0.1895, "loss/crossentropy": 2.9638347029685974, "loss/hidden": 0.0, "loss/logits": 0.18953344970941544, "loss/reg": 0.4560629725456238, "step": 897 }, { "epoch": 0.00898, "grad_norm": 0.3975878655910492, "grad_norm_var": 0.0008788501535310041, "learning_rate": 5e-05, "loss": 0.1752, "loss/crossentropy": 2.6844716668128967, "loss/hidden": 0.0, "loss/logits": 0.17517001926898956, "loss/reg": 0.4556216597557068, "step": 898 }, { "epoch": 0.00899, "grad_norm": 0.3664843440055847, "grad_norm_var": 0.0009069078021963624, "learning_rate": 5e-05, "loss": 0.1812, "loss/crossentropy": 2.8173253536224365, "loss/hidden": 0.0, "loss/logits": 0.1811939775943756, "loss/reg": 0.4556581377983093, "step": 899 }, { "epoch": 0.009, "grad_norm": 0.38823944330215454, "grad_norm_var": 0.0008940574126111637, "learning_rate": 5e-05, "loss": 0.197, "loss/crossentropy": 2.8085598945617676, "loss/hidden": 0.0, "loss/logits": 0.1970026046037674, "loss/reg": 0.455336332321167, "step": 900 }, { "epoch": 0.00901, "grad_norm": 0.3874801695346832, "grad_norm_var": 0.0008094961446685904, "learning_rate": 5e-05, "loss": 0.1803, "loss/crossentropy": 2.8820138573646545, "loss/hidden": 0.0, "loss/logits": 0.18027295917272568, "loss/reg": 0.4550902247428894, "step": 901 }, { "epoch": 0.00902, "grad_norm": 0.3880668580532074, "grad_norm_var": 0.0007036068835187851, "learning_rate": 5e-05, "loss": 0.1695, "loss/crossentropy": 2.8119542002677917, "loss/hidden": 0.0, "loss/logits": 0.16951999813318253, "loss/reg": 0.4550669491291046, "step": 902 }, { "epoch": 0.00903, "grad_norm": 0.3959619700908661, "grad_norm_var": 0.0007013203623452184, "learning_rate": 5e-05, "loss": 0.1786, "loss/crossentropy": 2.7950295209884644, "loss/hidden": 0.0, "loss/logits": 0.17857198789715767, "loss/reg": 0.4553045630455017, "step": 903 }, { "epoch": 0.00904, "grad_norm": 0.5046120285987854, "grad_norm_var": 0.001352767270814074, "learning_rate": 5e-05, "loss": 0.1812, "loss/crossentropy": 2.7969126105308533, "loss/hidden": 0.0, "loss/logits": 0.18121619522571564, "loss/reg": 0.45559972524642944, "step": 904 }, { "epoch": 0.00905, "grad_norm": 0.3799777626991272, "grad_norm_var": 0.0013943231168663435, "learning_rate": 5e-05, "loss": 0.1762, "loss/crossentropy": 2.6912559270858765, "loss/hidden": 0.0, "loss/logits": 0.1762259304523468, "loss/reg": 0.4554554522037506, "step": 905 }, { "epoch": 0.00906, "grad_norm": 0.3529849946498871, "grad_norm_var": 0.0014854787745738525, "learning_rate": 5e-05, "loss": 0.1681, "loss/crossentropy": 2.7441487908363342, "loss/hidden": 0.0, "loss/logits": 0.16807252541184425, "loss/reg": 0.45576900243759155, "step": 906 }, { "epoch": 0.00907, "grad_norm": 0.3670613169670105, "grad_norm_var": 0.0015276334324050252, "learning_rate": 5e-05, "loss": 0.1646, "loss/crossentropy": 2.8141930103302, "loss/hidden": 0.0, "loss/logits": 0.16464769840240479, "loss/reg": 0.45600464940071106, "step": 907 }, { "epoch": 0.00908, "grad_norm": 0.36800873279571533, "grad_norm_var": 0.001190276169188668, "learning_rate": 5e-05, "loss": 0.1735, "loss/crossentropy": 2.7759130001068115, "loss/hidden": 0.0, "loss/logits": 0.17351876199245453, "loss/reg": 0.45601940155029297, "step": 908 }, { "epoch": 0.00909, "grad_norm": 0.37947767972946167, "grad_norm_var": 0.0011226845652304806, "learning_rate": 5e-05, "loss": 0.1782, "loss/crossentropy": 2.7729212045669556, "loss/hidden": 0.0, "loss/logits": 0.17815018445253372, "loss/reg": 0.456147164106369, "step": 909 }, { "epoch": 0.0091, "grad_norm": 0.38141465187072754, "grad_norm_var": 0.001126229161893022, "learning_rate": 5e-05, "loss": 0.1746, "loss/crossentropy": 2.7569803595542908, "loss/hidden": 0.0, "loss/logits": 0.17456472292542458, "loss/reg": 0.45628389716148376, "step": 910 }, { "epoch": 0.00911, "grad_norm": 0.3567200303077698, "grad_norm_var": 0.0011885816140802864, "learning_rate": 5e-05, "loss": 0.1786, "loss/crossentropy": 2.8163618445396423, "loss/hidden": 0.0, "loss/logits": 0.17863010615110397, "loss/reg": 0.45632410049438477, "step": 911 }, { "epoch": 0.00912, "grad_norm": 0.3706549406051636, "grad_norm_var": 0.0012043886598858244, "learning_rate": 5e-05, "loss": 0.1836, "loss/crossentropy": 2.7405253052711487, "loss/hidden": 0.0, "loss/logits": 0.18364877998828888, "loss/reg": 0.45602551102638245, "step": 912 }, { "epoch": 0.00913, "grad_norm": 0.4362257122993469, "grad_norm_var": 0.0013370623618320309, "learning_rate": 5e-05, "loss": 0.1687, "loss/crossentropy": 2.698362410068512, "loss/hidden": 0.0, "loss/logits": 0.16872235387563705, "loss/reg": 0.4559316039085388, "step": 913 }, { "epoch": 0.00914, "grad_norm": 0.3858959376811981, "grad_norm_var": 0.001331922016538345, "learning_rate": 5e-05, "loss": 0.1836, "loss/crossentropy": 2.707677483558655, "loss/hidden": 0.0, "loss/logits": 0.1835642084479332, "loss/reg": 0.455908864736557, "step": 914 }, { "epoch": 0.00915, "grad_norm": 0.3679225444793701, "grad_norm_var": 0.0013279102693886657, "learning_rate": 5e-05, "loss": 0.1844, "loss/crossentropy": 2.8874467611312866, "loss/hidden": 0.0, "loss/logits": 0.18442845717072487, "loss/reg": 0.456117182970047, "step": 915 }, { "epoch": 0.00916, "grad_norm": 0.37645041942596436, "grad_norm_var": 0.001336485935439191, "learning_rate": 5e-05, "loss": 0.1748, "loss/crossentropy": 2.7380640506744385, "loss/hidden": 0.0, "loss/logits": 0.17480285465717316, "loss/reg": 0.4560961127281189, "step": 916 }, { "epoch": 0.00917, "grad_norm": 0.384146511554718, "grad_norm_var": 0.0013371592085634247, "learning_rate": 5e-05, "loss": 0.1774, "loss/crossentropy": 2.7248184084892273, "loss/hidden": 0.0, "loss/logits": 0.17740866914391518, "loss/reg": 0.4560099244117737, "step": 917 }, { "epoch": 0.00918, "grad_norm": 0.37465181946754456, "grad_norm_var": 0.0013468991025680074, "learning_rate": 5e-05, "loss": 0.1748, "loss/crossentropy": 2.708874523639679, "loss/hidden": 0.0, "loss/logits": 0.17475076764822006, "loss/reg": 0.45605844259262085, "step": 918 }, { "epoch": 0.00919, "grad_norm": 0.6961257457733154, "grad_norm_var": 0.007361312657991267, "learning_rate": 5e-05, "loss": 0.2081, "loss/crossentropy": 2.6864916682243347, "loss/hidden": 0.0, "loss/logits": 0.2081458643078804, "loss/reg": 0.4559793174266815, "step": 919 }, { "epoch": 0.0092, "grad_norm": 1.254311203956604, "grad_norm_var": 0.052432011592350045, "learning_rate": 5e-05, "loss": 0.2337, "loss/crossentropy": 2.824992299079895, "loss/hidden": 0.0, "loss/logits": 0.23370274528861046, "loss/reg": 0.4557313919067383, "step": 920 }, { "epoch": 0.00921, "grad_norm": 0.5610416531562805, "grad_norm_var": 0.05274222439864617, "learning_rate": 5e-05, "loss": 0.2195, "loss/crossentropy": 2.8316932320594788, "loss/hidden": 0.0, "loss/logits": 0.2194625288248062, "loss/reg": 0.4557979106903076, "step": 921 }, { "epoch": 0.00922, "grad_norm": 0.4762079119682312, "grad_norm_var": 0.05187847066570333, "learning_rate": 5e-05, "loss": 0.195, "loss/crossentropy": 2.771957039833069, "loss/hidden": 0.0, "loss/logits": 0.19497125223279, "loss/reg": 0.4555503726005554, "step": 922 }, { "epoch": 0.00923, "grad_norm": 0.45956218242645264, "grad_norm_var": 0.051131079668024944, "learning_rate": 5e-05, "loss": 0.1985, "loss/crossentropy": 2.782064378261566, "loss/hidden": 0.0, "loss/logits": 0.19849781692028046, "loss/reg": 0.45516690611839294, "step": 923 }, { "epoch": 0.00924, "grad_norm": 0.4426262080669403, "grad_norm_var": 0.05039669019321806, "learning_rate": 5e-05, "loss": 0.1848, "loss/crossentropy": 2.857316493988037, "loss/hidden": 0.0, "loss/logits": 0.184848602861166, "loss/reg": 0.4548613429069519, "step": 924 }, { "epoch": 0.00925, "grad_norm": 0.4766765832901001, "grad_norm_var": 0.04966543003678317, "learning_rate": 5e-05, "loss": 0.1898, "loss/crossentropy": 2.8048247694969177, "loss/hidden": 0.0, "loss/logits": 0.18982521444559097, "loss/reg": 0.4546864330768585, "step": 925 }, { "epoch": 0.00926, "grad_norm": 0.45893600583076477, "grad_norm_var": 0.048944100639587027, "learning_rate": 5e-05, "loss": 0.1797, "loss/crossentropy": 2.8256375193595886, "loss/hidden": 0.0, "loss/logits": 0.17973751574754715, "loss/reg": 0.4545992314815521, "step": 926 }, { "epoch": 0.00927, "grad_norm": 0.4265064299106598, "grad_norm_var": 0.04798614451857006, "learning_rate": 5e-05, "loss": 0.1732, "loss/crossentropy": 2.9487354159355164, "loss/hidden": 0.0, "loss/logits": 0.17318306863307953, "loss/reg": 0.4542984068393707, "step": 927 }, { "epoch": 0.00928, "grad_norm": 0.4173678457736969, "grad_norm_var": 0.04733717926624418, "learning_rate": 5e-05, "loss": 0.1763, "loss/crossentropy": 2.9056347608566284, "loss/hidden": 0.0, "loss/logits": 0.17634670808911324, "loss/reg": 0.45372530817985535, "step": 928 }, { "epoch": 0.00929, "grad_norm": 0.7064141035079956, "grad_norm_var": 0.04961434867798153, "learning_rate": 5e-05, "loss": 0.1824, "loss/crossentropy": 2.8854130506515503, "loss/hidden": 0.0, "loss/logits": 0.1823662631213665, "loss/reg": 0.4534356892108917, "step": 929 }, { "epoch": 0.0093, "grad_norm": 1.089198350906372, "grad_norm_var": 0.06827682983960885, "learning_rate": 5e-05, "loss": 0.2166, "loss/crossentropy": 3.006975471973419, "loss/hidden": 0.0, "loss/logits": 0.21658925712108612, "loss/reg": 0.4533386826515198, "step": 930 }, { "epoch": 0.00931, "grad_norm": 0.5228180885314941, "grad_norm_var": 0.06579892938554226, "learning_rate": 5e-05, "loss": 0.1802, "loss/crossentropy": 2.906866490840912, "loss/hidden": 0.0, "loss/logits": 0.18020262941718102, "loss/reg": 0.45317989587783813, "step": 931 }, { "epoch": 0.00932, "grad_norm": 0.4682345688343048, "grad_norm_var": 0.0639544861876036, "learning_rate": 5e-05, "loss": 0.1808, "loss/crossentropy": 2.7971653938293457, "loss/hidden": 0.0, "loss/logits": 0.18075479194521904, "loss/reg": 0.45344164967536926, "step": 932 }, { "epoch": 0.00933, "grad_norm": 0.5203787088394165, "grad_norm_var": 0.061630887637985944, "learning_rate": 5e-05, "loss": 0.1954, "loss/crossentropy": 3.0123391151428223, "loss/hidden": 0.0, "loss/logits": 0.19535603374242783, "loss/reg": 0.45359858870506287, "step": 933 }, { "epoch": 0.00934, "grad_norm": 0.4239829182624817, "grad_norm_var": 0.06040310028455581, "learning_rate": 5e-05, "loss": 0.1825, "loss/crossentropy": 2.703702449798584, "loss/hidden": 0.0, "loss/logits": 0.18251239508390427, "loss/reg": 0.45342186093330383, "step": 934 }, { "epoch": 0.00935, "grad_norm": 0.4638396203517914, "grad_norm_var": 0.06041185460144103, "learning_rate": 5e-05, "loss": 0.1842, "loss/crossentropy": 2.7625121474266052, "loss/hidden": 0.0, "loss/logits": 0.18419161438941956, "loss/reg": 0.4532279372215271, "step": 935 }, { "epoch": 0.00936, "grad_norm": 0.38679009675979614, "grad_norm_var": 0.02864273589855153, "learning_rate": 5e-05, "loss": 0.1771, "loss/crossentropy": 2.8018913865089417, "loss/hidden": 0.0, "loss/logits": 0.17705075070261955, "loss/reg": 0.45283743739128113, "step": 936 }, { "epoch": 0.00937, "grad_norm": 0.44610390067100525, "grad_norm_var": 0.02882083957938445, "learning_rate": 5e-05, "loss": 0.1942, "loss/crossentropy": 2.654189169406891, "loss/hidden": 0.0, "loss/logits": 0.19421999901533127, "loss/reg": 0.4525349736213684, "step": 937 }, { "epoch": 0.00938, "grad_norm": 2.368285655975342, "grad_norm_var": 0.24363892921545985, "learning_rate": 5e-05, "loss": 0.205, "loss/crossentropy": 3.0635355710983276, "loss/hidden": 0.0, "loss/logits": 0.20504865795373917, "loss/reg": 0.45214948058128357, "step": 938 }, { "epoch": 0.00939, "grad_norm": 0.5260770320892334, "grad_norm_var": 0.24240515359636686, "learning_rate": 5e-05, "loss": 0.1862, "loss/crossentropy": 2.7869638204574585, "loss/hidden": 0.0, "loss/logits": 0.1862172782421112, "loss/reg": 0.45161131024360657, "step": 939 }, { "epoch": 0.0094, "grad_norm": 0.4882296621799469, "grad_norm_var": 0.24137140276222482, "learning_rate": 5e-05, "loss": 0.1861, "loss/crossentropy": 2.8594043850898743, "loss/hidden": 0.0, "loss/logits": 0.18606602773070335, "loss/reg": 0.451252818107605, "step": 940 }, { "epoch": 0.00941, "grad_norm": 0.4782823324203491, "grad_norm_var": 0.2413372675984066, "learning_rate": 5e-05, "loss": 0.2065, "loss/crossentropy": 2.7439804077148438, "loss/hidden": 0.0, "loss/logits": 0.20654015988111496, "loss/reg": 0.45089903473854065, "step": 941 }, { "epoch": 0.00942, "grad_norm": 0.45780977606773376, "grad_norm_var": 0.24136408046319194, "learning_rate": 5e-05, "loss": 0.1914, "loss/crossentropy": 2.9148364067077637, "loss/hidden": 0.0, "loss/logits": 0.19141338765621185, "loss/reg": 0.4505336880683899, "step": 942 }, { "epoch": 0.00943, "grad_norm": 0.5787521004676819, "grad_norm_var": 0.23854198660495454, "learning_rate": 5e-05, "loss": 0.1909, "loss/crossentropy": 2.844089090824127, "loss/hidden": 0.0, "loss/logits": 0.19092074409127235, "loss/reg": 0.4501431882381439, "step": 943 }, { "epoch": 0.00944, "grad_norm": 0.3920629024505615, "grad_norm_var": 0.23935479536635693, "learning_rate": 5e-05, "loss": 0.1756, "loss/crossentropy": 2.8257625699043274, "loss/hidden": 0.0, "loss/logits": 0.17557938396930695, "loss/reg": 0.4501745402812958, "step": 944 }, { "epoch": 0.00945, "grad_norm": 0.399366170167923, "grad_norm_var": 0.2427259095845141, "learning_rate": 5e-05, "loss": 0.1804, "loss/crossentropy": 2.77220618724823, "loss/hidden": 0.0, "loss/logits": 0.18044748902320862, "loss/reg": 0.4498633146286011, "step": 945 }, { "epoch": 0.00946, "grad_norm": 0.449258029460907, "grad_norm_var": 0.2287676949115643, "learning_rate": 5e-05, "loss": 0.1994, "loss/crossentropy": 2.893985390663147, "loss/hidden": 0.0, "loss/logits": 0.19944066181778908, "loss/reg": 0.44981735944747925, "step": 946 }, { "epoch": 0.00947, "grad_norm": 0.4526951313018799, "grad_norm_var": 0.22966240793809412, "learning_rate": 5e-05, "loss": 0.1907, "loss/crossentropy": 2.8645131587982178, "loss/hidden": 0.0, "loss/logits": 0.1906878985464573, "loss/reg": 0.44999638199806213, "step": 947 }, { "epoch": 0.00948, "grad_norm": 0.40988269448280334, "grad_norm_var": 0.23075457732630453, "learning_rate": 5e-05, "loss": 0.1779, "loss/crossentropy": 2.8552953004837036, "loss/hidden": 0.0, "loss/logits": 0.17793813347816467, "loss/reg": 0.4502427279949188, "step": 948 }, { "epoch": 0.00949, "grad_norm": 0.7006927132606506, "grad_norm_var": 0.23141064628918426, "learning_rate": 5e-05, "loss": 0.2413, "loss/crossentropy": 2.769843816757202, "loss/hidden": 0.0, "loss/logits": 0.24132106825709343, "loss/reg": 0.4506417512893677, "step": 949 }, { "epoch": 0.0095, "grad_norm": 0.4160483479499817, "grad_norm_var": 0.23158903482464407, "learning_rate": 5e-05, "loss": 0.1736, "loss/crossentropy": 2.7790079712867737, "loss/hidden": 0.0, "loss/logits": 0.17358482256531715, "loss/reg": 0.45081761479377747, "step": 950 }, { "epoch": 0.00951, "grad_norm": 0.5285090804100037, "grad_norm_var": 0.23077650547679346, "learning_rate": 5e-05, "loss": 0.2042, "loss/crossentropy": 2.90445077419281, "loss/hidden": 0.0, "loss/logits": 0.2042471021413803, "loss/reg": 0.4510262608528137, "step": 951 }, { "epoch": 0.00952, "grad_norm": 0.42531558871269226, "grad_norm_var": 0.2298129627330251, "learning_rate": 5e-05, "loss": 0.1873, "loss/crossentropy": 2.6535099148750305, "loss/hidden": 0.0, "loss/logits": 0.1872611828148365, "loss/reg": 0.4517408609390259, "step": 952 }, { "epoch": 0.00953, "grad_norm": 0.41266629099845886, "grad_norm_var": 0.23054594043256468, "learning_rate": 5e-05, "loss": 0.1825, "loss/crossentropy": 2.7583318948745728, "loss/hidden": 0.0, "loss/logits": 0.18252769112586975, "loss/reg": 0.4515722095966339, "step": 953 }, { "epoch": 0.00954, "grad_norm": 0.6826277375221252, "grad_norm_var": 0.009075738310713947, "learning_rate": 5e-05, "loss": 0.2384, "loss/crossentropy": 2.8189947605133057, "loss/hidden": 0.0, "loss/logits": 0.23843104392290115, "loss/reg": 0.4517481029033661, "step": 954 }, { "epoch": 0.00955, "grad_norm": 0.45150309801101685, "grad_norm_var": 0.009038667348381072, "learning_rate": 5e-05, "loss": 0.1934, "loss/crossentropy": 2.6897183060646057, "loss/hidden": 0.0, "loss/logits": 0.1933554969727993, "loss/reg": 0.45197048783302307, "step": 955 }, { "epoch": 0.00956, "grad_norm": 0.4133051037788391, "grad_norm_var": 0.009334595159048602, "learning_rate": 5e-05, "loss": 0.1745, "loss/crossentropy": 2.950718879699707, "loss/hidden": 0.0, "loss/logits": 0.1745009385049343, "loss/reg": 0.45231127738952637, "step": 956 }, { "epoch": 0.00957, "grad_norm": 0.42602217197418213, "grad_norm_var": 0.009503661560804337, "learning_rate": 5e-05, "loss": 0.1823, "loss/crossentropy": 2.7819709181785583, "loss/hidden": 0.0, "loss/logits": 0.18229759857058525, "loss/reg": 0.45236602425575256, "step": 957 }, { "epoch": 0.00958, "grad_norm": 0.43896785378456116, "grad_norm_var": 0.009568489539776896, "learning_rate": 5e-05, "loss": 0.2028, "loss/crossentropy": 2.7178964614868164, "loss/hidden": 0.0, "loss/logits": 0.20281242206692696, "loss/reg": 0.45251110196113586, "step": 958 }, { "epoch": 0.00959, "grad_norm": 0.3740888237953186, "grad_norm_var": 0.009317122073169974, "learning_rate": 5e-05, "loss": 0.166, "loss/crossentropy": 2.731823742389679, "loss/hidden": 0.0, "loss/logits": 0.1660252995789051, "loss/reg": 0.4525800943374634, "step": 959 }, { "epoch": 0.0096, "grad_norm": 0.40352582931518555, "grad_norm_var": 0.009220257155676873, "learning_rate": 5e-05, "loss": 0.177, "loss/crossentropy": 2.8784964084625244, "loss/hidden": 0.0, "loss/logits": 0.1770310401916504, "loss/reg": 0.4525856077671051, "step": 960 }, { "epoch": 0.00961, "grad_norm": 0.41794857382774353, "grad_norm_var": 0.009087819139161306, "learning_rate": 5e-05, "loss": 0.1807, "loss/crossentropy": 2.654646098613739, "loss/hidden": 0.0, "loss/logits": 0.18068451806902885, "loss/reg": 0.4523184895515442, "step": 961 }, { "epoch": 0.00962, "grad_norm": 0.5824657678604126, "grad_norm_var": 0.00995825338432269, "learning_rate": 5e-05, "loss": 0.198, "loss/crossentropy": 2.893343985080719, "loss/hidden": 0.0, "loss/logits": 0.19798346236348152, "loss/reg": 0.45190852880477905, "step": 962 }, { "epoch": 0.00963, "grad_norm": 0.4421478807926178, "grad_norm_var": 0.009990971578127495, "learning_rate": 5e-05, "loss": 0.1813, "loss/crossentropy": 2.8475064039230347, "loss/hidden": 0.0, "loss/logits": 0.18132250756025314, "loss/reg": 0.4514501392841339, "step": 963 }, { "epoch": 0.00964, "grad_norm": 0.3532029092311859, "grad_norm_var": 0.010648784334464641, "learning_rate": 5e-05, "loss": 0.1689, "loss/crossentropy": 2.8276309967041016, "loss/hidden": 0.0, "loss/logits": 0.1689227819442749, "loss/reg": 0.4511524438858032, "step": 964 }, { "epoch": 0.00965, "grad_norm": 0.3737221956253052, "grad_norm_var": 0.007134486795176528, "learning_rate": 5e-05, "loss": 0.1757, "loss/crossentropy": 2.885461390018463, "loss/hidden": 0.0, "loss/logits": 0.17570112273097038, "loss/reg": 0.4508180022239685, "step": 965 }, { "epoch": 0.00966, "grad_norm": 0.4158072769641876, "grad_norm_var": 0.00713546534593927, "learning_rate": 5e-05, "loss": 0.1878, "loss/crossentropy": 3.0338741540908813, "loss/hidden": 0.0, "loss/logits": 0.18783221393823624, "loss/reg": 0.45032256841659546, "step": 966 }, { "epoch": 0.00967, "grad_norm": 0.39915457367897034, "grad_norm_var": 0.006764476293504155, "learning_rate": 5e-05, "loss": 0.18, "loss/crossentropy": 2.6552807688713074, "loss/hidden": 0.0, "loss/logits": 0.18003039062023163, "loss/reg": 0.4495224058628082, "step": 967 }, { "epoch": 0.00968, "grad_norm": 0.38653719425201416, "grad_norm_var": 0.006925490719824293, "learning_rate": 5e-05, "loss": 0.1715, "loss/crossentropy": 2.851729452610016, "loss/hidden": 0.0, "loss/logits": 0.1714877150952816, "loss/reg": 0.4490043520927429, "step": 968 }, { "epoch": 0.00969, "grad_norm": 0.39180833101272583, "grad_norm_var": 0.007017173156203546, "learning_rate": 5e-05, "loss": 0.1703, "loss/crossentropy": 2.557051658630371, "loss/hidden": 0.0, "loss/logits": 0.1703101471066475, "loss/reg": 0.44843992590904236, "step": 969 }, { "epoch": 0.0097, "grad_norm": 0.38021141290664673, "grad_norm_var": 0.0027302049114087622, "learning_rate": 5e-05, "loss": 0.1718, "loss/crossentropy": 2.8946127891540527, "loss/hidden": 0.0, "loss/logits": 0.17180591076612473, "loss/reg": 0.44775623083114624, "step": 970 }, { "epoch": 0.00971, "grad_norm": 0.40566161274909973, "grad_norm_var": 0.0026424110439055054, "learning_rate": 5e-05, "loss": 0.1843, "loss/crossentropy": 2.7066898345947266, "loss/hidden": 0.0, "loss/logits": 0.1843091994524002, "loss/reg": 0.44743096828460693, "step": 971 }, { "epoch": 0.00972, "grad_norm": 0.3949342370033264, "grad_norm_var": 0.0026622328037577804, "learning_rate": 5e-05, "loss": 0.1809, "loss/crossentropy": 2.8437857031822205, "loss/hidden": 0.0, "loss/logits": 0.18090396374464035, "loss/reg": 0.44689276814460754, "step": 972 }, { "epoch": 0.00973, "grad_norm": 0.38992777466773987, "grad_norm_var": 0.0026744326718429077, "learning_rate": 5e-05, "loss": 0.1793, "loss/crossentropy": 2.869152069091797, "loss/hidden": 0.0, "loss/logits": 0.1793271265923977, "loss/reg": 0.4464467167854309, "step": 973 }, { "epoch": 0.00974, "grad_norm": 0.3669184148311615, "grad_norm_var": 0.0027146587016174894, "learning_rate": 5e-05, "loss": 0.1871, "loss/crossentropy": 2.8612890243530273, "loss/hidden": 0.0, "loss/logits": 0.18707513436675072, "loss/reg": 0.44594046473503113, "step": 974 }, { "epoch": 0.00975, "grad_norm": 0.7573450207710266, "grad_norm_var": 0.010321591094650496, "learning_rate": 5e-05, "loss": 0.2121, "loss/crossentropy": 2.830893874168396, "loss/hidden": 0.0, "loss/logits": 0.21212119981646538, "loss/reg": 0.44525057077407837, "step": 975 }, { "epoch": 0.00976, "grad_norm": 0.4311162829399109, "grad_norm_var": 0.010276072057486551, "learning_rate": 5e-05, "loss": 0.1927, "loss/crossentropy": 2.6743762493133545, "loss/hidden": 0.0, "loss/logits": 0.19268564134836197, "loss/reg": 0.4448193907737732, "step": 976 }, { "epoch": 0.00977, "grad_norm": 0.5329715609550476, "grad_norm_var": 0.010909599620461856, "learning_rate": 5e-05, "loss": 0.2072, "loss/crossentropy": 2.816002666950226, "loss/hidden": 0.0, "loss/logits": 0.20718568563461304, "loss/reg": 0.4447753429412842, "step": 977 }, { "epoch": 0.00978, "grad_norm": 0.3862622082233429, "grad_norm_var": 0.009529645796977052, "learning_rate": 5e-05, "loss": 0.1754, "loss/crossentropy": 2.7823938727378845, "loss/hidden": 0.0, "loss/logits": 0.17542218044400215, "loss/reg": 0.4442117214202881, "step": 978 }, { "epoch": 0.00979, "grad_norm": 0.4390604794025421, "grad_norm_var": 0.009523381415930269, "learning_rate": 5e-05, "loss": 0.205, "loss/crossentropy": 2.769808530807495, "loss/hidden": 0.0, "loss/logits": 0.20497067272663116, "loss/reg": 0.4441469609737396, "step": 979 }, { "epoch": 0.0098, "grad_norm": 0.38309335708618164, "grad_norm_var": 0.0092919255851909, "learning_rate": 5e-05, "loss": 0.1869, "loss/crossentropy": 2.874597430229187, "loss/hidden": 0.0, "loss/logits": 0.18691887333989143, "loss/reg": 0.44392186403274536, "step": 980 }, { "epoch": 0.00981, "grad_norm": 0.36563828587532043, "grad_norm_var": 0.009353606229279625, "learning_rate": 5e-05, "loss": 0.1742, "loss/crossentropy": 2.7618335485458374, "loss/hidden": 0.0, "loss/logits": 0.17423971369862556, "loss/reg": 0.44422435760498047, "step": 981 }, { "epoch": 0.00982, "grad_norm": 0.3815692961215973, "grad_norm_var": 0.009476382615197313, "learning_rate": 5e-05, "loss": 0.1673, "loss/crossentropy": 2.8218480944633484, "loss/hidden": 0.0, "loss/logits": 0.16729803010821342, "loss/reg": 0.444234162569046, "step": 982 }, { "epoch": 0.00983, "grad_norm": 0.4299367666244507, "grad_norm_var": 0.009431525157682, "learning_rate": 5e-05, "loss": 0.1915, "loss/crossentropy": 2.848261833190918, "loss/hidden": 0.0, "loss/logits": 0.191523015499115, "loss/reg": 0.44451257586479187, "step": 983 }, { "epoch": 0.00984, "grad_norm": 0.43842414021492004, "grad_norm_var": 0.009323753794835557, "learning_rate": 5e-05, "loss": 0.1959, "loss/crossentropy": 2.8166709542274475, "loss/hidden": 0.0, "loss/logits": 0.19589243829250336, "loss/reg": 0.444320946931839, "step": 984 }, { "epoch": 0.00985, "grad_norm": 0.4201236665248871, "grad_norm_var": 0.009230884008565735, "learning_rate": 5e-05, "loss": 0.179, "loss/crossentropy": 2.926607131958008, "loss/hidden": 0.0, "loss/logits": 0.17902707681059837, "loss/reg": 0.44440218806266785, "step": 985 }, { "epoch": 0.00986, "grad_norm": 0.3997006416320801, "grad_norm_var": 0.009121477537223236, "learning_rate": 5e-05, "loss": 0.1823, "loss/crossentropy": 2.625101923942566, "loss/hidden": 0.0, "loss/logits": 0.18231799080967903, "loss/reg": 0.44436293840408325, "step": 986 }, { "epoch": 0.00987, "grad_norm": 0.3660185933113098, "grad_norm_var": 0.009362447824935182, "learning_rate": 5e-05, "loss": 0.1628, "loss/crossentropy": 2.6331620812416077, "loss/hidden": 0.0, "loss/logits": 0.16284004971385002, "loss/reg": 0.44439151883125305, "step": 987 }, { "epoch": 0.00988, "grad_norm": 0.39418521523475647, "grad_norm_var": 0.009366003871928803, "learning_rate": 5e-05, "loss": 0.1792, "loss/crossentropy": 2.7910391688346863, "loss/hidden": 0.0, "loss/logits": 0.1792105995118618, "loss/reg": 0.4439309239387512, "step": 988 }, { "epoch": 0.00989, "grad_norm": 0.3637595474720001, "grad_norm_var": 0.009549118007335678, "learning_rate": 5e-05, "loss": 0.1677, "loss/crossentropy": 2.816851496696472, "loss/hidden": 0.0, "loss/logits": 0.16771914064884186, "loss/reg": 0.44346797466278076, "step": 989 }, { "epoch": 0.0099, "grad_norm": 0.4578511416912079, "grad_norm_var": 0.009319184462051456, "learning_rate": 5e-05, "loss": 0.1981, "loss/crossentropy": 2.852989077568054, "loss/hidden": 0.0, "loss/logits": 0.1980816312134266, "loss/reg": 0.4433007836341858, "step": 990 }, { "epoch": 0.00991, "grad_norm": 0.4427335858345032, "grad_norm_var": 0.0019497304934493346, "learning_rate": 5e-05, "loss": 0.1813, "loss/crossentropy": 2.801695942878723, "loss/hidden": 0.0, "loss/logits": 0.18131743371486664, "loss/reg": 0.4427722990512848, "step": 991 }, { "epoch": 0.00992, "grad_norm": 0.4192834794521332, "grad_norm_var": 0.0019323096749670624, "learning_rate": 5e-05, "loss": 0.1775, "loss/crossentropy": 2.731178879737854, "loss/hidden": 0.0, "loss/logits": 0.17753103002905846, "loss/reg": 0.442390114068985, "step": 992 }, { "epoch": 0.00993, "grad_norm": 0.4066467583179474, "grad_norm_var": 0.00092224077602359, "learning_rate": 5e-05, "loss": 0.1828, "loss/crossentropy": 2.7846652269363403, "loss/hidden": 0.0, "loss/logits": 0.18284928798675537, "loss/reg": 0.44175902009010315, "step": 993 }, { "epoch": 0.00994, "grad_norm": 0.39476749300956726, "grad_norm_var": 0.0009045000138922436, "learning_rate": 5e-05, "loss": 0.1782, "loss/crossentropy": 2.7397571802139282, "loss/hidden": 0.0, "loss/logits": 0.17823457345366478, "loss/reg": 0.4413183927536011, "step": 994 }, { "epoch": 0.00995, "grad_norm": 0.43000832200050354, "grad_norm_var": 0.0008702312584380405, "learning_rate": 5e-05, "loss": 0.1814, "loss/crossentropy": 2.684629499912262, "loss/hidden": 0.0, "loss/logits": 0.1813807711005211, "loss/reg": 0.44085076451301575, "step": 995 }, { "epoch": 0.00996, "grad_norm": 0.3862597942352295, "grad_norm_var": 0.0008612465379275217, "learning_rate": 5e-05, "loss": 0.1763, "loss/crossentropy": 2.802727162837982, "loss/hidden": 0.0, "loss/logits": 0.1762963905930519, "loss/reg": 0.44044408202171326, "step": 996 }, { "epoch": 0.00997, "grad_norm": 0.41826558113098145, "grad_norm_var": 0.0007507338494742388, "learning_rate": 5e-05, "loss": 0.1807, "loss/crossentropy": 2.901082396507263, "loss/hidden": 0.0, "loss/logits": 0.18072273582220078, "loss/reg": 0.43991121649742126, "step": 997 }, { "epoch": 0.00998, "grad_norm": 0.3943028450012207, "grad_norm_var": 0.0007137085445982868, "learning_rate": 5e-05, "loss": 0.1903, "loss/crossentropy": 2.8276814818382263, "loss/hidden": 0.0, "loss/logits": 0.19031721353530884, "loss/reg": 0.4392421841621399, "step": 998 }, { "epoch": 0.00999, "grad_norm": 0.4577745795249939, "grad_norm_var": 0.0008356159623724009, "learning_rate": 5e-05, "loss": 0.2203, "loss/crossentropy": 2.698966920375824, "loss/hidden": 0.0, "loss/logits": 0.22030482068657875, "loss/reg": 0.43889522552490234, "step": 999 }, { "epoch": 0.01, "grad_norm": 0.42141038179397583, "grad_norm_var": 0.0007934958980271152, "learning_rate": 5e-05, "loss": 0.1884, "loss/crossentropy": 2.7145577669143677, "loss/hidden": 0.0, "loss/logits": 0.18840471655130386, "loss/reg": 0.43870753049850464, "step": 1000 }, { "epoch": 0.01001, "grad_norm": 0.3973335921764374, "grad_norm_var": 0.0007976813938209185, "learning_rate": 5e-05, "loss": 0.1865, "loss/crossentropy": 2.753108501434326, "loss/hidden": 0.0, "loss/logits": 0.1865297518670559, "loss/reg": 0.43850383162498474, "step": 1001 }, { "epoch": 0.01002, "grad_norm": 0.45845192670822144, "grad_norm_var": 0.0009374817179577703, "learning_rate": 5e-05, "loss": 0.1952, "loss/crossentropy": 2.84415066242218, "loss/hidden": 0.0, "loss/logits": 0.1952270343899727, "loss/reg": 0.4381890594959259, "step": 1002 }, { "epoch": 0.01003, "grad_norm": 0.5364959836006165, "grad_norm_var": 0.0016844924508086093, "learning_rate": 5e-05, "loss": 0.2292, "loss/crossentropy": 2.8212199211120605, "loss/hidden": 0.0, "loss/logits": 0.22923918068408966, "loss/reg": 0.4376964271068573, "step": 1003 }, { "epoch": 0.01004, "grad_norm": 0.42497745156288147, "grad_norm_var": 0.0016224909971378635, "learning_rate": 5e-05, "loss": 0.1883, "loss/crossentropy": 2.799984097480774, "loss/hidden": 0.0, "loss/logits": 0.18833930045366287, "loss/reg": 0.4374016523361206, "step": 1004 }, { "epoch": 0.01005, "grad_norm": 0.4084516763687134, "grad_norm_var": 0.001378554379228562, "learning_rate": 5e-05, "loss": 0.1837, "loss/crossentropy": 2.874702036380768, "loss/hidden": 0.0, "loss/logits": 0.18371347710490227, "loss/reg": 0.4368451237678528, "step": 1005 }, { "epoch": 0.01006, "grad_norm": 0.3965741991996765, "grad_norm_var": 0.001372923740065938, "learning_rate": 5e-05, "loss": 0.1882, "loss/crossentropy": 2.6980846524238586, "loss/hidden": 0.0, "loss/logits": 0.18822360411286354, "loss/reg": 0.4365234375, "step": 1006 }, { "epoch": 0.01007, "grad_norm": 0.4088714122772217, "grad_norm_var": 0.0013627556568882296, "learning_rate": 5e-05, "loss": 0.1899, "loss/crossentropy": 2.666550040245056, "loss/hidden": 0.0, "loss/logits": 0.18985696509480476, "loss/reg": 0.43632182478904724, "step": 1007 }, { "epoch": 0.01008, "grad_norm": 0.38951027393341064, "grad_norm_var": 0.001430896313727929, "learning_rate": 5e-05, "loss": 0.1849, "loss/crossentropy": 2.778207540512085, "loss/hidden": 0.0, "loss/logits": 0.18486633524298668, "loss/reg": 0.4357018768787384, "step": 1008 }, { "epoch": 0.01009, "grad_norm": 0.3931121826171875, "grad_norm_var": 0.0014675821709053682, "learning_rate": 5e-05, "loss": 0.185, "loss/crossentropy": 2.895179033279419, "loss/hidden": 0.0, "loss/logits": 0.18503106012940407, "loss/reg": 0.4354025423526764, "step": 1009 }, { "epoch": 0.0101, "grad_norm": 0.4337068796157837, "grad_norm_var": 0.0014324580629842172, "learning_rate": 5e-05, "loss": 0.1822, "loss/crossentropy": 2.759913742542267, "loss/hidden": 0.0, "loss/logits": 0.18221255391836166, "loss/reg": 0.43510448932647705, "step": 1010 }, { "epoch": 0.01011, "grad_norm": 0.4512951076030731, "grad_norm_var": 0.0014828859192159668, "learning_rate": 5e-05, "loss": 0.1903, "loss/crossentropy": 2.76470810174942, "loss/hidden": 0.0, "loss/logits": 0.19029200822114944, "loss/reg": 0.43460795283317566, "step": 1011 }, { "epoch": 0.01012, "grad_norm": 0.4190237820148468, "grad_norm_var": 0.0013870765784545676, "learning_rate": 5e-05, "loss": 0.179, "loss/crossentropy": 2.7507981061935425, "loss/hidden": 0.0, "loss/logits": 0.17900363355875015, "loss/reg": 0.4343041181564331, "step": 1012 }, { "epoch": 0.01013, "grad_norm": 0.4443920850753784, "grad_norm_var": 0.0014041981958264522, "learning_rate": 5e-05, "loss": 0.1876, "loss/crossentropy": 2.7142964601516724, "loss/hidden": 0.0, "loss/logits": 0.18760843947529793, "loss/reg": 0.4338569641113281, "step": 1013 }, { "epoch": 0.01014, "grad_norm": 0.42451298236846924, "grad_norm_var": 0.001328606689992151, "learning_rate": 5e-05, "loss": 0.1697, "loss/crossentropy": 2.6924031376838684, "loss/hidden": 0.0, "loss/logits": 0.16973352432250977, "loss/reg": 0.4334600567817688, "step": 1014 }, { "epoch": 0.01015, "grad_norm": 0.41509300470352173, "grad_norm_var": 0.0012793852433893083, "learning_rate": 5e-05, "loss": 0.1787, "loss/crossentropy": 2.784221351146698, "loss/hidden": 0.0, "loss/logits": 0.1786673478782177, "loss/reg": 0.4331037104129791, "step": 1015 }, { "epoch": 0.01016, "grad_norm": 0.3989344537258148, "grad_norm_var": 0.00132606330201398, "learning_rate": 5e-05, "loss": 0.1956, "loss/crossentropy": 2.8916009068489075, "loss/hidden": 0.0, "loss/logits": 0.1956050619482994, "loss/reg": 0.43280115723609924, "step": 1016 }, { "epoch": 0.01017, "grad_norm": 0.3787098526954651, "grad_norm_var": 0.0014165556742197882, "learning_rate": 5e-05, "loss": 0.1884, "loss/crossentropy": 2.8997355103492737, "loss/hidden": 0.0, "loss/logits": 0.18841223791241646, "loss/reg": 0.43238887190818787, "step": 1017 }, { "epoch": 0.01018, "grad_norm": 0.3883640766143799, "grad_norm_var": 0.0014005179985682994, "learning_rate": 5e-05, "loss": 0.1789, "loss/crossentropy": 2.6832340359687805, "loss/hidden": 0.0, "loss/logits": 0.17886852845549583, "loss/reg": 0.43206512928009033, "step": 1018 }, { "epoch": 0.01019, "grad_norm": 0.6079189777374268, "grad_norm_var": 0.002833491094149432, "learning_rate": 5e-05, "loss": 0.2001, "loss/crossentropy": 2.790126860141754, "loss/hidden": 0.0, "loss/logits": 0.20009617507457733, "loss/reg": 0.4323020577430725, "step": 1019 }, { "epoch": 0.0102, "grad_norm": 0.44089069962501526, "grad_norm_var": 0.0028514651326813447, "learning_rate": 5e-05, "loss": 0.1822, "loss/crossentropy": 2.766815960407257, "loss/hidden": 0.0, "loss/logits": 0.182229433208704, "loss/reg": 0.4319862425327301, "step": 1020 }, { "epoch": 0.01021, "grad_norm": 0.39064323902130127, "grad_norm_var": 0.002910484980806875, "learning_rate": 5e-05, "loss": 0.1707, "loss/crossentropy": 2.7756189703941345, "loss/hidden": 0.0, "loss/logits": 0.17074931040406227, "loss/reg": 0.43202945590019226, "step": 1021 }, { "epoch": 0.01022, "grad_norm": 0.4634140431880951, "grad_norm_var": 0.0029466524705999125, "learning_rate": 5e-05, "loss": 0.2562, "loss/crossentropy": 2.7660391330718994, "loss/hidden": 0.0, "loss/logits": 0.2561802379786968, "loss/reg": 0.43214890360832214, "step": 1022 }, { "epoch": 0.01023, "grad_norm": 0.4250680208206177, "grad_norm_var": 0.00292168595295295, "learning_rate": 5e-05, "loss": 0.1864, "loss/crossentropy": 2.7163020372390747, "loss/hidden": 0.0, "loss/logits": 0.1863585039973259, "loss/reg": 0.43258315324783325, "step": 1023 }, { "epoch": 0.01024, "grad_norm": 0.4023502767086029, "grad_norm_var": 0.0028643205379897878, "learning_rate": 5e-05, "loss": 0.1806, "loss/crossentropy": 2.857658326625824, "loss/hidden": 0.0, "loss/logits": 0.1806233823299408, "loss/reg": 0.432801753282547, "step": 1024 }, { "epoch": 0.01025, "grad_norm": 0.4190579354763031, "grad_norm_var": 0.0027793392037530357, "learning_rate": 5e-05, "loss": 0.1834, "loss/crossentropy": 2.8029609322547913, "loss/hidden": 0.0, "loss/logits": 0.1834169626235962, "loss/reg": 0.43298864364624023, "step": 1025 }, { "epoch": 0.01026, "grad_norm": 0.39690741896629333, "grad_norm_var": 0.0028529569228337454, "learning_rate": 5e-05, "loss": 0.1945, "loss/crossentropy": 2.741085112094879, "loss/hidden": 0.0, "loss/logits": 0.1945127472281456, "loss/reg": 0.4332742989063263, "step": 1026 }, { "epoch": 0.01027, "grad_norm": 0.38460633158683777, "grad_norm_var": 0.002934106078913601, "learning_rate": 5e-05, "loss": 0.1784, "loss/crossentropy": 2.699503183364868, "loss/hidden": 0.0, "loss/logits": 0.17837752774357796, "loss/reg": 0.43398159742355347, "step": 1027 }, { "epoch": 0.01028, "grad_norm": 0.37660011649131775, "grad_norm_var": 0.0030803560657114647, "learning_rate": 5e-05, "loss": 0.1807, "loss/crossentropy": 2.82311350107193, "loss/hidden": 0.0, "loss/logits": 0.18070043250918388, "loss/reg": 0.4336981773376465, "step": 1028 }, { "epoch": 0.01029, "grad_norm": 0.4088154733181, "grad_norm_var": 0.0030548638644731303, "learning_rate": 5e-05, "loss": 0.191, "loss/crossentropy": 2.5536622405052185, "loss/hidden": 0.0, "loss/logits": 0.19099323451519012, "loss/reg": 0.4342043399810791, "step": 1029 }, { "epoch": 0.0103, "grad_norm": 0.36473724246025085, "grad_norm_var": 0.0032431560675841927, "learning_rate": 5e-05, "loss": 0.1721, "loss/crossentropy": 2.701414465904236, "loss/hidden": 0.0, "loss/logits": 0.17212556675076485, "loss/reg": 0.43442556262016296, "step": 1030 }, { "epoch": 0.01031, "grad_norm": 0.4276142120361328, "grad_norm_var": 0.0032508029741573187, "learning_rate": 5e-05, "loss": 0.1951, "loss/crossentropy": 2.7619796991348267, "loss/hidden": 0.0, "loss/logits": 0.19509128108620644, "loss/reg": 0.43470045924186707, "step": 1031 }, { "epoch": 0.01032, "grad_norm": 0.4067860245704651, "grad_norm_var": 0.0032355712977365616, "learning_rate": 5e-05, "loss": 0.1812, "loss/crossentropy": 2.6683972477912903, "loss/hidden": 0.0, "loss/logits": 0.18123599141836166, "loss/reg": 0.43542391061782837, "step": 1032 }, { "epoch": 0.01033, "grad_norm": 0.40173107385635376, "grad_norm_var": 0.003149152100705399, "learning_rate": 5e-05, "loss": 0.1907, "loss/crossentropy": 2.798967659473419, "loss/hidden": 0.0, "loss/logits": 0.1906563937664032, "loss/reg": 0.43590816855430603, "step": 1033 }, { "epoch": 0.01034, "grad_norm": 0.364787757396698, "grad_norm_var": 0.003280492303607352, "learning_rate": 5e-05, "loss": 0.1696, "loss/crossentropy": 2.7895652055740356, "loss/hidden": 0.0, "loss/logits": 0.16963185742497444, "loss/reg": 0.43589916825294495, "step": 1034 }, { "epoch": 0.01035, "grad_norm": 0.42282813787460327, "grad_norm_var": 0.0007253232826366252, "learning_rate": 5e-05, "loss": 0.194, "loss/crossentropy": 2.78161758184433, "loss/hidden": 0.0, "loss/logits": 0.19401485472917557, "loss/reg": 0.4364420175552368, "step": 1035 }, { "epoch": 0.01036, "grad_norm": 0.3859194815158844, "grad_norm_var": 0.0006588406082654097, "learning_rate": 5e-05, "loss": 0.1906, "loss/crossentropy": 2.7565722465515137, "loss/hidden": 0.0, "loss/logits": 0.19064166769385338, "loss/reg": 0.4365929365158081, "step": 1036 }, { "epoch": 0.01037, "grad_norm": 0.40090373158454895, "grad_norm_var": 0.0006490400194754258, "learning_rate": 5e-05, "loss": 0.1765, "loss/crossentropy": 2.8080559372901917, "loss/hidden": 0.0, "loss/logits": 0.17650295794010162, "loss/reg": 0.43707337975502014, "step": 1037 }, { "epoch": 0.01038, "grad_norm": 0.3851446211338043, "grad_norm_var": 0.0004041371191311427, "learning_rate": 5e-05, "loss": 0.1716, "loss/crossentropy": 2.91619074344635, "loss/hidden": 0.0, "loss/logits": 0.17159418016672134, "loss/reg": 0.43753373622894287, "step": 1038 }, { "epoch": 0.01039, "grad_norm": 0.395150363445282, "grad_norm_var": 0.0003535642993816257, "learning_rate": 5e-05, "loss": 0.1774, "loss/crossentropy": 2.8508363366127014, "loss/hidden": 0.0, "loss/logits": 0.17736652493476868, "loss/reg": 0.43729671835899353, "step": 1039 }, { "epoch": 0.0104, "grad_norm": 0.4481015205383301, "grad_norm_var": 0.0005200982770067975, "learning_rate": 5e-05, "loss": 0.1807, "loss/crossentropy": 2.9783459305763245, "loss/hidden": 0.0, "loss/logits": 0.18066665157675743, "loss/reg": 0.43756961822509766, "step": 1040 }, { "epoch": 0.01041, "grad_norm": 0.4589756727218628, "grad_norm_var": 0.0007245497934502391, "learning_rate": 5e-05, "loss": 0.1871, "loss/crossentropy": 2.8084189891815186, "loss/hidden": 0.0, "loss/logits": 0.18706010654568672, "loss/reg": 0.43774843215942383, "step": 1041 }, { "epoch": 0.01042, "grad_norm": 2.015770435333252, "grad_norm_var": 0.16345241934848195, "learning_rate": 5e-05, "loss": 0.2452, "loss/crossentropy": 2.935393810272217, "loss/hidden": 0.0, "loss/logits": 0.24519557878375053, "loss/reg": 0.438432902097702, "step": 1042 }, { "epoch": 0.01043, "grad_norm": 0.4372752010822296, "grad_norm_var": 0.16279416628890162, "learning_rate": 5e-05, "loss": 0.175, "loss/crossentropy": 2.744641423225403, "loss/hidden": 0.0, "loss/logits": 0.1750364825129509, "loss/reg": 0.4387211203575134, "step": 1043 }, { "epoch": 0.01044, "grad_norm": 0.4032389223575592, "grad_norm_var": 0.16237776886372132, "learning_rate": 5e-05, "loss": 0.1717, "loss/crossentropy": 2.932245194911957, "loss/hidden": 0.0, "loss/logits": 0.1716589629650116, "loss/reg": 0.4392080307006836, "step": 1044 }, { "epoch": 0.01045, "grad_norm": 0.5461344718933105, "grad_norm_var": 0.16174056315610294, "learning_rate": 5e-05, "loss": 0.2078, "loss/crossentropy": 2.7803109884262085, "loss/hidden": 0.0, "loss/logits": 0.2078443132340908, "loss/reg": 0.4389750063419342, "step": 1045 }, { "epoch": 0.01046, "grad_norm": 0.4110037088394165, "grad_norm_var": 0.16093772256612185, "learning_rate": 5e-05, "loss": 0.1814, "loss/crossentropy": 2.7787723541259766, "loss/hidden": 0.0, "loss/logits": 0.1813787966966629, "loss/reg": 0.4390513300895691, "step": 1046 }, { "epoch": 0.01047, "grad_norm": 0.3953849971294403, "grad_norm_var": 0.16139732649444802, "learning_rate": 5e-05, "loss": 0.1801, "loss/crossentropy": 2.8947722911834717, "loss/hidden": 0.0, "loss/logits": 0.18008361384272575, "loss/reg": 0.4391411244869232, "step": 1047 }, { "epoch": 0.01048, "grad_norm": 0.4086288809776306, "grad_norm_var": 0.16137034802410424, "learning_rate": 5e-05, "loss": 0.1744, "loss/crossentropy": 2.732118248939514, "loss/hidden": 0.0, "loss/logits": 0.17438285425305367, "loss/reg": 0.43918898701667786, "step": 1048 }, { "epoch": 0.01049, "grad_norm": 0.4181691110134125, "grad_norm_var": 0.1611333667988295, "learning_rate": 5e-05, "loss": 0.1746, "loss/crossentropy": 2.5847079753875732, "loss/hidden": 0.0, "loss/logits": 0.1746259443461895, "loss/reg": 0.4390736520290375, "step": 1049 }, { "epoch": 0.0105, "grad_norm": 0.408033549785614, "grad_norm_var": 0.16036342251187544, "learning_rate": 5e-05, "loss": 0.1792, "loss/crossentropy": 2.8558157682418823, "loss/hidden": 0.0, "loss/logits": 0.17915260791778564, "loss/reg": 0.438984215259552, "step": 1050 }, { "epoch": 0.01051, "grad_norm": 0.4414747953414917, "grad_norm_var": 0.16014035213367503, "learning_rate": 5e-05, "loss": 0.1915, "loss/crossentropy": 2.880300223827362, "loss/hidden": 0.0, "loss/logits": 0.19145521894097328, "loss/reg": 0.4388161599636078, "step": 1051 }, { "epoch": 0.01052, "grad_norm": 0.43685099482536316, "grad_norm_var": 0.15937527107491087, "learning_rate": 5e-05, "loss": 0.2031, "loss/crossentropy": 2.6596705317497253, "loss/hidden": 0.0, "loss/logits": 0.20309368893504143, "loss/reg": 0.4384072422981262, "step": 1052 }, { "epoch": 0.01053, "grad_norm": 1.7331510782241821, "grad_norm_var": 0.24814817223950963, "learning_rate": 5e-05, "loss": 0.2713, "loss/crossentropy": 2.8987337350845337, "loss/hidden": 0.0, "loss/logits": 0.27134161442518234, "loss/reg": 0.4379825294017792, "step": 1053 }, { "epoch": 0.01054, "grad_norm": 0.4830080270767212, "grad_norm_var": 0.24582701630065965, "learning_rate": 5e-05, "loss": 0.1838, "loss/crossentropy": 2.7467854022979736, "loss/hidden": 0.0, "loss/logits": 0.18378573283553123, "loss/reg": 0.4377513527870178, "step": 1054 }, { "epoch": 0.01055, "grad_norm": 0.5275254845619202, "grad_norm_var": 0.24304147695515094, "learning_rate": 5e-05, "loss": 0.1974, "loss/crossentropy": 2.7590964436531067, "loss/hidden": 0.0, "loss/logits": 0.19737279787659645, "loss/reg": 0.4374655485153198, "step": 1055 }, { "epoch": 0.01056, "grad_norm": 0.43856480717658997, "grad_norm_var": 0.24326993113889547, "learning_rate": 5e-05, "loss": 0.183, "loss/crossentropy": 2.6534964442253113, "loss/hidden": 0.0, "loss/logits": 0.18304353207349777, "loss/reg": 0.43743401765823364, "step": 1056 }, { "epoch": 0.01057, "grad_norm": 0.4638268053531647, "grad_norm_var": 0.24316550259033898, "learning_rate": 5e-05, "loss": 0.2056, "loss/crossentropy": 2.725363314151764, "loss/hidden": 0.0, "loss/logits": 0.2056051567196846, "loss/reg": 0.43679267168045044, "step": 1057 }, { "epoch": 0.01058, "grad_norm": 0.4190973937511444, "grad_norm_var": 0.1059948175383668, "learning_rate": 5e-05, "loss": 0.1888, "loss/crossentropy": 2.7155872583389282, "loss/hidden": 0.0, "loss/logits": 0.1887688748538494, "loss/reg": 0.4364500045776367, "step": 1058 }, { "epoch": 0.01059, "grad_norm": 0.36503633856773376, "grad_norm_var": 0.1071486867708654, "learning_rate": 5e-05, "loss": 0.1652, "loss/crossentropy": 2.7481898069381714, "loss/hidden": 0.0, "loss/logits": 0.1651594489812851, "loss/reg": 0.4361024498939514, "step": 1059 }, { "epoch": 0.0106, "grad_norm": 0.37737390398979187, "grad_norm_var": 0.10758867104745205, "learning_rate": 5e-05, "loss": 0.1699, "loss/crossentropy": 2.838256597518921, "loss/hidden": 0.0, "loss/logits": 0.1699293553829193, "loss/reg": 0.4357636272907257, "step": 1060 }, { "epoch": 0.01061, "grad_norm": 0.40362313389778137, "grad_norm_var": 0.10830591616233574, "learning_rate": 5e-05, "loss": 0.1727, "loss/crossentropy": 2.952218174934387, "loss/hidden": 0.0, "loss/logits": 0.17272095754742622, "loss/reg": 0.43527907133102417, "step": 1061 }, { "epoch": 0.01062, "grad_norm": 0.4272617995738983, "grad_norm_var": 0.10811180025084609, "learning_rate": 5e-05, "loss": 0.1923, "loss/crossentropy": 2.7458476424217224, "loss/hidden": 0.0, "loss/logits": 0.19231901317834854, "loss/reg": 0.43470820784568787, "step": 1062 }, { "epoch": 0.01063, "grad_norm": 0.46011021733283997, "grad_norm_var": 0.10739150995517388, "learning_rate": 5e-05, "loss": 0.1904, "loss/crossentropy": 2.6868544816970825, "loss/hidden": 0.0, "loss/logits": 0.19039755687117577, "loss/reg": 0.4341670572757721, "step": 1063 }, { "epoch": 0.01064, "grad_norm": 0.39737728238105774, "grad_norm_var": 0.1075563516143618, "learning_rate": 5e-05, "loss": 0.1773, "loss/crossentropy": 2.731410562992096, "loss/hidden": 0.0, "loss/logits": 0.17728161439299583, "loss/reg": 0.433709055185318, "step": 1064 }, { "epoch": 0.01065, "grad_norm": 0.4020436406135559, "grad_norm_var": 0.107775486003374, "learning_rate": 5e-05, "loss": 0.1756, "loss/crossentropy": 2.791599929332733, "loss/hidden": 0.0, "loss/logits": 0.17558511346578598, "loss/reg": 0.43316689133644104, "step": 1065 }, { "epoch": 0.01066, "grad_norm": 0.4394177198410034, "grad_norm_var": 0.10740399127369506, "learning_rate": 5e-05, "loss": 0.1795, "loss/crossentropy": 2.5972419381141663, "loss/hidden": 0.0, "loss/logits": 0.17948077619075775, "loss/reg": 0.43282970786094666, "step": 1066 }, { "epoch": 0.01067, "grad_norm": 0.38576647639274597, "grad_norm_var": 0.10813282278765678, "learning_rate": 5e-05, "loss": 0.1838, "loss/crossentropy": 2.7133710384368896, "loss/hidden": 0.0, "loss/logits": 0.18376357853412628, "loss/reg": 0.4323788285255432, "step": 1067 }, { "epoch": 0.01068, "grad_norm": 0.3966830372810364, "grad_norm_var": 0.10862544224004128, "learning_rate": 5e-05, "loss": 0.1766, "loss/crossentropy": 2.861550509929657, "loss/hidden": 0.0, "loss/logits": 0.1766049601137638, "loss/reg": 0.4316334128379822, "step": 1068 }, { "epoch": 0.01069, "grad_norm": 0.4183769226074219, "grad_norm_var": 0.0018028476221244075, "learning_rate": 5e-05, "loss": 0.1903, "loss/crossentropy": 2.708063840866089, "loss/hidden": 0.0, "loss/logits": 0.19027618691325188, "loss/reg": 0.43050616979599, "step": 1069 }, { "epoch": 0.0107, "grad_norm": 0.48006752133369446, "grad_norm_var": 0.0017807697744954868, "learning_rate": 5e-05, "loss": 0.2057, "loss/crossentropy": 2.712974727153778, "loss/hidden": 0.0, "loss/logits": 0.20573649555444717, "loss/reg": 0.4299947917461395, "step": 1070 }, { "epoch": 0.01071, "grad_norm": 0.3677997887134552, "grad_norm_var": 0.0011946928715354558, "learning_rate": 5e-05, "loss": 0.1715, "loss/crossentropy": 2.7771891951560974, "loss/hidden": 0.0, "loss/logits": 0.17148924246430397, "loss/reg": 0.4293523132801056, "step": 1071 }, { "epoch": 0.01072, "grad_norm": 0.38207536935806274, "grad_norm_var": 0.0012177879462686668, "learning_rate": 5e-05, "loss": 0.175, "loss/crossentropy": 2.7082371711730957, "loss/hidden": 0.0, "loss/logits": 0.17500527575612068, "loss/reg": 0.4285602867603302, "step": 1072 }, { "epoch": 0.01073, "grad_norm": 0.45550402998924255, "grad_norm_var": 0.0011641843680753976, "learning_rate": 5e-05, "loss": 0.1906, "loss/crossentropy": 2.927842855453491, "loss/hidden": 0.0, "loss/logits": 0.19062582775950432, "loss/reg": 0.4279721975326538, "step": 1073 }, { "epoch": 0.01074, "grad_norm": 0.408554345369339, "grad_norm_var": 0.0011598906359289432, "learning_rate": 5e-05, "loss": 0.1898, "loss/crossentropy": 2.6692814230918884, "loss/hidden": 0.0, "loss/logits": 0.18984434753656387, "loss/reg": 0.42759957909584045, "step": 1074 }, { "epoch": 0.01075, "grad_norm": 0.4410555958747864, "grad_norm_var": 0.0010608466150636856, "learning_rate": 5e-05, "loss": 0.1905, "loss/crossentropy": 2.7485557794570923, "loss/hidden": 0.0, "loss/logits": 0.19052665308117867, "loss/reg": 0.4270976483821869, "step": 1075 }, { "epoch": 0.01076, "grad_norm": 0.42680755257606506, "grad_norm_var": 0.0009643043651584786, "learning_rate": 5e-05, "loss": 0.169, "loss/crossentropy": 2.8404088616371155, "loss/hidden": 0.0, "loss/logits": 0.16903214901685715, "loss/reg": 0.4262998700141907, "step": 1076 }, { "epoch": 0.01077, "grad_norm": 0.4795473515987396, "grad_norm_var": 0.001176181866958818, "learning_rate": 5e-05, "loss": 0.1861, "loss/crossentropy": 2.7634173035621643, "loss/hidden": 0.0, "loss/logits": 0.1861177161335945, "loss/reg": 0.425531804561615, "step": 1077 }, { "epoch": 0.01078, "grad_norm": 0.4072960317134857, "grad_norm_var": 0.0011898256602089763, "learning_rate": 5e-05, "loss": 0.1817, "loss/crossentropy": 2.616054594516754, "loss/hidden": 0.0, "loss/logits": 0.18167918547987938, "loss/reg": 0.42517319321632385, "step": 1078 }, { "epoch": 0.01079, "grad_norm": 1.7397806644439697, "grad_norm_var": 0.11007707942226762, "learning_rate": 5e-05, "loss": 0.2715, "loss/crossentropy": 2.7692105770111084, "loss/hidden": 0.0, "loss/logits": 0.2714749500155449, "loss/reg": 0.42386171221733093, "step": 1079 }, { "epoch": 0.0108, "grad_norm": 0.4348938763141632, "grad_norm_var": 0.10964290539640305, "learning_rate": 5e-05, "loss": 0.1847, "loss/crossentropy": 2.7332602739334106, "loss/hidden": 0.0, "loss/logits": 0.1846938394010067, "loss/reg": 0.42332133650779724, "step": 1080 }, { "epoch": 0.01081, "grad_norm": 0.4931792616844177, "grad_norm_var": 0.10892182933798715, "learning_rate": 5e-05, "loss": 0.1858, "loss/crossentropy": 2.5767018795013428, "loss/hidden": 0.0, "loss/logits": 0.18576065450906754, "loss/reg": 0.4223513901233673, "step": 1081 }, { "epoch": 0.01082, "grad_norm": 0.4487575888633728, "grad_norm_var": 0.10883963280806083, "learning_rate": 5e-05, "loss": 0.1756, "loss/crossentropy": 2.8224688172340393, "loss/hidden": 0.0, "loss/logits": 0.17560873180627823, "loss/reg": 0.4217327833175659, "step": 1082 }, { "epoch": 0.01083, "grad_norm": 0.5026548504829407, "grad_norm_var": 0.10775138355144258, "learning_rate": 5e-05, "loss": 0.1946, "loss/crossentropy": 2.9502413272857666, "loss/hidden": 0.0, "loss/logits": 0.19463224709033966, "loss/reg": 0.4210765063762665, "step": 1083 }, { "epoch": 0.01084, "grad_norm": 0.5106552243232727, "grad_norm_var": 0.10672438607311911, "learning_rate": 5e-05, "loss": 0.1719, "loss/crossentropy": 2.7701436281204224, "loss/hidden": 0.0, "loss/logits": 0.17188283801078796, "loss/reg": 0.42058122158050537, "step": 1084 }, { "epoch": 0.01085, "grad_norm": 0.43835264444351196, "grad_norm_var": 0.1064658407548888, "learning_rate": 5e-05, "loss": 0.1791, "loss/crossentropy": 2.7805017828941345, "loss/hidden": 0.0, "loss/logits": 0.1790996640920639, "loss/reg": 0.4204482138156891, "step": 1085 }, { "epoch": 0.01086, "grad_norm": 0.45970335602760315, "grad_norm_var": 0.10661664293048828, "learning_rate": 5e-05, "loss": 0.1996, "loss/crossentropy": 2.8365654945373535, "loss/hidden": 0.0, "loss/logits": 0.1996338926255703, "loss/reg": 0.42006853222846985, "step": 1086 }, { "epoch": 0.01087, "grad_norm": 0.390616238117218, "grad_norm_var": 0.10617158953854876, "learning_rate": 5e-05, "loss": 0.1666, "loss/crossentropy": 2.841938316822052, "loss/hidden": 0.0, "loss/logits": 0.16664209216833115, "loss/reg": 0.4195478856563568, "step": 1087 }, { "epoch": 0.01088, "grad_norm": 0.44494307041168213, "grad_norm_var": 0.10521038413697206, "learning_rate": 5e-05, "loss": 0.1826, "loss/crossentropy": 2.7355661392211914, "loss/hidden": 0.0, "loss/logits": 0.18256185576319695, "loss/reg": 0.4192197322845459, "step": 1088 }, { "epoch": 0.01089, "grad_norm": 0.40124809741973877, "grad_norm_var": 0.10593431955170646, "learning_rate": 5e-05, "loss": 0.1768, "loss/crossentropy": 2.7722306847572327, "loss/hidden": 0.0, "loss/logits": 0.17683351039886475, "loss/reg": 0.4187287390232086, "step": 1089 }, { "epoch": 0.0109, "grad_norm": 0.40142685174942017, "grad_norm_var": 0.10604982251177912, "learning_rate": 5e-05, "loss": 0.1839, "loss/crossentropy": 2.7355655431747437, "loss/hidden": 0.0, "loss/logits": 0.18393169716000557, "loss/reg": 0.418019562959671, "step": 1090 }, { "epoch": 0.01091, "grad_norm": 0.4214020073413849, "grad_norm_var": 0.10629736447692638, "learning_rate": 5e-05, "loss": 0.1913, "loss/crossentropy": 2.7572532296180725, "loss/hidden": 0.0, "loss/logits": 0.19131805375218391, "loss/reg": 0.41784152388572693, "step": 1091 }, { "epoch": 0.01092, "grad_norm": 0.4756261706352234, "grad_norm_var": 0.1058066542961391, "learning_rate": 5e-05, "loss": 0.2016, "loss/crossentropy": 2.9134355187416077, "loss/hidden": 0.0, "loss/logits": 0.20157011970877647, "loss/reg": 0.4178382158279419, "step": 1092 }, { "epoch": 0.01093, "grad_norm": 1.2561731338500977, "grad_norm_var": 0.1384726200767847, "learning_rate": 5e-05, "loss": 0.2223, "loss/crossentropy": 2.973529875278473, "loss/hidden": 0.0, "loss/logits": 0.2223297692835331, "loss/reg": 0.4177326560020447, "step": 1093 }, { "epoch": 0.01094, "grad_norm": 0.4797002375125885, "grad_norm_var": 0.13716515638225932, "learning_rate": 5e-05, "loss": 0.1899, "loss/crossentropy": 2.7411617636680603, "loss/hidden": 0.0, "loss/logits": 0.18986839801073074, "loss/reg": 0.4177788496017456, "step": 1094 }, { "epoch": 0.01095, "grad_norm": 0.4734818637371063, "grad_norm_var": 0.04176920787397708, "learning_rate": 5e-05, "loss": 0.1824, "loss/crossentropy": 2.818439304828644, "loss/hidden": 0.0, "loss/logits": 0.18243376910686493, "loss/reg": 0.417839378118515, "step": 1095 }, { "epoch": 0.01096, "grad_norm": 0.6227381229400635, "grad_norm_var": 0.04229254100242295, "learning_rate": 5e-05, "loss": 0.1851, "loss/crossentropy": 2.8773878812789917, "loss/hidden": 0.0, "loss/logits": 0.18506751209497452, "loss/reg": 0.4173845052719116, "step": 1096 }, { "epoch": 0.01097, "grad_norm": 0.5801162123680115, "grad_norm_var": 0.04252599322899469, "learning_rate": 5e-05, "loss": 0.1921, "loss/crossentropy": 2.9086549878120422, "loss/hidden": 0.0, "loss/logits": 0.1921495720744133, "loss/reg": 0.4169389009475708, "step": 1097 }, { "epoch": 0.01098, "grad_norm": 0.4532332122325897, "grad_norm_var": 0.04248519392849221, "learning_rate": 5e-05, "loss": 0.1635, "loss/crossentropy": 2.719603717327118, "loss/hidden": 0.0, "loss/logits": 0.16354722902178764, "loss/reg": 0.416939377784729, "step": 1098 }, { "epoch": 0.01099, "grad_norm": 0.6343590617179871, "grad_norm_var": 0.043273430350800474, "learning_rate": 5e-05, "loss": 0.2141, "loss/crossentropy": 2.823514997959137, "loss/hidden": 0.0, "loss/logits": 0.21414609253406525, "loss/reg": 0.41646242141723633, "step": 1099 }, { "epoch": 0.011, "grad_norm": 0.43662020564079285, "grad_norm_var": 0.043784614117601824, "learning_rate": 5e-05, "loss": 0.195, "loss/crossentropy": 2.772375226020813, "loss/hidden": 0.0, "loss/logits": 0.19500497728586197, "loss/reg": 0.4163966178894043, "step": 1100 }, { "epoch": 0.01101, "grad_norm": 0.44469979405403137, "grad_norm_var": 0.043715404028499744, "learning_rate": 5e-05, "loss": 0.186, "loss/crossentropy": 2.8713322281837463, "loss/hidden": 0.0, "loss/logits": 0.1860225833952427, "loss/reg": 0.41602981090545654, "step": 1101 }, { "epoch": 0.01102, "grad_norm": 0.6034563779830933, "grad_norm_var": 0.043784062538657395, "learning_rate": 5e-05, "loss": 0.1942, "loss/crossentropy": 2.7249866127967834, "loss/hidden": 0.0, "loss/logits": 0.1941785290837288, "loss/reg": 0.41555050015449524, "step": 1102 }, { "epoch": 0.01103, "grad_norm": 0.483451247215271, "grad_norm_var": 0.042566594615286966, "learning_rate": 5e-05, "loss": 0.1863, "loss/crossentropy": 2.687652349472046, "loss/hidden": 0.0, "loss/logits": 0.18631885200738907, "loss/reg": 0.41518479585647583, "step": 1103 }, { "epoch": 0.01104, "grad_norm": 0.5887930393218994, "grad_norm_var": 0.04206945898437322, "learning_rate": 5e-05, "loss": 0.2488, "loss/crossentropy": 3.034087896347046, "loss/hidden": 0.0, "loss/logits": 0.24880356714129448, "loss/reg": 0.4147547483444214, "step": 1104 }, { "epoch": 0.01105, "grad_norm": 0.42360809445381165, "grad_norm_var": 0.0416653288514856, "learning_rate": 5e-05, "loss": 0.1726, "loss/crossentropy": 2.705663800239563, "loss/hidden": 0.0, "loss/logits": 0.17257391661405563, "loss/reg": 0.41442447900772095, "step": 1105 }, { "epoch": 0.01106, "grad_norm": 0.4743936061859131, "grad_norm_var": 0.04056547338864664, "learning_rate": 5e-05, "loss": 0.1875, "loss/crossentropy": 2.8113759756088257, "loss/hidden": 0.0, "loss/logits": 0.18747293949127197, "loss/reg": 0.41447287797927856, "step": 1106 }, { "epoch": 0.01107, "grad_norm": 0.44849854707717896, "grad_norm_var": 0.040135045708098595, "learning_rate": 5e-05, "loss": 0.2004, "loss/crossentropy": 2.8940623998641968, "loss/hidden": 0.0, "loss/logits": 0.20044919475913048, "loss/reg": 0.4144785702228546, "step": 1107 }, { "epoch": 0.01108, "grad_norm": 0.45357024669647217, "grad_norm_var": 0.04039867826025951, "learning_rate": 5e-05, "loss": 0.1884, "loss/crossentropy": 2.788698971271515, "loss/hidden": 0.0, "loss/logits": 0.18842114508152008, "loss/reg": 0.41460055112838745, "step": 1108 }, { "epoch": 0.01109, "grad_norm": 0.4674374759197235, "grad_norm_var": 0.005389596380124421, "learning_rate": 5e-05, "loss": 0.1868, "loss/crossentropy": 2.7953092455863953, "loss/hidden": 0.0, "loss/logits": 0.18683188781142235, "loss/reg": 0.4149346351623535, "step": 1109 }, { "epoch": 0.0111, "grad_norm": 0.5756605267524719, "grad_norm_var": 0.005650887151044263, "learning_rate": 5e-05, "loss": 0.199, "loss/crossentropy": 2.81899094581604, "loss/hidden": 0.0, "loss/logits": 0.19900312274694443, "loss/reg": 0.4148696959018707, "step": 1110 }, { "epoch": 0.01111, "grad_norm": 0.44086745381355286, "grad_norm_var": 0.0058772898316640605, "learning_rate": 5e-05, "loss": 0.1766, "loss/crossentropy": 2.804613709449768, "loss/hidden": 0.0, "loss/logits": 0.17655496299266815, "loss/reg": 0.41513875126838684, "step": 1111 }, { "epoch": 0.01112, "grad_norm": 0.3948678970336914, "grad_norm_var": 0.005643191061064871, "learning_rate": 5e-05, "loss": 0.1737, "loss/crossentropy": 2.7425440549850464, "loss/hidden": 0.0, "loss/logits": 0.17369906604290009, "loss/reg": 0.41505956649780273, "step": 1112 }, { "epoch": 0.01113, "grad_norm": 0.42096200585365295, "grad_norm_var": 0.005398398826789974, "learning_rate": 5e-05, "loss": 0.1784, "loss/crossentropy": 2.8006120920181274, "loss/hidden": 0.0, "loss/logits": 0.17840884998440742, "loss/reg": 0.4150742292404175, "step": 1113 }, { "epoch": 0.01114, "grad_norm": 0.5609464049339294, "grad_norm_var": 0.00568123710904905, "learning_rate": 5e-05, "loss": 0.1989, "loss/crossentropy": 2.86473947763443, "loss/hidden": 0.0, "loss/logits": 0.19885210320353508, "loss/reg": 0.41486528515815735, "step": 1114 }, { "epoch": 0.01115, "grad_norm": 0.539185106754303, "grad_norm_var": 0.004425140498693129, "learning_rate": 5e-05, "loss": 0.2252, "loss/crossentropy": 2.655807375907898, "loss/hidden": 0.0, "loss/logits": 0.2251797616481781, "loss/reg": 0.4149053990840912, "step": 1115 }, { "epoch": 0.01116, "grad_norm": 0.4107459783554077, "grad_norm_var": 0.004633245063933785, "learning_rate": 5e-05, "loss": 0.1799, "loss/crossentropy": 2.8047462105751038, "loss/hidden": 0.0, "loss/logits": 0.1799166388809681, "loss/reg": 0.415025919675827, "step": 1116 }, { "epoch": 0.01117, "grad_norm": 0.4378933906555176, "grad_norm_var": 0.00467107704620191, "learning_rate": 5e-05, "loss": 0.2011, "loss/crossentropy": 2.609542429447174, "loss/hidden": 0.0, "loss/logits": 0.20113946497440338, "loss/reg": 0.41507139801979065, "step": 1117 }, { "epoch": 0.01118, "grad_norm": 0.39793679118156433, "grad_norm_var": 0.004003878691693575, "learning_rate": 5e-05, "loss": 0.1831, "loss/crossentropy": 2.786940336227417, "loss/hidden": 0.0, "loss/logits": 0.18305183202028275, "loss/reg": 0.4153191149234772, "step": 1118 }, { "epoch": 0.01119, "grad_norm": 0.4449251592159271, "grad_norm_var": 0.004027168840945982, "learning_rate": 5e-05, "loss": 0.1822, "loss/crossentropy": 2.8462105989456177, "loss/hidden": 0.0, "loss/logits": 0.18220242485404015, "loss/reg": 0.4148350656032562, "step": 1119 }, { "epoch": 0.0112, "grad_norm": 0.44335636496543884, "grad_norm_var": 0.0029974507082141445, "learning_rate": 5e-05, "loss": 0.1853, "loss/crossentropy": 2.6770092844963074, "loss/hidden": 0.0, "loss/logits": 0.18530552834272385, "loss/reg": 0.41451361775398254, "step": 1120 }, { "epoch": 0.01121, "grad_norm": 0.367841899394989, "grad_norm_var": 0.003450723918982194, "learning_rate": 5e-05, "loss": 0.1851, "loss/crossentropy": 2.755398452281952, "loss/hidden": 0.0, "loss/logits": 0.18507423251867294, "loss/reg": 0.4143860936164856, "step": 1121 }, { "epoch": 0.01122, "grad_norm": 0.41524001955986023, "grad_norm_var": 0.0035160112669585333, "learning_rate": 5e-05, "loss": 0.1907, "loss/crossentropy": 2.666959822177887, "loss/hidden": 0.0, "loss/logits": 0.19066079333424568, "loss/reg": 0.4141641855239868, "step": 1122 }, { "epoch": 0.01123, "grad_norm": 0.37930965423583984, "grad_norm_var": 0.0038405505392372975, "learning_rate": 5e-05, "loss": 0.1876, "loss/crossentropy": 2.7602951526641846, "loss/hidden": 0.0, "loss/logits": 0.18762869387865067, "loss/reg": 0.4140634834766388, "step": 1123 }, { "epoch": 0.01124, "grad_norm": 0.35346516966819763, "grad_norm_var": 0.004378123566987568, "learning_rate": 5e-05, "loss": 0.1752, "loss/crossentropy": 2.8690579533576965, "loss/hidden": 0.0, "loss/logits": 0.17516318708658218, "loss/reg": 0.4135870039463043, "step": 1124 }, { "epoch": 0.01125, "grad_norm": 0.38035848736763, "grad_norm_var": 0.00454120371634219, "learning_rate": 5e-05, "loss": 0.1821, "loss/crossentropy": 2.7827054262161255, "loss/hidden": 0.0, "loss/logits": 0.18210802599787712, "loss/reg": 0.4132291078567505, "step": 1125 }, { "epoch": 0.01126, "grad_norm": 0.4369087815284729, "grad_norm_var": 0.0031463231378213497, "learning_rate": 5e-05, "loss": 0.1999, "loss/crossentropy": 2.7080519795417786, "loss/hidden": 0.0, "loss/logits": 0.19993599876761436, "loss/reg": 0.41283077001571655, "step": 1126 }, { "epoch": 0.01127, "grad_norm": 0.44357267022132874, "grad_norm_var": 0.003151944528361961, "learning_rate": 5e-05, "loss": 0.2062, "loss/crossentropy": 2.962744176387787, "loss/hidden": 0.0, "loss/logits": 0.20617877319455147, "loss/reg": 0.41251081228256226, "step": 1127 }, { "epoch": 0.01128, "grad_norm": 0.3843441605567932, "grad_norm_var": 0.0032035597244117753, "learning_rate": 5e-05, "loss": 0.1867, "loss/crossentropy": 2.8469061255455017, "loss/hidden": 0.0, "loss/logits": 0.1866915337741375, "loss/reg": 0.4123833179473877, "step": 1128 }, { "epoch": 0.01129, "grad_norm": 0.40042608976364136, "grad_norm_var": 0.0032438818795351154, "learning_rate": 5e-05, "loss": 0.184, "loss/crossentropy": 2.798237144947052, "loss/hidden": 0.0, "loss/logits": 0.18402211740612984, "loss/reg": 0.411937952041626, "step": 1129 }, { "epoch": 0.0113, "grad_norm": 0.4115356206893921, "grad_norm_var": 0.0019264454803736878, "learning_rate": 5e-05, "loss": 0.1875, "loss/crossentropy": 2.8366461396217346, "loss/hidden": 0.0, "loss/logits": 0.18746977671980858, "loss/reg": 0.41167354583740234, "step": 1130 }, { "epoch": 0.01131, "grad_norm": 0.4022926688194275, "grad_norm_var": 0.000839036886700865, "learning_rate": 5e-05, "loss": 0.1775, "loss/crossentropy": 2.737312376499176, "loss/hidden": 0.0, "loss/logits": 0.177461426705122, "loss/reg": 0.4111967980861664, "step": 1131 }, { "epoch": 0.01132, "grad_norm": 0.39588743448257446, "grad_norm_var": 0.000845185393207019, "learning_rate": 5e-05, "loss": 0.1929, "loss/crossentropy": 2.807273268699646, "loss/hidden": 0.0, "loss/logits": 0.19291535764932632, "loss/reg": 0.4109109938144684, "step": 1132 }, { "epoch": 0.01133, "grad_norm": 0.38276413083076477, "grad_norm_var": 0.0008003788853753561, "learning_rate": 5e-05, "loss": 0.1893, "loss/crossentropy": 2.7367629408836365, "loss/hidden": 0.0, "loss/logits": 0.18933599442243576, "loss/reg": 0.4103023409843445, "step": 1133 }, { "epoch": 0.01134, "grad_norm": 0.36996695399284363, "grad_norm_var": 0.0008663294825055197, "learning_rate": 5e-05, "loss": 0.1744, "loss/crossentropy": 2.9220885038375854, "loss/hidden": 0.0, "loss/logits": 0.17444844171404839, "loss/reg": 0.40956220030784607, "step": 1134 }, { "epoch": 0.01135, "grad_norm": 0.42899391055107117, "grad_norm_var": 0.0007883828059192262, "learning_rate": 5e-05, "loss": 0.1977, "loss/crossentropy": 2.6999526619911194, "loss/hidden": 0.0, "loss/logits": 0.19769981503486633, "loss/reg": 0.40861764550209045, "step": 1135 }, { "epoch": 0.01136, "grad_norm": 0.3876810073852539, "grad_norm_var": 0.0006585327278961513, "learning_rate": 5e-05, "loss": 0.1774, "loss/crossentropy": 2.768545150756836, "loss/hidden": 0.0, "loss/logits": 0.17736775428056717, "loss/reg": 0.40815556049346924, "step": 1136 }, { "epoch": 0.01137, "grad_norm": 0.39652732014656067, "grad_norm_var": 0.0006011672378754292, "learning_rate": 5e-05, "loss": 0.1761, "loss/crossentropy": 2.7440211176872253, "loss/hidden": 0.0, "loss/logits": 0.17609849944710732, "loss/reg": 0.40747296810150146, "step": 1137 }, { "epoch": 0.01138, "grad_norm": 0.469338983297348, "grad_norm_var": 0.0009078670943102995, "learning_rate": 5e-05, "loss": 0.1802, "loss/crossentropy": 2.8522775173187256, "loss/hidden": 0.0, "loss/logits": 0.18017977103590965, "loss/reg": 0.4069644510746002, "step": 1138 }, { "epoch": 0.01139, "grad_norm": 0.3591986298561096, "grad_norm_var": 0.00099254309747554, "learning_rate": 5e-05, "loss": 0.1625, "loss/crossentropy": 2.769225299358368, "loss/hidden": 0.0, "loss/logits": 0.16251084208488464, "loss/reg": 0.40675848722457886, "step": 1139 }, { "epoch": 0.0114, "grad_norm": 0.46715036034584045, "grad_norm_var": 0.0010918467568498938, "learning_rate": 5e-05, "loss": 0.1853, "loss/crossentropy": 2.886795938014984, "loss/hidden": 0.0, "loss/logits": 0.18530793488025665, "loss/reg": 0.4064421057701111, "step": 1140 }, { "epoch": 0.01141, "grad_norm": 0.41398993134498596, "grad_norm_var": 0.001041686696320913, "learning_rate": 5e-05, "loss": 0.191, "loss/crossentropy": 2.77803373336792, "loss/hidden": 0.0, "loss/logits": 0.19097021967172623, "loss/reg": 0.4059886932373047, "step": 1141 }, { "epoch": 0.01142, "grad_norm": 0.45268771052360535, "grad_norm_var": 0.0011150986655177146, "learning_rate": 5e-05, "loss": 0.1885, "loss/crossentropy": 2.7433528900146484, "loss/hidden": 0.0, "loss/logits": 0.1884620599448681, "loss/reg": 0.40582218766212463, "step": 1142 }, { "epoch": 0.01143, "grad_norm": 0.4692053198814392, "grad_norm_var": 0.0012695460628068627, "learning_rate": 5e-05, "loss": 0.1842, "loss/crossentropy": 2.661261022090912, "loss/hidden": 0.0, "loss/logits": 0.1842181384563446, "loss/reg": 0.4053380787372589, "step": 1143 }, { "epoch": 0.01144, "grad_norm": 0.6225918531417847, "grad_norm_var": 0.003938662819607434, "learning_rate": 5e-05, "loss": 0.191, "loss/crossentropy": 2.6497309803962708, "loss/hidden": 0.0, "loss/logits": 0.1909896731376648, "loss/reg": 0.4048023223876953, "step": 1144 }, { "epoch": 0.01145, "grad_norm": 0.4070091247558594, "grad_norm_var": 0.003918143075677409, "learning_rate": 5e-05, "loss": 0.1843, "loss/crossentropy": 2.7601521611213684, "loss/hidden": 0.0, "loss/logits": 0.18431993201375008, "loss/reg": 0.4043135344982147, "step": 1145 }, { "epoch": 0.01146, "grad_norm": 0.39389288425445557, "grad_norm_var": 0.003974683863487331, "learning_rate": 5e-05, "loss": 0.1872, "loss/crossentropy": 2.871561825275421, "loss/hidden": 0.0, "loss/logits": 0.18724697455763817, "loss/reg": 0.4039885997772217, "step": 1146 }, { "epoch": 0.01147, "grad_norm": 0.4135102927684784, "grad_norm_var": 0.003946792798648805, "learning_rate": 5e-05, "loss": 0.1855, "loss/crossentropy": 2.921668767929077, "loss/hidden": 0.0, "loss/logits": 0.18548932299017906, "loss/reg": 0.40349310636520386, "step": 1147 }, { "epoch": 0.01148, "grad_norm": 0.38799649477005005, "grad_norm_var": 0.003983313313333788, "learning_rate": 5e-05, "loss": 0.18, "loss/crossentropy": 2.86094868183136, "loss/hidden": 0.0, "loss/logits": 0.18000195547938347, "loss/reg": 0.40314486622810364, "step": 1148 }, { "epoch": 0.01149, "grad_norm": 0.36842799186706543, "grad_norm_var": 0.004079580469365082, "learning_rate": 5e-05, "loss": 0.1798, "loss/crossentropy": 2.7568915486335754, "loss/hidden": 0.0, "loss/logits": 0.17976509407162666, "loss/reg": 0.4026564061641693, "step": 1149 }, { "epoch": 0.0115, "grad_norm": 0.4031458795070648, "grad_norm_var": 0.003902666135315987, "learning_rate": 5e-05, "loss": 0.1809, "loss/crossentropy": 2.6163597106933594, "loss/hidden": 0.0, "loss/logits": 0.18089350685477257, "loss/reg": 0.40189653635025024, "step": 1150 }, { "epoch": 0.01151, "grad_norm": 0.48123985528945923, "grad_norm_var": 0.00408308856268755, "learning_rate": 5e-05, "loss": 0.1945, "loss/crossentropy": 2.694430708885193, "loss/hidden": 0.0, "loss/logits": 0.194487065076828, "loss/reg": 0.40127527713775635, "step": 1151 }, { "epoch": 0.01152, "grad_norm": 0.39554059505462646, "grad_norm_var": 0.004041711068257566, "learning_rate": 5e-05, "loss": 0.1856, "loss/crossentropy": 2.6914899945259094, "loss/hidden": 0.0, "loss/logits": 0.18563660606741905, "loss/reg": 0.4006398320198059, "step": 1152 }, { "epoch": 0.01153, "grad_norm": 0.39719343185424805, "grad_norm_var": 0.004038646841903418, "learning_rate": 5e-05, "loss": 0.1864, "loss/crossentropy": 2.738409399986267, "loss/hidden": 0.0, "loss/logits": 0.1863783597946167, "loss/reg": 0.40022245049476624, "step": 1153 }, { "epoch": 0.01154, "grad_norm": 0.39263221621513367, "grad_norm_var": 0.004018189376521504, "learning_rate": 5e-05, "loss": 0.1792, "loss/crossentropy": 2.8037460446357727, "loss/hidden": 0.0, "loss/logits": 0.17917369306087494, "loss/reg": 0.39995551109313965, "step": 1154 }, { "epoch": 0.01155, "grad_norm": 0.47140011191368103, "grad_norm_var": 0.0037968500941630884, "learning_rate": 5e-05, "loss": 0.187, "loss/crossentropy": 2.8112247586250305, "loss/hidden": 0.0, "loss/logits": 0.18697210773825645, "loss/reg": 0.3995610177516937, "step": 1155 }, { "epoch": 0.01156, "grad_norm": 0.3979388177394867, "grad_norm_var": 0.003786638425626813, "learning_rate": 5e-05, "loss": 0.1836, "loss/crossentropy": 2.677973210811615, "loss/hidden": 0.0, "loss/logits": 0.18362467736005783, "loss/reg": 0.3991345465183258, "step": 1156 }, { "epoch": 0.01157, "grad_norm": 0.4398776590824127, "grad_norm_var": 0.0037757643608094527, "learning_rate": 5e-05, "loss": 0.2019, "loss/crossentropy": 2.7093088030815125, "loss/hidden": 0.0, "loss/logits": 0.201858002692461, "loss/reg": 0.39899328351020813, "step": 1157 }, { "epoch": 0.01158, "grad_norm": 0.372996062040329, "grad_norm_var": 0.003941107420197964, "learning_rate": 5e-05, "loss": 0.1763, "loss/crossentropy": 2.7619176506996155, "loss/hidden": 0.0, "loss/logits": 0.17626621574163437, "loss/reg": 0.39851176738739014, "step": 1158 }, { "epoch": 0.01159, "grad_norm": 0.351380318403244, "grad_norm_var": 0.004128646512850749, "learning_rate": 5e-05, "loss": 0.1725, "loss/crossentropy": 2.6536864042282104, "loss/hidden": 0.0, "loss/logits": 0.17249062657356262, "loss/reg": 0.397915780544281, "step": 1159 }, { "epoch": 0.0116, "grad_norm": 0.3482607305049896, "grad_norm_var": 0.0013688465187175407, "learning_rate": 5e-05, "loss": 0.1695, "loss/crossentropy": 2.8032846450805664, "loss/hidden": 0.0, "loss/logits": 0.16954632103443146, "loss/reg": 0.3972340524196625, "step": 1160 }, { "epoch": 0.01161, "grad_norm": 0.40863949060440063, "grad_norm_var": 0.0013702313959502537, "learning_rate": 5e-05, "loss": 0.1828, "loss/crossentropy": 2.7999401092529297, "loss/hidden": 0.0, "loss/logits": 0.18275900185108185, "loss/reg": 0.3963891267776489, "step": 1161 }, { "epoch": 0.01162, "grad_norm": 0.4455009996891022, "grad_norm_var": 0.001484317234321312, "learning_rate": 5e-05, "loss": 0.1765, "loss/crossentropy": 2.7998053431510925, "loss/hidden": 0.0, "loss/logits": 0.1765194609761238, "loss/reg": 0.3953721821308136, "step": 1162 }, { "epoch": 0.01163, "grad_norm": 0.4613651931285858, "grad_norm_var": 0.0016834715793174009, "learning_rate": 5e-05, "loss": 0.1974, "loss/crossentropy": 2.723604142665863, "loss/hidden": 0.0, "loss/logits": 0.1973644755780697, "loss/reg": 0.3948933184146881, "step": 1163 }, { "epoch": 0.01164, "grad_norm": 0.41313761472702026, "grad_norm_var": 0.0016568568688326716, "learning_rate": 5e-05, "loss": 0.1833, "loss/crossentropy": 2.760394811630249, "loss/hidden": 0.0, "loss/logits": 0.18329964950680733, "loss/reg": 0.3941228687763214, "step": 1164 }, { "epoch": 0.01165, "grad_norm": 0.3661276400089264, "grad_norm_var": 0.0016697212364988705, "learning_rate": 5e-05, "loss": 0.174, "loss/crossentropy": 2.7048903703689575, "loss/hidden": 0.0, "loss/logits": 0.17398762330412865, "loss/reg": 0.3934180736541748, "step": 1165 }, { "epoch": 0.01166, "grad_norm": 0.4267931878566742, "grad_norm_var": 0.0016857447056405092, "learning_rate": 5e-05, "loss": 0.1811, "loss/crossentropy": 2.967951714992523, "loss/hidden": 0.0, "loss/logits": 0.1810765042901039, "loss/reg": 0.39277544617652893, "step": 1166 }, { "epoch": 0.01167, "grad_norm": 0.458570271730423, "grad_norm_var": 0.0015044273530598082, "learning_rate": 5e-05, "loss": 0.1976, "loss/crossentropy": 2.8066805005073547, "loss/hidden": 0.0, "loss/logits": 0.19757380709052086, "loss/reg": 0.39235779643058777, "step": 1167 }, { "epoch": 0.01168, "grad_norm": 0.4076763093471527, "grad_norm_var": 0.001491514248929789, "learning_rate": 5e-05, "loss": 0.1831, "loss/crossentropy": 2.594843864440918, "loss/hidden": 0.0, "loss/logits": 0.18307068943977356, "loss/reg": 0.3921445608139038, "step": 1168 }, { "epoch": 0.01169, "grad_norm": 0.4662540853023529, "grad_norm_var": 0.0016719695957838918, "learning_rate": 5e-05, "loss": 0.1979, "loss/crossentropy": 2.8613404631614685, "loss/hidden": 0.0, "loss/logits": 0.1979266256093979, "loss/reg": 0.39188241958618164, "step": 1169 }, { "epoch": 0.0117, "grad_norm": 0.3880429267883301, "grad_norm_var": 0.0016865350412459735, "learning_rate": 5e-05, "loss": 0.1799, "loss/crossentropy": 2.8231483697891235, "loss/hidden": 0.0, "loss/logits": 0.17990567535161972, "loss/reg": 0.39120861887931824, "step": 1170 }, { "epoch": 0.01171, "grad_norm": 0.41181349754333496, "grad_norm_var": 0.0014523890607224033, "learning_rate": 5e-05, "loss": 0.1879, "loss/crossentropy": 2.766329288482666, "loss/hidden": 0.0, "loss/logits": 0.18785357475280762, "loss/reg": 0.3904155194759369, "step": 1171 }, { "epoch": 0.01172, "grad_norm": 0.37270015478134155, "grad_norm_var": 0.0015337088094620754, "learning_rate": 5e-05, "loss": 0.1762, "loss/crossentropy": 2.772739887237549, "loss/hidden": 0.0, "loss/logits": 0.17615697532892227, "loss/reg": 0.390214741230011, "step": 1172 }, { "epoch": 0.01173, "grad_norm": 0.4047106206417084, "grad_norm_var": 0.0014647950075048304, "learning_rate": 5e-05, "loss": 0.1809, "loss/crossentropy": 2.711849868297577, "loss/hidden": 0.0, "loss/logits": 0.18094630539417267, "loss/reg": 0.3900716006755829, "step": 1173 }, { "epoch": 0.01174, "grad_norm": 0.373100608587265, "grad_norm_var": 0.001464328688076988, "learning_rate": 5e-05, "loss": 0.1682, "loss/crossentropy": 2.8127720952033997, "loss/hidden": 0.0, "loss/logits": 0.16817164048552513, "loss/reg": 0.3899447023868561, "step": 1174 }, { "epoch": 0.01175, "grad_norm": 0.3846966326236725, "grad_norm_var": 0.0012888305056238836, "learning_rate": 5e-05, "loss": 0.1699, "loss/crossentropy": 2.8145005106925964, "loss/hidden": 0.0, "loss/logits": 0.16986710205674171, "loss/reg": 0.38988345861434937, "step": 1175 }, { "epoch": 0.01176, "grad_norm": 0.4174414873123169, "grad_norm_var": 0.001031499651791871, "learning_rate": 5e-05, "loss": 0.1788, "loss/crossentropy": 2.7425543665885925, "loss/hidden": 0.0, "loss/logits": 0.17875215783715248, "loss/reg": 0.38948482275009155, "step": 1176 }, { "epoch": 0.01177, "grad_norm": 0.4087185263633728, "grad_norm_var": 0.0010314550320864293, "learning_rate": 5e-05, "loss": 0.1922, "loss/crossentropy": 2.625263512134552, "loss/hidden": 0.0, "loss/logits": 0.19215380772948265, "loss/reg": 0.38940492272377014, "step": 1177 }, { "epoch": 0.01178, "grad_norm": 0.40784165263175964, "grad_norm_var": 0.0009564749156922255, "learning_rate": 5e-05, "loss": 0.1885, "loss/crossentropy": 2.840769410133362, "loss/hidden": 0.0, "loss/logits": 0.1885039024055004, "loss/reg": 0.3891991972923279, "step": 1178 }, { "epoch": 0.01179, "grad_norm": 0.4150845408439636, "grad_norm_var": 0.0007768489869076966, "learning_rate": 5e-05, "loss": 0.1859, "loss/crossentropy": 2.80219566822052, "loss/hidden": 0.0, "loss/logits": 0.18589085713028908, "loss/reg": 0.3889690041542053, "step": 1179 }, { "epoch": 0.0118, "grad_norm": 0.4257360100746155, "grad_norm_var": 0.0007959544580349568, "learning_rate": 5e-05, "loss": 0.1895, "loss/crossentropy": 2.7795663475990295, "loss/hidden": 0.0, "loss/logits": 0.18950633704662323, "loss/reg": 0.38860997557640076, "step": 1180 }, { "epoch": 0.01181, "grad_norm": 0.40696948766708374, "grad_norm_var": 0.0006697013970932956, "learning_rate": 5e-05, "loss": 0.1756, "loss/crossentropy": 2.91402405500412, "loss/hidden": 0.0, "loss/logits": 0.17562247812747955, "loss/reg": 0.38857001066207886, "step": 1181 }, { "epoch": 0.01182, "grad_norm": 0.4075798988342285, "grad_norm_var": 0.0006523387740276202, "learning_rate": 5e-05, "loss": 0.1932, "loss/crossentropy": 2.8397974967956543, "loss/hidden": 0.0, "loss/logits": 0.1931845024228096, "loss/reg": 0.3886772692203522, "step": 1182 }, { "epoch": 0.01183, "grad_norm": 0.43371984362602234, "grad_norm_var": 0.000529368620324914, "learning_rate": 5e-05, "loss": 0.1876, "loss/crossentropy": 2.8052870631217957, "loss/hidden": 0.0, "loss/logits": 0.18758996576070786, "loss/reg": 0.388506680727005, "step": 1183 }, { "epoch": 0.01184, "grad_norm": 0.43078315258026123, "grad_norm_var": 0.0005609549030315335, "learning_rate": 5e-05, "loss": 0.1717, "loss/crossentropy": 2.781274437904358, "loss/hidden": 0.0, "loss/logits": 0.17167261615395546, "loss/reg": 0.38853806257247925, "step": 1184 }, { "epoch": 0.01185, "grad_norm": 0.3827318251132965, "grad_norm_var": 0.00036714477844779957, "learning_rate": 5e-05, "loss": 0.1867, "loss/crossentropy": 2.6675002574920654, "loss/hidden": 0.0, "loss/logits": 0.18670028448104858, "loss/reg": 0.3887313902378082, "step": 1185 }, { "epoch": 0.01186, "grad_norm": 0.40798628330230713, "grad_norm_var": 0.000348296833468531, "learning_rate": 5e-05, "loss": 0.18, "loss/crossentropy": 2.7974329590797424, "loss/hidden": 0.0, "loss/logits": 0.18003053218126297, "loss/reg": 0.388660192489624, "step": 1186 }, { "epoch": 0.01187, "grad_norm": 0.3918606638908386, "grad_norm_var": 0.00035698371764874003, "learning_rate": 5e-05, "loss": 0.1797, "loss/crossentropy": 2.8322020173072815, "loss/hidden": 0.0, "loss/logits": 0.17968391627073288, "loss/reg": 0.3883598744869232, "step": 1187 }, { "epoch": 0.01188, "grad_norm": 0.38590681552886963, "grad_norm_var": 0.00031192600765699295, "learning_rate": 5e-05, "loss": 0.1728, "loss/crossentropy": 2.764487147331238, "loss/hidden": 0.0, "loss/logits": 0.17282449826598167, "loss/reg": 0.3882608711719513, "step": 1188 }, { "epoch": 0.01189, "grad_norm": 0.3604799807071686, "grad_norm_var": 0.0004376987511141165, "learning_rate": 5e-05, "loss": 0.1747, "loss/crossentropy": 2.8978432416915894, "loss/hidden": 0.0, "loss/logits": 0.17470663413405418, "loss/reg": 0.38851919770240784, "step": 1189 }, { "epoch": 0.0119, "grad_norm": 0.5136921405792236, "grad_norm_var": 0.0011212182465380693, "learning_rate": 5e-05, "loss": 0.1767, "loss/crossentropy": 2.717037796974182, "loss/hidden": 0.0, "loss/logits": 0.1767074540257454, "loss/reg": 0.3881848156452179, "step": 1190 }, { "epoch": 0.01191, "grad_norm": 0.3986297845840454, "grad_norm_var": 0.0010838792361384335, "learning_rate": 5e-05, "loss": 0.176, "loss/crossentropy": 2.81276398897171, "loss/hidden": 0.0, "loss/logits": 0.17595234140753746, "loss/reg": 0.38802841305732727, "step": 1191 }, { "epoch": 0.01192, "grad_norm": 0.4058785140514374, "grad_norm_var": 0.0010841510234490279, "learning_rate": 5e-05, "loss": 0.1803, "loss/crossentropy": 2.8653363585472107, "loss/hidden": 0.0, "loss/logits": 0.18028268590569496, "loss/reg": 0.38790231943130493, "step": 1192 }, { "epoch": 0.01193, "grad_norm": 0.3906726539134979, "grad_norm_var": 0.0011111366319204396, "learning_rate": 5e-05, "loss": 0.1811, "loss/crossentropy": 2.707718312740326, "loss/hidden": 0.0, "loss/logits": 0.1810658946633339, "loss/reg": 0.38783982396125793, "step": 1193 }, { "epoch": 0.01194, "grad_norm": 0.4029076099395752, "grad_norm_var": 0.0011143064305952207, "learning_rate": 5e-05, "loss": 0.1696, "loss/crossentropy": 2.769304037094116, "loss/hidden": 0.0, "loss/logits": 0.16964492574334145, "loss/reg": 0.3873360753059387, "step": 1194 }, { "epoch": 0.01195, "grad_norm": 0.4529467225074768, "grad_norm_var": 0.0012293758513203283, "learning_rate": 5e-05, "loss": 0.1896, "loss/crossentropy": 2.7121686935424805, "loss/hidden": 0.0, "loss/logits": 0.18959151208400726, "loss/reg": 0.38752281665802, "step": 1195 }, { "epoch": 0.01196, "grad_norm": 0.6221795678138733, "grad_norm_var": 0.003990425181644611, "learning_rate": 5e-05, "loss": 0.1927, "loss/crossentropy": 2.8244996070861816, "loss/hidden": 0.0, "loss/logits": 0.1927010491490364, "loss/reg": 0.3871219754219055, "step": 1196 }, { "epoch": 0.01197, "grad_norm": 0.4106963574886322, "grad_norm_var": 0.003982491246679428, "learning_rate": 5e-05, "loss": 0.1826, "loss/crossentropy": 2.697797477245331, "loss/hidden": 0.0, "loss/logits": 0.18256081640720367, "loss/reg": 0.38696321845054626, "step": 1197 }, { "epoch": 0.01198, "grad_norm": 0.4185578227043152, "grad_norm_var": 0.0039646485521913485, "learning_rate": 5e-05, "loss": 0.1903, "loss/crossentropy": 2.9091604948043823, "loss/hidden": 0.0, "loss/logits": 0.19026786088943481, "loss/reg": 0.3870019018650055, "step": 1198 }, { "epoch": 0.01199, "grad_norm": 0.41027286648750305, "grad_norm_var": 0.003973629637150407, "learning_rate": 5e-05, "loss": 0.1806, "loss/crossentropy": 2.6376752257347107, "loss/hidden": 0.0, "loss/logits": 0.18055420368909836, "loss/reg": 0.38691246509552, "step": 1199 }, { "epoch": 0.012, "grad_norm": 0.36383622884750366, "grad_norm_var": 0.004194417382578175, "learning_rate": 5e-05, "loss": 0.1673, "loss/crossentropy": 2.78502094745636, "loss/hidden": 0.0, "loss/logits": 0.1673405058681965, "loss/reg": 0.3871373236179352, "step": 1200 }, { "epoch": 0.01201, "grad_norm": 0.40081387758255005, "grad_norm_var": 0.004125116222483612, "learning_rate": 5e-05, "loss": 0.1802, "loss/crossentropy": 2.6785311102867126, "loss/hidden": 0.0, "loss/logits": 0.18023326620459557, "loss/reg": 0.3876773416996002, "step": 1201 }, { "epoch": 0.01202, "grad_norm": 0.3561922609806061, "grad_norm_var": 0.004383219873836985, "learning_rate": 5e-05, "loss": 0.167, "loss/crossentropy": 2.7064093351364136, "loss/hidden": 0.0, "loss/logits": 0.16697902977466583, "loss/reg": 0.3876340091228485, "step": 1202 }, { "epoch": 0.01203, "grad_norm": 0.3888217806816101, "grad_norm_var": 0.004394325595838507, "learning_rate": 5e-05, "loss": 0.1781, "loss/crossentropy": 2.7068673968315125, "loss/hidden": 0.0, "loss/logits": 0.17814315855503082, "loss/reg": 0.3875603675842285, "step": 1203 }, { "epoch": 0.01204, "grad_norm": 0.3849945664405823, "grad_norm_var": 0.004398239279775363, "learning_rate": 5e-05, "loss": 0.1848, "loss/crossentropy": 2.6956737637519836, "loss/hidden": 0.0, "loss/logits": 0.18480542674660683, "loss/reg": 0.3876267373561859, "step": 1204 }, { "epoch": 0.01205, "grad_norm": 0.4758051633834839, "grad_norm_var": 0.0043511922043554885, "learning_rate": 5e-05, "loss": 0.2061, "loss/crossentropy": 2.730830192565918, "loss/hidden": 0.0, "loss/logits": 0.20607834309339523, "loss/reg": 0.3874613642692566, "step": 1205 }, { "epoch": 0.01206, "grad_norm": 0.45632532238960266, "grad_norm_var": 0.003876995601762075, "learning_rate": 5e-05, "loss": 0.2019, "loss/crossentropy": 2.6466459035873413, "loss/hidden": 0.0, "loss/logits": 0.20192208513617516, "loss/reg": 0.38726064562797546, "step": 1206 }, { "epoch": 0.01207, "grad_norm": 0.4213567078113556, "grad_norm_var": 0.00384082141899527, "learning_rate": 5e-05, "loss": 0.1783, "loss/crossentropy": 2.737539827823639, "loss/hidden": 0.0, "loss/logits": 0.17825954034924507, "loss/reg": 0.38715770840644836, "step": 1207 }, { "epoch": 0.01208, "grad_norm": 0.3949061632156372, "grad_norm_var": 0.003872869317713009, "learning_rate": 5e-05, "loss": 0.1725, "loss/crossentropy": 2.861967980861664, "loss/hidden": 0.0, "loss/logits": 0.17246368899941444, "loss/reg": 0.38717973232269287, "step": 1208 }, { "epoch": 0.01209, "grad_norm": 0.3880098760128021, "grad_norm_var": 0.0038844189846908963, "learning_rate": 5e-05, "loss": 0.167, "loss/crossentropy": 2.7437891960144043, "loss/hidden": 0.0, "loss/logits": 0.1669784113764763, "loss/reg": 0.38724619150161743, "step": 1209 }, { "epoch": 0.0121, "grad_norm": 0.41461509466171265, "grad_norm_var": 0.003863511856052293, "learning_rate": 5e-05, "loss": 0.1821, "loss/crossentropy": 2.846351683139801, "loss/hidden": 0.0, "loss/logits": 0.1820857785642147, "loss/reg": 0.3870801031589508, "step": 1210 }, { "epoch": 0.01211, "grad_norm": 0.4895438849925995, "grad_norm_var": 0.004095689103905331, "learning_rate": 5e-05, "loss": 0.1952, "loss/crossentropy": 2.8813014030456543, "loss/hidden": 0.0, "loss/logits": 0.19519123807549477, "loss/reg": 0.38662147521972656, "step": 1211 }, { "epoch": 0.01212, "grad_norm": 0.40294018387794495, "grad_norm_var": 0.0013302580251414841, "learning_rate": 5e-05, "loss": 0.182, "loss/crossentropy": 2.6987224817276, "loss/hidden": 0.0, "loss/logits": 0.18202592432498932, "loss/reg": 0.38647136092185974, "step": 1212 }, { "epoch": 0.01213, "grad_norm": 0.5841467976570129, "grad_norm_var": 0.0032011116205743445, "learning_rate": 5e-05, "loss": 0.2147, "loss/crossentropy": 2.8024128079414368, "loss/hidden": 0.0, "loss/logits": 0.21470030397176743, "loss/reg": 0.3860331177711487, "step": 1213 }, { "epoch": 0.01214, "grad_norm": 0.3863331377506256, "grad_norm_var": 0.0032805719393109575, "learning_rate": 5e-05, "loss": 0.1741, "loss/crossentropy": 2.7637922167778015, "loss/hidden": 0.0, "loss/logits": 0.1740952469408512, "loss/reg": 0.38549575209617615, "step": 1214 }, { "epoch": 0.01215, "grad_norm": 0.38788142800331116, "grad_norm_var": 0.0033407459144098556, "learning_rate": 5e-05, "loss": 0.1742, "loss/crossentropy": 2.726113021373749, "loss/hidden": 0.0, "loss/logits": 0.17422878369688988, "loss/reg": 0.3851011395454407, "step": 1215 }, { "epoch": 0.01216, "grad_norm": 0.3931741416454315, "grad_norm_var": 0.003180583288822088, "learning_rate": 5e-05, "loss": 0.1721, "loss/crossentropy": 2.77315354347229, "loss/hidden": 0.0, "loss/logits": 0.17214053496718407, "loss/reg": 0.3845239281654358, "step": 1216 }, { "epoch": 0.01217, "grad_norm": 0.40179377794265747, "grad_norm_var": 0.0031780887148862252, "learning_rate": 5e-05, "loss": 0.1815, "loss/crossentropy": 2.643063724040985, "loss/hidden": 0.0, "loss/logits": 0.18153265118598938, "loss/reg": 0.3844316303730011, "step": 1217 }, { "epoch": 0.01218, "grad_norm": 0.4170065224170685, "grad_norm_var": 0.002888381152953286, "learning_rate": 5e-05, "loss": 0.1871, "loss/crossentropy": 2.6516295075416565, "loss/hidden": 0.0, "loss/logits": 0.18707049265503883, "loss/reg": 0.38388311862945557, "step": 1218 }, { "epoch": 0.01219, "grad_norm": 0.4198005497455597, "grad_norm_var": 0.0028021142417196405, "learning_rate": 5e-05, "loss": 0.1817, "loss/crossentropy": 2.817598342895508, "loss/hidden": 0.0, "loss/logits": 0.18168970569968224, "loss/reg": 0.3834301829338074, "step": 1219 }, { "epoch": 0.0122, "grad_norm": 0.422403484582901, "grad_norm_var": 0.0026842283382210656, "learning_rate": 5e-05, "loss": 0.1749, "loss/crossentropy": 2.7150434255599976, "loss/hidden": 0.0, "loss/logits": 0.1749335676431656, "loss/reg": 0.38268253207206726, "step": 1220 }, { "epoch": 0.01221, "grad_norm": 0.4099593758583069, "grad_norm_var": 0.0025399179822859553, "learning_rate": 5e-05, "loss": 0.1898, "loss/crossentropy": 2.6408931016921997, "loss/hidden": 0.0, "loss/logits": 0.18979423120617867, "loss/reg": 0.3826306164264679, "step": 1221 }, { "epoch": 0.01222, "grad_norm": 0.3767073154449463, "grad_norm_var": 0.00259706138002412, "learning_rate": 5e-05, "loss": 0.1701, "loss/crossentropy": 2.6918176412582397, "loss/hidden": 0.0, "loss/logits": 0.17012697085738182, "loss/reg": 0.38235336542129517, "step": 1222 }, { "epoch": 0.01223, "grad_norm": 0.3730328381061554, "grad_norm_var": 0.00273047558644907, "learning_rate": 5e-05, "loss": 0.1706, "loss/crossentropy": 2.7631287574768066, "loss/hidden": 0.0, "loss/logits": 0.17058201506733894, "loss/reg": 0.3822631537914276, "step": 1223 }, { "epoch": 0.01224, "grad_norm": 0.42401471734046936, "grad_norm_var": 0.002700047006810513, "learning_rate": 5e-05, "loss": 0.1808, "loss/crossentropy": 2.7122631669044495, "loss/hidden": 0.0, "loss/logits": 0.18083498626947403, "loss/reg": 0.38199344277381897, "step": 1224 }, { "epoch": 0.01225, "grad_norm": 0.394697368144989, "grad_norm_var": 0.0026759135798303762, "learning_rate": 5e-05, "loss": 0.1786, "loss/crossentropy": 2.876702666282654, "loss/hidden": 0.0, "loss/logits": 0.17858517542481422, "loss/reg": 0.38171741366386414, "step": 1225 }, { "epoch": 0.01226, "grad_norm": 0.37036746740341187, "grad_norm_var": 0.0028219551995546156, "learning_rate": 5e-05, "loss": 0.1718, "loss/crossentropy": 2.860628366470337, "loss/hidden": 0.0, "loss/logits": 0.17175961285829544, "loss/reg": 0.381732702255249, "step": 1226 }, { "epoch": 0.01227, "grad_norm": 0.3923279047012329, "grad_norm_var": 0.0024575740562559844, "learning_rate": 5e-05, "loss": 0.1833, "loss/crossentropy": 2.796630024909973, "loss/hidden": 0.0, "loss/logits": 0.183272372931242, "loss/reg": 0.3814437985420227, "step": 1227 }, { "epoch": 0.01228, "grad_norm": 0.38148361444473267, "grad_norm_var": 0.0025059350787742427, "learning_rate": 5e-05, "loss": 0.184, "loss/crossentropy": 2.782733201980591, "loss/hidden": 0.0, "loss/logits": 0.18401479348540306, "loss/reg": 0.381173312664032, "step": 1228 }, { "epoch": 0.01229, "grad_norm": 0.591835081577301, "grad_norm_var": 0.002689741474095427, "learning_rate": 5e-05, "loss": 0.1996, "loss/crossentropy": 2.6582566499710083, "loss/hidden": 0.0, "loss/logits": 0.1995810680091381, "loss/reg": 0.3809204697608948, "step": 1229 }, { "epoch": 0.0123, "grad_norm": 0.5596776604652405, "grad_norm_var": 0.004045577903319997, "learning_rate": 5e-05, "loss": 0.192, "loss/crossentropy": 2.790588676929474, "loss/hidden": 0.0, "loss/logits": 0.19195779785513878, "loss/reg": 0.3802122473716736, "step": 1230 }, { "epoch": 0.01231, "grad_norm": 0.4448102116584778, "grad_norm_var": 0.004006157319268411, "learning_rate": 5e-05, "loss": 0.1977, "loss/crossentropy": 2.7936501502990723, "loss/hidden": 0.0, "loss/logits": 0.19769395142793655, "loss/reg": 0.37915104627609253, "step": 1231 }, { "epoch": 0.01232, "grad_norm": 0.9720402359962463, "grad_norm_var": 0.022622442397275424, "learning_rate": 5e-05, "loss": 0.2495, "loss/crossentropy": 2.8077313899993896, "loss/hidden": 0.0, "loss/logits": 0.24949994310736656, "loss/reg": 0.378388911485672, "step": 1232 }, { "epoch": 0.01233, "grad_norm": 0.390862375497818, "grad_norm_var": 0.02271401504679423, "learning_rate": 5e-05, "loss": 0.1747, "loss/crossentropy": 2.8020628690719604, "loss/hidden": 0.0, "loss/logits": 0.17465639486908913, "loss/reg": 0.37808799743652344, "step": 1233 }, { "epoch": 0.01234, "grad_norm": 0.37001246213912964, "grad_norm_var": 0.023114004135869854, "learning_rate": 5e-05, "loss": 0.1705, "loss/crossentropy": 2.6619579195976257, "loss/hidden": 0.0, "loss/logits": 0.1704522706568241, "loss/reg": 0.3775133192539215, "step": 1234 }, { "epoch": 0.01235, "grad_norm": 0.3791680634021759, "grad_norm_var": 0.02341264191085755, "learning_rate": 5e-05, "loss": 0.172, "loss/crossentropy": 2.6803325414657593, "loss/hidden": 0.0, "loss/logits": 0.17197002843022346, "loss/reg": 0.3774578869342804, "step": 1235 }, { "epoch": 0.01236, "grad_norm": 0.3780006468296051, "grad_norm_var": 0.023719008801981453, "learning_rate": 5e-05, "loss": 0.1688, "loss/crossentropy": 2.8415806889533997, "loss/hidden": 0.0, "loss/logits": 0.1687580980360508, "loss/reg": 0.377366840839386, "step": 1236 }, { "epoch": 0.01237, "grad_norm": 0.5851230025291443, "grad_norm_var": 0.0246883641291566, "learning_rate": 5e-05, "loss": 0.2042, "loss/crossentropy": 2.771979033946991, "loss/hidden": 0.0, "loss/logits": 0.20417256653308868, "loss/reg": 0.37733784317970276, "step": 1237 }, { "epoch": 0.01238, "grad_norm": 0.4819883406162262, "grad_norm_var": 0.024190704030946187, "learning_rate": 5e-05, "loss": 0.1834, "loss/crossentropy": 2.8223971724510193, "loss/hidden": 0.0, "loss/logits": 0.18336526677012444, "loss/reg": 0.37682974338531494, "step": 1238 }, { "epoch": 0.01239, "grad_norm": 0.3840429484844208, "grad_norm_var": 0.024058734943816384, "learning_rate": 5e-05, "loss": 0.1828, "loss/crossentropy": 2.911860704421997, "loss/hidden": 0.0, "loss/logits": 0.1827603615820408, "loss/reg": 0.3761584162712097, "step": 1239 }, { "epoch": 0.0124, "grad_norm": 0.43960681557655334, "grad_norm_var": 0.023980868539642927, "learning_rate": 5e-05, "loss": 0.1926, "loss/crossentropy": 2.68120676279068, "loss/hidden": 0.0, "loss/logits": 0.19256151095032692, "loss/reg": 0.37579765915870667, "step": 1240 }, { "epoch": 0.01241, "grad_norm": 0.4627563953399658, "grad_norm_var": 0.023589277538794147, "learning_rate": 5e-05, "loss": 0.1897, "loss/crossentropy": 2.613134026527405, "loss/hidden": 0.0, "loss/logits": 0.18966776132583618, "loss/reg": 0.37544333934783936, "step": 1241 }, { "epoch": 0.01242, "grad_norm": 0.36657577753067017, "grad_norm_var": 0.023642571680739976, "learning_rate": 5e-05, "loss": 0.1721, "loss/crossentropy": 2.7777265310287476, "loss/hidden": 0.0, "loss/logits": 0.17212800681591034, "loss/reg": 0.3750650882720947, "step": 1242 }, { "epoch": 0.01243, "grad_norm": 0.43940117955207825, "grad_norm_var": 0.02326990217446575, "learning_rate": 5e-05, "loss": 0.1944, "loss/crossentropy": 2.7595353722572327, "loss/hidden": 0.0, "loss/logits": 0.1943615898489952, "loss/reg": 0.3749193251132965, "step": 1243 }, { "epoch": 0.01244, "grad_norm": 0.4149992763996124, "grad_norm_var": 0.02291455808975105, "learning_rate": 5e-05, "loss": 0.1787, "loss/crossentropy": 2.6648385524749756, "loss/hidden": 0.0, "loss/logits": 0.17874496802687645, "loss/reg": 0.3745426833629608, "step": 1244 }, { "epoch": 0.01245, "grad_norm": 0.395551860332489, "grad_norm_var": 0.022364414377559427, "learning_rate": 5e-05, "loss": 0.1827, "loss/crossentropy": 2.6421396732330322, "loss/hidden": 0.0, "loss/logits": 0.18270908296108246, "loss/reg": 0.37424394488334656, "step": 1245 }, { "epoch": 0.01246, "grad_norm": 0.42351123690605164, "grad_norm_var": 0.021832256547001418, "learning_rate": 5e-05, "loss": 0.1766, "loss/crossentropy": 2.8389651775360107, "loss/hidden": 0.0, "loss/logits": 0.17664410173892975, "loss/reg": 0.37399137020111084, "step": 1246 }, { "epoch": 0.01247, "grad_norm": 0.4203534722328186, "grad_norm_var": 0.021912742247351746, "learning_rate": 5e-05, "loss": 0.187, "loss/crossentropy": 2.7708972692489624, "loss/hidden": 0.0, "loss/logits": 0.18704869225621223, "loss/reg": 0.3737372159957886, "step": 1247 }, { "epoch": 0.01248, "grad_norm": 0.6355656385421753, "grad_norm_var": 0.0058598470611779365, "learning_rate": 5e-05, "loss": 0.2194, "loss/crossentropy": 2.764471113681793, "loss/hidden": 0.0, "loss/logits": 0.21944145113229752, "loss/reg": 0.3731713891029358, "step": 1248 }, { "epoch": 0.01249, "grad_norm": 0.4484435021877289, "grad_norm_var": 0.005724597135704601, "learning_rate": 5e-05, "loss": 0.1903, "loss/crossentropy": 2.7338152527809143, "loss/hidden": 0.0, "loss/logits": 0.19031883776187897, "loss/reg": 0.37268662452697754, "step": 1249 }, { "epoch": 0.0125, "grad_norm": 0.5546287298202515, "grad_norm_var": 0.006154938860649895, "learning_rate": 5e-05, "loss": 0.1765, "loss/crossentropy": 2.7016512751579285, "loss/hidden": 0.0, "loss/logits": 0.17653854563832283, "loss/reg": 0.37214773893356323, "step": 1250 }, { "epoch": 0.01251, "grad_norm": 0.4070480167865753, "grad_norm_var": 0.00593795656665993, "learning_rate": 5e-05, "loss": 0.1775, "loss/crossentropy": 2.8438331484794617, "loss/hidden": 0.0, "loss/logits": 0.17753003537654877, "loss/reg": 0.37190675735473633, "step": 1251 }, { "epoch": 0.01252, "grad_norm": 0.42461103200912476, "grad_norm_var": 0.005611680529391746, "learning_rate": 5e-05, "loss": 0.1896, "loss/crossentropy": 2.6726028323173523, "loss/hidden": 0.0, "loss/logits": 0.18962230160832405, "loss/reg": 0.3711521327495575, "step": 1252 }, { "epoch": 0.01253, "grad_norm": 0.431164413690567, "grad_norm_var": 0.004427390779072863, "learning_rate": 5e-05, "loss": 0.1834, "loss/crossentropy": 2.7441052198410034, "loss/hidden": 0.0, "loss/logits": 0.18337387591600418, "loss/reg": 0.3706527054309845, "step": 1253 }, { "epoch": 0.01254, "grad_norm": 0.44941940903663635, "grad_norm_var": 0.004335845530325507, "learning_rate": 5e-05, "loss": 0.1866, "loss/crossentropy": 2.7429264187812805, "loss/hidden": 0.0, "loss/logits": 0.1865556761622429, "loss/reg": 0.37038347125053406, "step": 1254 }, { "epoch": 0.01255, "grad_norm": 0.393535315990448, "grad_norm_var": 0.004266092467774015, "learning_rate": 5e-05, "loss": 0.1792, "loss/crossentropy": 2.824369788169861, "loss/hidden": 0.0, "loss/logits": 0.1791834607720375, "loss/reg": 0.37008222937583923, "step": 1255 }, { "epoch": 0.01256, "grad_norm": 0.4587359130382538, "grad_norm_var": 0.004277251938203304, "learning_rate": 5e-05, "loss": 0.1989, "loss/crossentropy": 2.7429704666137695, "loss/hidden": 0.0, "loss/logits": 0.19885289296507835, "loss/reg": 0.3698022961616516, "step": 1256 }, { "epoch": 0.01257, "grad_norm": 0.5350925326347351, "grad_norm_var": 0.004771743090325827, "learning_rate": 5e-05, "loss": 0.2281, "loss/crossentropy": 2.9342111945152283, "loss/hidden": 0.0, "loss/logits": 0.22812102735042572, "loss/reg": 0.3694140911102295, "step": 1257 }, { "epoch": 0.01258, "grad_norm": 0.4469914138317108, "grad_norm_var": 0.004282341841473303, "learning_rate": 5e-05, "loss": 0.1932, "loss/crossentropy": 2.673809051513672, "loss/hidden": 0.0, "loss/logits": 0.19316794723272324, "loss/reg": 0.36933839321136475, "step": 1258 }, { "epoch": 0.01259, "grad_norm": 0.3693523108959198, "grad_norm_var": 0.004734157385839988, "learning_rate": 5e-05, "loss": 0.1767, "loss/crossentropy": 2.7626762986183167, "loss/hidden": 0.0, "loss/logits": 0.1767345704138279, "loss/reg": 0.3690941035747528, "step": 1259 }, { "epoch": 0.0126, "grad_norm": 0.40777453780174255, "grad_norm_var": 0.004771677933356063, "learning_rate": 5e-05, "loss": 0.1808, "loss/crossentropy": 2.6760921478271484, "loss/hidden": 0.0, "loss/logits": 0.18078167736530304, "loss/reg": 0.368704617023468, "step": 1260 }, { "epoch": 0.01261, "grad_norm": 0.40075087547302246, "grad_norm_var": 0.004735546642978679, "learning_rate": 5e-05, "loss": 0.1839, "loss/crossentropy": 2.873387634754181, "loss/hidden": 0.0, "loss/logits": 0.18389084190130234, "loss/reg": 0.3683547079563141, "step": 1261 }, { "epoch": 0.01262, "grad_norm": 0.3951131999492645, "grad_norm_var": 0.004887898286404691, "learning_rate": 5e-05, "loss": 0.1738, "loss/crossentropy": 2.8309091925621033, "loss/hidden": 0.0, "loss/logits": 0.17377372086048126, "loss/reg": 0.36804255843162537, "step": 1262 }, { "epoch": 0.01263, "grad_norm": 0.4044095277786255, "grad_norm_var": 0.00496396476492379, "learning_rate": 5e-05, "loss": 0.1854, "loss/crossentropy": 2.8515381813049316, "loss/hidden": 0.0, "loss/logits": 0.18542711809277534, "loss/reg": 0.3678068518638611, "step": 1263 }, { "epoch": 0.01264, "grad_norm": 0.42380544543266296, "grad_norm_var": 0.0024612903361025543, "learning_rate": 5e-05, "loss": 0.2023, "loss/crossentropy": 2.7520610690116882, "loss/hidden": 0.0, "loss/logits": 0.20231406763195992, "loss/reg": 0.36779168248176575, "step": 1264 }, { "epoch": 0.01265, "grad_norm": 0.3761654794216156, "grad_norm_var": 0.0026527459716399063, "learning_rate": 5e-05, "loss": 0.1748, "loss/crossentropy": 2.692035734653473, "loss/hidden": 0.0, "loss/logits": 0.1747821681201458, "loss/reg": 0.3675667345523834, "step": 1265 }, { "epoch": 0.01266, "grad_norm": 0.3921130895614624, "grad_norm_var": 0.00160100674544581, "learning_rate": 5e-05, "loss": 0.1762, "loss/crossentropy": 2.8044044375419617, "loss/hidden": 0.0, "loss/logits": 0.176239013671875, "loss/reg": 0.3676108121871948, "step": 1266 }, { "epoch": 0.01267, "grad_norm": 0.38018599152565, "grad_norm_var": 0.00169161670700159, "learning_rate": 5e-05, "loss": 0.1791, "loss/crossentropy": 2.635262131690979, "loss/hidden": 0.0, "loss/logits": 0.1790744848549366, "loss/reg": 0.36727651953697205, "step": 1267 }, { "epoch": 0.01268, "grad_norm": 0.3951775133609772, "grad_norm_var": 0.001720117027549147, "learning_rate": 5e-05, "loss": 0.1895, "loss/crossentropy": 2.7155182361602783, "loss/hidden": 0.0, "loss/logits": 0.1895393803715706, "loss/reg": 0.36737489700317383, "step": 1268 }, { "epoch": 0.01269, "grad_norm": 0.38271477818489075, "grad_norm_var": 0.0017703950782785823, "learning_rate": 5e-05, "loss": 0.1919, "loss/crossentropy": 2.7927077412605286, "loss/hidden": 0.0, "loss/logits": 0.19193745777010918, "loss/reg": 0.36742591857910156, "step": 1269 }, { "epoch": 0.0127, "grad_norm": 0.37830328941345215, "grad_norm_var": 0.0017431325280333235, "learning_rate": 5e-05, "loss": 0.174, "loss/crossentropy": 2.739274740219116, "loss/hidden": 0.0, "loss/logits": 0.17395249009132385, "loss/reg": 0.3674521744251251, "step": 1270 }, { "epoch": 0.01271, "grad_norm": 0.5812702178955078, "grad_norm_var": 0.0035647174067425496, "learning_rate": 5e-05, "loss": 0.191, "loss/crossentropy": 2.699429929256439, "loss/hidden": 0.0, "loss/logits": 0.19097519293427467, "loss/reg": 0.36744165420532227, "step": 1271 }, { "epoch": 0.01272, "grad_norm": 0.38018864393234253, "grad_norm_var": 0.003549849734763137, "learning_rate": 5e-05, "loss": 0.1736, "loss/crossentropy": 2.7875409722328186, "loss/hidden": 0.0, "loss/logits": 0.1735709086060524, "loss/reg": 0.3670593500137329, "step": 1272 }, { "epoch": 0.01273, "grad_norm": 0.41786378622055054, "grad_norm_var": 0.0025408462086981332, "learning_rate": 5e-05, "loss": 0.1906, "loss/crossentropy": 2.9104411602020264, "loss/hidden": 0.0, "loss/logits": 0.1906200833618641, "loss/reg": 0.36682993173599243, "step": 1273 }, { "epoch": 0.01274, "grad_norm": 0.401813805103302, "grad_norm_var": 0.0024351116089095016, "learning_rate": 5e-05, "loss": 0.1755, "loss/crossentropy": 2.8041732907295227, "loss/hidden": 0.0, "loss/logits": 0.1754557266831398, "loss/reg": 0.3669557571411133, "step": 1274 }, { "epoch": 0.01275, "grad_norm": 0.4212268888950348, "grad_norm_var": 0.0023537090775912705, "learning_rate": 5e-05, "loss": 0.1903, "loss/crossentropy": 2.7287241220474243, "loss/hidden": 0.0, "loss/logits": 0.19033314660191536, "loss/reg": 0.3671852946281433, "step": 1275 }, { "epoch": 0.01276, "grad_norm": 0.455710768699646, "grad_norm_var": 0.0024915406282637444, "learning_rate": 5e-05, "loss": 0.2073, "loss/crossentropy": 2.8313666582107544, "loss/hidden": 0.0, "loss/logits": 0.20731393620371819, "loss/reg": 0.36742693185806274, "step": 1276 }, { "epoch": 0.01277, "grad_norm": 0.4010087549686432, "grad_norm_var": 0.002491169141681837, "learning_rate": 5e-05, "loss": 0.1813, "loss/crossentropy": 2.8288145661354065, "loss/hidden": 0.0, "loss/logits": 0.18133926391601562, "loss/reg": 0.3674502670764923, "step": 1277 }, { "epoch": 0.01278, "grad_norm": 0.8181006908416748, "grad_norm_var": 0.012738556450205486, "learning_rate": 5e-05, "loss": 0.1906, "loss/crossentropy": 2.857640504837036, "loss/hidden": 0.0, "loss/logits": 0.19062136486172676, "loss/reg": 0.36726728081703186, "step": 1278 }, { "epoch": 0.01279, "grad_norm": 0.45254287123680115, "grad_norm_var": 0.012666955634639977, "learning_rate": 5e-05, "loss": 0.1891, "loss/crossentropy": 2.7171207070350647, "loss/hidden": 0.0, "loss/logits": 0.18908429518342018, "loss/reg": 0.3672482371330261, "step": 1279 }, { "epoch": 0.0128, "grad_norm": 0.407359778881073, "grad_norm_var": 0.012721863245722172, "learning_rate": 5e-05, "loss": 0.183, "loss/crossentropy": 2.7687178254127502, "loss/hidden": 0.0, "loss/logits": 0.18296590819954872, "loss/reg": 0.3672524690628052, "step": 1280 }, { "epoch": 0.01281, "grad_norm": 0.44425785541534424, "grad_norm_var": 0.012431105476501732, "learning_rate": 5e-05, "loss": 0.1908, "loss/crossentropy": 2.744910478591919, "loss/hidden": 0.0, "loss/logits": 0.19084115326404572, "loss/reg": 0.36750754714012146, "step": 1281 }, { "epoch": 0.01282, "grad_norm": 0.46303698420524597, "grad_norm_var": 0.012251372458759813, "learning_rate": 5e-05, "loss": 0.1944, "loss/crossentropy": 2.751496732234955, "loss/hidden": 0.0, "loss/logits": 0.1943591572344303, "loss/reg": 0.36741989850997925, "step": 1282 }, { "epoch": 0.01283, "grad_norm": 0.4303838610649109, "grad_norm_var": 0.011949640288087515, "learning_rate": 5e-05, "loss": 0.194, "loss/crossentropy": 2.797963559627533, "loss/hidden": 0.0, "loss/logits": 0.19400545954704285, "loss/reg": 0.36759573221206665, "step": 1283 }, { "epoch": 0.01284, "grad_norm": 0.4107776880264282, "grad_norm_var": 0.01184679367118605, "learning_rate": 5e-05, "loss": 0.1872, "loss/crossentropy": 2.9499067664146423, "loss/hidden": 0.0, "loss/logits": 0.18718478828668594, "loss/reg": 0.36729204654693604, "step": 1284 }, { "epoch": 0.01285, "grad_norm": 0.3876706063747406, "grad_norm_var": 0.011801945263829305, "learning_rate": 5e-05, "loss": 0.1829, "loss/crossentropy": 2.7922670245170593, "loss/hidden": 0.0, "loss/logits": 0.18294870480895042, "loss/reg": 0.3674081265926361, "step": 1285 }, { "epoch": 0.01286, "grad_norm": 0.3949038088321686, "grad_norm_var": 0.011653348485858913, "learning_rate": 5e-05, "loss": 0.1862, "loss/crossentropy": 2.7257879972457886, "loss/hidden": 0.0, "loss/logits": 0.18615350499749184, "loss/reg": 0.367165207862854, "step": 1286 }, { "epoch": 0.01287, "grad_norm": 0.6826955676078796, "grad_norm_var": 0.014013936104369051, "learning_rate": 5e-05, "loss": 0.189, "loss/crossentropy": 2.4787283539772034, "loss/hidden": 0.0, "loss/logits": 0.1889822706580162, "loss/reg": 0.36681169271469116, "step": 1287 }, { "epoch": 0.01288, "grad_norm": 0.4110484719276428, "grad_norm_var": 0.013742607406505947, "learning_rate": 5e-05, "loss": 0.1855, "loss/crossentropy": 2.733353316783905, "loss/hidden": 0.0, "loss/logits": 0.18552840873599052, "loss/reg": 0.36641111969947815, "step": 1288 }, { "epoch": 0.01289, "grad_norm": 0.38021737337112427, "grad_norm_var": 0.014055364300353191, "learning_rate": 5e-05, "loss": 0.1724, "loss/crossentropy": 2.806664764881134, "loss/hidden": 0.0, "loss/logits": 0.17244770005345345, "loss/reg": 0.36629027128219604, "step": 1289 }, { "epoch": 0.0129, "grad_norm": 0.5418382883071899, "grad_norm_var": 0.014191244910938617, "learning_rate": 5e-05, "loss": 0.1903, "loss/crossentropy": 2.8575419783592224, "loss/hidden": 0.0, "loss/logits": 0.19032838568091393, "loss/reg": 0.36597147583961487, "step": 1290 }, { "epoch": 0.01291, "grad_norm": 0.45716896653175354, "grad_norm_var": 0.014043407821204553, "learning_rate": 5e-05, "loss": 0.1782, "loss/crossentropy": 2.8930081129074097, "loss/hidden": 0.0, "loss/logits": 0.17819616571068764, "loss/reg": 0.3657359480857849, "step": 1291 }, { "epoch": 0.01292, "grad_norm": 0.40643227100372314, "grad_norm_var": 0.014296756285762635, "learning_rate": 5e-05, "loss": 0.1822, "loss/crossentropy": 2.794131875038147, "loss/hidden": 0.0, "loss/logits": 0.18218058347702026, "loss/reg": 0.3652525544166565, "step": 1292 }, { "epoch": 0.01293, "grad_norm": 0.9682717323303223, "grad_norm_var": 0.02933474924526124, "learning_rate": 5e-05, "loss": 0.2289, "loss/crossentropy": 2.8689537048339844, "loss/hidden": 0.0, "loss/logits": 0.22890295088291168, "loss/reg": 0.36496689915657043, "step": 1293 }, { "epoch": 0.01294, "grad_norm": 0.44706085324287415, "grad_norm_var": 0.0223774262219655, "learning_rate": 5e-05, "loss": 0.186, "loss/crossentropy": 2.908451497554779, "loss/hidden": 0.0, "loss/logits": 0.18601104617118835, "loss/reg": 0.36438408493995667, "step": 1294 }, { "epoch": 0.01295, "grad_norm": 0.42671775817871094, "grad_norm_var": 0.02251487379790934, "learning_rate": 5e-05, "loss": 0.178, "loss/crossentropy": 2.8207754492759705, "loss/hidden": 0.0, "loss/logits": 0.1779862940311432, "loss/reg": 0.3639226257801056, "step": 1295 }, { "epoch": 0.01296, "grad_norm": 0.43073466420173645, "grad_norm_var": 0.02232655524917484, "learning_rate": 5e-05, "loss": 0.1777, "loss/crossentropy": 2.8013527393341064, "loss/hidden": 0.0, "loss/logits": 0.17767785117030144, "loss/reg": 0.36372074484825134, "step": 1296 }, { "epoch": 0.01297, "grad_norm": 0.4185357093811035, "grad_norm_var": 0.022491178518384305, "learning_rate": 5e-05, "loss": 0.1837, "loss/crossentropy": 2.676852524280548, "loss/hidden": 0.0, "loss/logits": 0.1836955025792122, "loss/reg": 0.3633468449115753, "step": 1297 }, { "epoch": 0.01298, "grad_norm": 0.4396490454673767, "grad_norm_var": 0.02257387678810385, "learning_rate": 5e-05, "loss": 0.1885, "loss/crossentropy": 2.7611002326011658, "loss/hidden": 0.0, "loss/logits": 0.18850326910614967, "loss/reg": 0.36287131905555725, "step": 1298 }, { "epoch": 0.01299, "grad_norm": 0.4823966920375824, "grad_norm_var": 0.02241876210525562, "learning_rate": 5e-05, "loss": 0.1892, "loss/crossentropy": 2.811491847038269, "loss/hidden": 0.0, "loss/logits": 0.18922247737646103, "loss/reg": 0.3623615503311157, "step": 1299 }, { "epoch": 0.013, "grad_norm": 0.4436049163341522, "grad_norm_var": 0.022181456184773293, "learning_rate": 5e-05, "loss": 0.1858, "loss/crossentropy": 2.712536931037903, "loss/hidden": 0.0, "loss/logits": 0.185833640396595, "loss/reg": 0.3620838522911072, "step": 1300 }, { "epoch": 0.01301, "grad_norm": 0.4246485233306885, "grad_norm_var": 0.02179969535476792, "learning_rate": 5e-05, "loss": 0.1846, "loss/crossentropy": 2.7811660766601562, "loss/hidden": 0.0, "loss/logits": 0.18463638797402382, "loss/reg": 0.3614282011985779, "step": 1301 }, { "epoch": 0.01302, "grad_norm": 0.6236568093299866, "grad_norm_var": 0.022329990184913925, "learning_rate": 5e-05, "loss": 0.2016, "loss/crossentropy": 2.909895420074463, "loss/hidden": 0.0, "loss/logits": 0.20157384127378464, "loss/reg": 0.3608205020427704, "step": 1302 }, { "epoch": 0.01303, "grad_norm": 0.6981467604637146, "grad_norm_var": 0.02272326622731979, "learning_rate": 5e-05, "loss": 0.1948, "loss/crossentropy": 2.8728304505348206, "loss/hidden": 0.0, "loss/logits": 0.19475984200835228, "loss/reg": 0.36067965626716614, "step": 1303 }, { "epoch": 0.01304, "grad_norm": 0.5345349907875061, "grad_norm_var": 0.022211615896722044, "learning_rate": 5e-05, "loss": 0.1757, "loss/crossentropy": 2.6592658162117004, "loss/hidden": 0.0, "loss/logits": 0.17566868662834167, "loss/reg": 0.36056801676750183, "step": 1304 }, { "epoch": 0.01305, "grad_norm": 0.44836506247520447, "grad_norm_var": 0.021343283884640425, "learning_rate": 5e-05, "loss": 0.1901, "loss/crossentropy": 2.8403496146202087, "loss/hidden": 0.0, "loss/logits": 0.19007616862654686, "loss/reg": 0.36047133803367615, "step": 1305 }, { "epoch": 0.01306, "grad_norm": 0.4141225814819336, "grad_norm_var": 0.021854378975383188, "learning_rate": 5e-05, "loss": 0.1774, "loss/crossentropy": 2.9146127104759216, "loss/hidden": 0.0, "loss/logits": 0.17743303999304771, "loss/reg": 0.36049824953079224, "step": 1306 }, { "epoch": 0.01307, "grad_norm": 0.43955376744270325, "grad_norm_var": 0.02198377110692302, "learning_rate": 5e-05, "loss": 0.1819, "loss/crossentropy": 2.8313023447990417, "loss/hidden": 0.0, "loss/logits": 0.18189503252506256, "loss/reg": 0.36038738489151, "step": 1307 }, { "epoch": 0.01308, "grad_norm": 0.6660140156745911, "grad_norm_var": 0.02285628437013636, "learning_rate": 5e-05, "loss": 0.1976, "loss/crossentropy": 3.007749915122986, "loss/hidden": 0.0, "loss/logits": 0.19761746004223824, "loss/reg": 0.36027249693870544, "step": 1308 }, { "epoch": 0.01309, "grad_norm": 0.6222243905067444, "grad_norm_var": 0.009617151034902172, "learning_rate": 5e-05, "loss": 0.1874, "loss/crossentropy": 2.9097059965133667, "loss/hidden": 0.0, "loss/logits": 0.18738599494099617, "loss/reg": 0.3602071702480316, "step": 1309 }, { "epoch": 0.0131, "grad_norm": 0.5275961756706238, "grad_norm_var": 0.00948092689924209, "learning_rate": 5e-05, "loss": 0.1906, "loss/crossentropy": 2.7722021341323853, "loss/hidden": 0.0, "loss/logits": 0.190641887485981, "loss/reg": 0.3599284589290619, "step": 1310 }, { "epoch": 0.01311, "grad_norm": 0.4349927604198456, "grad_norm_var": 0.009401558924372349, "learning_rate": 5e-05, "loss": 0.1851, "loss/crossentropy": 2.8899876475334167, "loss/hidden": 0.0, "loss/logits": 0.18514901772141457, "loss/reg": 0.3597680926322937, "step": 1311 }, { "epoch": 0.01312, "grad_norm": 0.43490666151046753, "grad_norm_var": 0.00936242099145093, "learning_rate": 5e-05, "loss": 0.1919, "loss/crossentropy": 2.7673835158348083, "loss/hidden": 0.0, "loss/logits": 0.1919158361852169, "loss/reg": 0.35951030254364014, "step": 1312 }, { "epoch": 0.01313, "grad_norm": 0.455925315618515, "grad_norm_var": 0.009027174731603912, "learning_rate": 5e-05, "loss": 0.1858, "loss/crossentropy": 2.766758382320404, "loss/hidden": 0.0, "loss/logits": 0.18576138094067574, "loss/reg": 0.35917556285858154, "step": 1313 }, { "epoch": 0.01314, "grad_norm": 0.503605306148529, "grad_norm_var": 0.008720034497195783, "learning_rate": 5e-05, "loss": 0.2111, "loss/crossentropy": 2.734840750694275, "loss/hidden": 0.0, "loss/logits": 0.21114115417003632, "loss/reg": 0.3587586581707001, "step": 1314 }, { "epoch": 0.01315, "grad_norm": 0.39140117168426514, "grad_norm_var": 0.009568123407985551, "learning_rate": 5e-05, "loss": 0.1909, "loss/crossentropy": 2.7055675387382507, "loss/hidden": 0.0, "loss/logits": 0.19085485860705376, "loss/reg": 0.35820409655570984, "step": 1315 }, { "epoch": 0.01316, "grad_norm": 0.42627131938934326, "grad_norm_var": 0.009726382401697231, "learning_rate": 5e-05, "loss": 0.1867, "loss/crossentropy": 2.6431260108947754, "loss/hidden": 0.0, "loss/logits": 0.18668852373957634, "loss/reg": 0.3576488792896271, "step": 1316 }, { "epoch": 0.01317, "grad_norm": 0.45641186833381653, "grad_norm_var": 0.009458150442148627, "learning_rate": 5e-05, "loss": 0.2056, "loss/crossentropy": 2.639075458049774, "loss/hidden": 0.0, "loss/logits": 0.20559605583548546, "loss/reg": 0.3573777377605438, "step": 1317 }, { "epoch": 0.01318, "grad_norm": 0.3929937183856964, "grad_norm_var": 0.009129826885408088, "learning_rate": 5e-05, "loss": 0.1805, "loss/crossentropy": 2.7497580647468567, "loss/hidden": 0.0, "loss/logits": 0.18045229092240334, "loss/reg": 0.35725221037864685, "step": 1318 }, { "epoch": 0.01319, "grad_norm": 0.4257049858570099, "grad_norm_var": 0.006223851688840117, "learning_rate": 5e-05, "loss": 0.182, "loss/crossentropy": 2.831692695617676, "loss/hidden": 0.0, "loss/logits": 0.1819627769291401, "loss/reg": 0.3572128117084503, "step": 1319 }, { "epoch": 0.0132, "grad_norm": 0.4246101677417755, "grad_norm_var": 0.006083239896430115, "learning_rate": 5e-05, "loss": 0.1769, "loss/crossentropy": 2.876562237739563, "loss/hidden": 0.0, "loss/logits": 0.1768844649195671, "loss/reg": 0.35708630084991455, "step": 1320 }, { "epoch": 0.01321, "grad_norm": 0.4127863049507141, "grad_norm_var": 0.0062485918607618944, "learning_rate": 5e-05, "loss": 0.1781, "loss/crossentropy": 2.756684720516205, "loss/hidden": 0.0, "loss/logits": 0.17806973680853844, "loss/reg": 0.3572588860988617, "step": 1321 }, { "epoch": 0.01322, "grad_norm": 0.3985500633716583, "grad_norm_var": 0.006367975079097713, "learning_rate": 5e-05, "loss": 0.1797, "loss/crossentropy": 2.749648094177246, "loss/hidden": 0.0, "loss/logits": 0.17969444021582603, "loss/reg": 0.35717645287513733, "step": 1322 }, { "epoch": 0.01323, "grad_norm": 0.4082765579223633, "grad_norm_var": 0.006528340313648976, "learning_rate": 5e-05, "loss": 0.1789, "loss/crossentropy": 2.893884599208832, "loss/hidden": 0.0, "loss/logits": 0.17889782413840294, "loss/reg": 0.3569119870662689, "step": 1323 }, { "epoch": 0.01324, "grad_norm": 0.40776604413986206, "grad_norm_var": 0.0036508258895222097, "learning_rate": 5e-05, "loss": 0.1813, "loss/crossentropy": 2.7674633860588074, "loss/hidden": 0.0, "loss/logits": 0.1813463345170021, "loss/reg": 0.35726845264434814, "step": 1324 }, { "epoch": 0.01325, "grad_norm": 0.420671671628952, "grad_norm_var": 0.0014338769157644057, "learning_rate": 5e-05, "loss": 0.1966, "loss/crossentropy": 2.887144446372986, "loss/hidden": 0.0, "loss/logits": 0.19661224633455276, "loss/reg": 0.35728538036346436, "step": 1325 }, { "epoch": 0.01326, "grad_norm": 0.40821748971939087, "grad_norm_var": 0.0008133777701522978, "learning_rate": 5e-05, "loss": 0.1904, "loss/crossentropy": 2.716045618057251, "loss/hidden": 0.0, "loss/logits": 0.19044547900557518, "loss/reg": 0.3572133481502533, "step": 1326 }, { "epoch": 0.01327, "grad_norm": 0.4067571759223938, "grad_norm_var": 0.0008263129911417348, "learning_rate": 5e-05, "loss": 0.182, "loss/crossentropy": 2.907903254032135, "loss/hidden": 0.0, "loss/logits": 0.18197864666581154, "loss/reg": 0.3574914038181305, "step": 1327 }, { "epoch": 0.01328, "grad_norm": 0.4145456552505493, "grad_norm_var": 0.0008210626995405238, "learning_rate": 5e-05, "loss": 0.1923, "loss/crossentropy": 2.834200441837311, "loss/hidden": 0.0, "loss/logits": 0.1922735497355461, "loss/reg": 0.35732537508010864, "step": 1328 }, { "epoch": 0.01329, "grad_norm": 0.4048939645290375, "grad_norm_var": 0.0007540521138342175, "learning_rate": 5e-05, "loss": 0.1836, "loss/crossentropy": 2.6791593432426453, "loss/hidden": 0.0, "loss/logits": 0.18364739418029785, "loss/reg": 0.3576776087284088, "step": 1329 }, { "epoch": 0.0133, "grad_norm": 0.402102530002594, "grad_norm_var": 0.0002525010032796223, "learning_rate": 5e-05, "loss": 0.1809, "loss/crossentropy": 2.715786099433899, "loss/hidden": 0.0, "loss/logits": 0.18091581389307976, "loss/reg": 0.35780033469200134, "step": 1330 }, { "epoch": 0.01331, "grad_norm": 0.4056035280227661, "grad_norm_var": 0.00022492188904028968, "learning_rate": 5e-05, "loss": 0.1812, "loss/crossentropy": 2.83488392829895, "loss/hidden": 0.0, "loss/logits": 0.18117047101259232, "loss/reg": 0.3576710522174835, "step": 1331 }, { "epoch": 0.01332, "grad_norm": 0.40305331349372864, "grad_norm_var": 0.00021910906374788356, "learning_rate": 5e-05, "loss": 0.1721, "loss/crossentropy": 2.7181341648101807, "loss/hidden": 0.0, "loss/logits": 0.17213882133364677, "loss/reg": 0.35758498311042786, "step": 1332 }, { "epoch": 0.01333, "grad_norm": 0.8445563316345215, "grad_norm_var": 0.011930489513323438, "learning_rate": 5e-05, "loss": 0.2299, "loss/crossentropy": 2.697631597518921, "loss/hidden": 0.0, "loss/logits": 0.22989913821220398, "loss/reg": 0.357497900724411, "step": 1333 }, { "epoch": 0.01334, "grad_norm": 0.4477691650390625, "grad_norm_var": 0.011801596424529315, "learning_rate": 5e-05, "loss": 0.1844, "loss/crossentropy": 2.789981961250305, "loss/hidden": 0.0, "loss/logits": 0.18439063429832458, "loss/reg": 0.35725006461143494, "step": 1334 }, { "epoch": 0.01335, "grad_norm": 0.49369895458221436, "grad_norm_var": 0.011963291515966583, "learning_rate": 5e-05, "loss": 0.1714, "loss/crossentropy": 2.6917919516563416, "loss/hidden": 0.0, "loss/logits": 0.17137034237384796, "loss/reg": 0.35690128803253174, "step": 1335 }, { "epoch": 0.01336, "grad_norm": 0.5500026345252991, "grad_norm_var": 0.012621964838578728, "learning_rate": 5e-05, "loss": 0.1799, "loss/crossentropy": 2.7222613096237183, "loss/hidden": 0.0, "loss/logits": 0.17991899698972702, "loss/reg": 0.35641443729400635, "step": 1336 }, { "epoch": 0.01337, "grad_norm": 0.5038531422615051, "grad_norm_var": 0.012666231498029153, "learning_rate": 5e-05, "loss": 0.1826, "loss/crossentropy": 2.821013391017914, "loss/hidden": 0.0, "loss/logits": 0.1826494000852108, "loss/reg": 0.35586827993392944, "step": 1337 }, { "epoch": 0.01338, "grad_norm": 0.4341466426849365, "grad_norm_var": 0.01246554306727496, "learning_rate": 5e-05, "loss": 0.1932, "loss/crossentropy": 2.855484127998352, "loss/hidden": 0.0, "loss/logits": 0.1931704320013523, "loss/reg": 0.3552441895008087, "step": 1338 }, { "epoch": 0.01339, "grad_norm": 0.45805656909942627, "grad_norm_var": 0.012278810035475459, "learning_rate": 5e-05, "loss": 0.186, "loss/crossentropy": 2.6258126497268677, "loss/hidden": 0.0, "loss/logits": 0.18601882085204124, "loss/reg": 0.3550010025501251, "step": 1339 }, { "epoch": 0.0134, "grad_norm": 0.4218948781490326, "grad_norm_var": 0.01218750575993539, "learning_rate": 5e-05, "loss": 0.1854, "loss/crossentropy": 2.760298430919647, "loss/hidden": 0.0, "loss/logits": 0.1853986717760563, "loss/reg": 0.35461604595184326, "step": 1340 }, { "epoch": 0.01341, "grad_norm": 0.40807053446769714, "grad_norm_var": 0.012269789655525424, "learning_rate": 5e-05, "loss": 0.1735, "loss/crossentropy": 2.875471830368042, "loss/hidden": 0.0, "loss/logits": 0.17345694452524185, "loss/reg": 0.3539476692676544, "step": 1341 }, { "epoch": 0.01342, "grad_norm": 0.39526405930519104, "grad_norm_var": 0.01237480888038365, "learning_rate": 5e-05, "loss": 0.1906, "loss/crossentropy": 2.7369576692581177, "loss/hidden": 0.0, "loss/logits": 0.19056104496121407, "loss/reg": 0.3537426292896271, "step": 1342 }, { "epoch": 0.01343, "grad_norm": 0.4251244068145752, "grad_norm_var": 0.01226025857874185, "learning_rate": 5e-05, "loss": 0.1913, "loss/crossentropy": 2.9759350419044495, "loss/hidden": 0.0, "loss/logits": 0.1913316249847412, "loss/reg": 0.35308822989463806, "step": 1343 }, { "epoch": 0.01344, "grad_norm": 0.43192777037620544, "grad_norm_var": 0.012166172092030672, "learning_rate": 5e-05, "loss": 0.1873, "loss/crossentropy": 2.80673348903656, "loss/hidden": 0.0, "loss/logits": 0.187309380620718, "loss/reg": 0.3524065911769867, "step": 1344 }, { "epoch": 0.01345, "grad_norm": 0.44460955262184143, "grad_norm_var": 0.011949772416254693, "learning_rate": 5e-05, "loss": 0.1766, "loss/crossentropy": 2.791440010070801, "loss/hidden": 0.0, "loss/logits": 0.1765591837465763, "loss/reg": 0.3519209623336792, "step": 1345 }, { "epoch": 0.01346, "grad_norm": 0.45511841773986816, "grad_norm_var": 0.01166769503468276, "learning_rate": 5e-05, "loss": 0.1887, "loss/crossentropy": 2.8598415851593018, "loss/hidden": 0.0, "loss/logits": 0.1886826753616333, "loss/reg": 0.3508933186531067, "step": 1346 }, { "epoch": 0.01347, "grad_norm": 0.470003604888916, "grad_norm_var": 0.011372478172222896, "learning_rate": 5e-05, "loss": 0.1919, "loss/crossentropy": 2.7885149717330933, "loss/hidden": 0.0, "loss/logits": 0.19185973331332207, "loss/reg": 0.35025668144226074, "step": 1347 }, { "epoch": 0.01348, "grad_norm": 0.4061746299266815, "grad_norm_var": 0.011343478877303411, "learning_rate": 5e-05, "loss": 0.1951, "loss/crossentropy": 2.7892218232154846, "loss/hidden": 0.0, "loss/logits": 0.19506042450666428, "loss/reg": 0.3496808707714081, "step": 1348 }, { "epoch": 0.01349, "grad_norm": 0.38061806559562683, "grad_norm_var": 0.0018981093943051955, "learning_rate": 5e-05, "loss": 0.1684, "loss/crossentropy": 2.9718902707099915, "loss/hidden": 0.0, "loss/logits": 0.16839584335684776, "loss/reg": 0.34894415736198425, "step": 1349 }, { "epoch": 0.0135, "grad_norm": 0.4164532423019409, "grad_norm_var": 0.0019494925102219582, "learning_rate": 5e-05, "loss": 0.1845, "loss/crossentropy": 2.813636839389801, "loss/hidden": 0.0, "loss/logits": 0.18450842052698135, "loss/reg": 0.34820446372032166, "step": 1350 }, { "epoch": 0.01351, "grad_norm": 0.4747794270515442, "grad_norm_var": 0.0018450772598940569, "learning_rate": 5e-05, "loss": 0.2015, "loss/crossentropy": 2.771188795566559, "loss/hidden": 0.0, "loss/logits": 0.20152737200260162, "loss/reg": 0.3478449881076813, "step": 1351 }, { "epoch": 0.01352, "grad_norm": 0.41241517663002014, "grad_norm_var": 0.0010516119491009205, "learning_rate": 5e-05, "loss": 0.1856, "loss/crossentropy": 2.8299490213394165, "loss/hidden": 0.0, "loss/logits": 0.18560518696904182, "loss/reg": 0.3475385904312134, "step": 1352 }, { "epoch": 0.01353, "grad_norm": 0.39533287286758423, "grad_norm_var": 0.0007719569007219077, "learning_rate": 5e-05, "loss": 0.1885, "loss/crossentropy": 2.6570538878440857, "loss/hidden": 0.0, "loss/logits": 0.188511710613966, "loss/reg": 0.34721770882606506, "step": 1353 }, { "epoch": 0.01354, "grad_norm": 0.3972763121128082, "grad_norm_var": 0.0008211698961015442, "learning_rate": 5e-05, "loss": 0.1691, "loss/crossentropy": 2.7883129119873047, "loss/hidden": 0.0, "loss/logits": 0.16907838359475136, "loss/reg": 0.3471667766571045, "step": 1354 }, { "epoch": 0.01355, "grad_norm": 0.37970274686813354, "grad_norm_var": 0.0008550370828134294, "learning_rate": 5e-05, "loss": 0.1746, "loss/crossentropy": 2.9191458225250244, "loss/hidden": 0.0, "loss/logits": 0.17461097985506058, "loss/reg": 0.346833199262619, "step": 1355 }, { "epoch": 0.01356, "grad_norm": 0.4333128035068512, "grad_norm_var": 0.0008665679307727659, "learning_rate": 5e-05, "loss": 0.1923, "loss/crossentropy": 2.9888545274734497, "loss/hidden": 0.0, "loss/logits": 0.19225793331861496, "loss/reg": 0.34646132588386536, "step": 1356 }, { "epoch": 0.01357, "grad_norm": 0.5785092115402222, "grad_norm_var": 0.00240227013164039, "learning_rate": 5e-05, "loss": 0.1839, "loss/crossentropy": 2.8805437684059143, "loss/hidden": 0.0, "loss/logits": 0.18391995877027512, "loss/reg": 0.34628087282180786, "step": 1357 }, { "epoch": 0.01358, "grad_norm": 0.48817262053489685, "grad_norm_var": 0.00249859839218692, "learning_rate": 5e-05, "loss": 0.2066, "loss/crossentropy": 2.9120084643363953, "loss/hidden": 0.0, "loss/logits": 0.20661276578903198, "loss/reg": 0.34612569212913513, "step": 1358 }, { "epoch": 0.01359, "grad_norm": 0.5280107259750366, "grad_norm_var": 0.0029994035878629995, "learning_rate": 5e-05, "loss": 0.186, "loss/crossentropy": 2.723498046398163, "loss/hidden": 0.0, "loss/logits": 0.18603845313191414, "loss/reg": 0.34589317440986633, "step": 1359 }, { "epoch": 0.0136, "grad_norm": 0.4707956612110138, "grad_norm_var": 0.003035011864084621, "learning_rate": 5e-05, "loss": 0.1894, "loss/crossentropy": 2.7618637681007385, "loss/hidden": 0.0, "loss/logits": 0.18941478431224823, "loss/reg": 0.345647394657135, "step": 1360 }, { "epoch": 0.01361, "grad_norm": 0.37757036089897156, "grad_norm_var": 0.0033256972448435433, "learning_rate": 5e-05, "loss": 0.1939, "loss/crossentropy": 2.726761758327484, "loss/hidden": 0.0, "loss/logits": 0.1938854083418846, "loss/reg": 0.34527337551116943, "step": 1361 }, { "epoch": 0.01362, "grad_norm": 0.4949932396411896, "grad_norm_var": 0.003497394894564284, "learning_rate": 5e-05, "loss": 0.1743, "loss/crossentropy": 2.7343544960021973, "loss/hidden": 0.0, "loss/logits": 0.17429197952151299, "loss/reg": 0.3448824882507324, "step": 1362 }, { "epoch": 0.01363, "grad_norm": 0.47695019841194153, "grad_norm_var": 0.00352448871806376, "learning_rate": 5e-05, "loss": 0.1924, "loss/crossentropy": 2.6532238125801086, "loss/hidden": 0.0, "loss/logits": 0.19241830334067345, "loss/reg": 0.3445933759212494, "step": 1363 }, { "epoch": 0.01364, "grad_norm": 0.47103217244148254, "grad_norm_var": 0.003456473884767878, "learning_rate": 5e-05, "loss": 0.201, "loss/crossentropy": 2.9378132820129395, "loss/hidden": 0.0, "loss/logits": 0.201010599732399, "loss/reg": 0.3440409004688263, "step": 1364 }, { "epoch": 0.01365, "grad_norm": 0.41810598969459534, "grad_norm_var": 0.003205031019833271, "learning_rate": 5e-05, "loss": 0.1809, "loss/crossentropy": 2.7378239035606384, "loss/hidden": 0.0, "loss/logits": 0.18085715547204018, "loss/reg": 0.3436071276664734, "step": 1365 }, { "epoch": 0.01366, "grad_norm": 0.3912304639816284, "grad_norm_var": 0.003360431020964327, "learning_rate": 5e-05, "loss": 0.1709, "loss/crossentropy": 2.756010591983795, "loss/hidden": 0.0, "loss/logits": 0.17090249806642532, "loss/reg": 0.34308111667633057, "step": 1366 }, { "epoch": 0.01367, "grad_norm": 0.43735939264297485, "grad_norm_var": 0.0033206315116185436, "learning_rate": 5e-05, "loss": 0.1869, "loss/crossentropy": 2.8463319540023804, "loss/hidden": 0.0, "loss/logits": 0.18690800294280052, "loss/reg": 0.34272658824920654, "step": 1367 }, { "epoch": 0.01368, "grad_norm": 0.42633768916130066, "grad_norm_var": 0.0032686879772579965, "learning_rate": 5e-05, "loss": 0.1801, "loss/crossentropy": 2.807410418987274, "loss/hidden": 0.0, "loss/logits": 0.18012291565537453, "loss/reg": 0.34212711453437805, "step": 1368 }, { "epoch": 0.01369, "grad_norm": 0.5415582656860352, "grad_norm_var": 0.003582248775155605, "learning_rate": 5e-05, "loss": 0.1842, "loss/crossentropy": 2.875243842601776, "loss/hidden": 0.0, "loss/logits": 0.1841946467757225, "loss/reg": 0.34170976281166077, "step": 1369 }, { "epoch": 0.0137, "grad_norm": 0.41164860129356384, "grad_norm_var": 0.0034808397361633177, "learning_rate": 5e-05, "loss": 0.1771, "loss/crossentropy": 2.738476812839508, "loss/hidden": 0.0, "loss/logits": 0.17706900462508202, "loss/reg": 0.341091126203537, "step": 1370 }, { "epoch": 0.01371, "grad_norm": 0.4110642969608307, "grad_norm_var": 0.003215616526687178, "learning_rate": 5e-05, "loss": 0.1777, "loss/crossentropy": 3.0371379256248474, "loss/hidden": 0.0, "loss/logits": 0.1777106560766697, "loss/reg": 0.3410275876522064, "step": 1371 }, { "epoch": 0.01372, "grad_norm": 0.468261182308197, "grad_norm_var": 0.0031685719325398615, "learning_rate": 5e-05, "loss": 0.1868, "loss/crossentropy": 2.7719425559043884, "loss/hidden": 0.0, "loss/logits": 0.18684683740139008, "loss/reg": 0.3406158685684204, "step": 1372 }, { "epoch": 0.01373, "grad_norm": 0.4473446309566498, "grad_norm_var": 0.002205809747595829, "learning_rate": 5e-05, "loss": 0.1757, "loss/crossentropy": 2.8690379858016968, "loss/hidden": 0.0, "loss/logits": 0.1756535805761814, "loss/reg": 0.34044918417930603, "step": 1373 }, { "epoch": 0.01374, "grad_norm": 0.39668112993240356, "grad_norm_var": 0.0023093931674128616, "learning_rate": 5e-05, "loss": 0.1643, "loss/crossentropy": 2.769678294658661, "loss/hidden": 0.0, "loss/logits": 0.16430196538567543, "loss/reg": 0.3400375545024872, "step": 1374 }, { "epoch": 0.01375, "grad_norm": 0.4229075312614441, "grad_norm_var": 0.0018793865216040798, "learning_rate": 5e-05, "loss": 0.1941, "loss/crossentropy": 2.7854629158973694, "loss/hidden": 0.0, "loss/logits": 0.1941203661262989, "loss/reg": 0.3396475613117218, "step": 1375 }, { "epoch": 0.01376, "grad_norm": 0.4098586142063141, "grad_norm_var": 0.0018733629349512023, "learning_rate": 5e-05, "loss": 0.1769, "loss/crossentropy": 2.7028385996818542, "loss/hidden": 0.0, "loss/logits": 0.17688624188303947, "loss/reg": 0.33919209241867065, "step": 1376 }, { "epoch": 0.01377, "grad_norm": 0.4565340280532837, "grad_norm_var": 0.0016301874248425662, "learning_rate": 5e-05, "loss": 0.1917, "loss/crossentropy": 2.787802815437317, "loss/hidden": 0.0, "loss/logits": 0.19168948754668236, "loss/reg": 0.338827908039093, "step": 1377 }, { "epoch": 0.01378, "grad_norm": 0.41312897205352783, "grad_norm_var": 0.0014773451994814094, "learning_rate": 5e-05, "loss": 0.184, "loss/crossentropy": 2.788445830345154, "loss/hidden": 0.0, "loss/logits": 0.18398717790842056, "loss/reg": 0.33837568759918213, "step": 1378 }, { "epoch": 0.01379, "grad_norm": 0.40348657965660095, "grad_norm_var": 0.0014282330103511617, "learning_rate": 5e-05, "loss": 0.1768, "loss/crossentropy": 2.753566026687622, "loss/hidden": 0.0, "loss/logits": 0.1768352910876274, "loss/reg": 0.33801165223121643, "step": 1379 }, { "epoch": 0.0138, "grad_norm": 0.42194879055023193, "grad_norm_var": 0.0013293096039085684, "learning_rate": 5e-05, "loss": 0.1807, "loss/crossentropy": 2.813223361968994, "loss/hidden": 0.0, "loss/logits": 0.1807214431464672, "loss/reg": 0.33759328722953796, "step": 1380 }, { "epoch": 0.01381, "grad_norm": 0.3823777735233307, "grad_norm_var": 0.0014649940384886132, "learning_rate": 5e-05, "loss": 0.1782, "loss/crossentropy": 2.830634295940399, "loss/hidden": 0.0, "loss/logits": 0.17822764441370964, "loss/reg": 0.3375193178653717, "step": 1381 }, { "epoch": 0.01382, "grad_norm": 0.4772758483886719, "grad_norm_var": 0.001510382881625664, "learning_rate": 5e-05, "loss": 0.2046, "loss/crossentropy": 2.818925619125366, "loss/hidden": 0.0, "loss/logits": 0.2046113833785057, "loss/reg": 0.33736860752105713, "step": 1382 }, { "epoch": 0.01383, "grad_norm": 0.415394127368927, "grad_norm_var": 0.0015277285832525226, "learning_rate": 5e-05, "loss": 0.1967, "loss/crossentropy": 2.740165054798126, "loss/hidden": 0.0, "loss/logits": 0.19668644294142723, "loss/reg": 0.3373785614967346, "step": 1383 }, { "epoch": 0.01384, "grad_norm": 0.46583065390586853, "grad_norm_var": 0.0015974310992484992, "learning_rate": 5e-05, "loss": 0.2137, "loss/crossentropy": 2.6997068524360657, "loss/hidden": 0.0, "loss/logits": 0.21372303366661072, "loss/reg": 0.3371827304363251, "step": 1384 }, { "epoch": 0.01385, "grad_norm": 0.3715943396091461, "grad_norm_var": 0.0009672873231867977, "learning_rate": 5e-05, "loss": 0.1736, "loss/crossentropy": 2.852622926235199, "loss/hidden": 0.0, "loss/logits": 0.17364206910133362, "loss/reg": 0.3370041251182556, "step": 1385 }, { "epoch": 0.01386, "grad_norm": 0.4350415766239166, "grad_norm_var": 0.0009646532421251279, "learning_rate": 5e-05, "loss": 0.2007, "loss/crossentropy": 2.9117844104766846, "loss/hidden": 0.0, "loss/logits": 0.2006595954298973, "loss/reg": 0.33671754598617554, "step": 1386 }, { "epoch": 0.01387, "grad_norm": 0.42346110939979553, "grad_norm_var": 0.0009513550542828467, "learning_rate": 5e-05, "loss": 0.1907, "loss/crossentropy": 2.68190997838974, "loss/hidden": 0.0, "loss/logits": 0.1907116025686264, "loss/reg": 0.33680006861686707, "step": 1387 }, { "epoch": 0.01388, "grad_norm": 0.4912080764770508, "grad_norm_var": 0.0011144986249159198, "learning_rate": 5e-05, "loss": 0.1788, "loss/crossentropy": 2.6720213890075684, "loss/hidden": 0.0, "loss/logits": 0.17883950099349022, "loss/reg": 0.3367910385131836, "step": 1388 }, { "epoch": 0.01389, "grad_norm": 0.4056718349456787, "grad_norm_var": 0.00111071531038075, "learning_rate": 5e-05, "loss": 0.1781, "loss/crossentropy": 2.7933679819107056, "loss/hidden": 0.0, "loss/logits": 0.1781182959675789, "loss/reg": 0.3366079032421112, "step": 1389 }, { "epoch": 0.0139, "grad_norm": 0.39825910329818726, "grad_norm_var": 0.001105012671029822, "learning_rate": 5e-05, "loss": 0.1831, "loss/crossentropy": 2.8361067175865173, "loss/hidden": 0.0, "loss/logits": 0.18307946622371674, "loss/reg": 0.33660727739334106, "step": 1390 }, { "epoch": 0.01391, "grad_norm": 0.46018874645233154, "grad_norm_var": 0.0011833500278785973, "learning_rate": 5e-05, "loss": 0.178, "loss/crossentropy": 2.8498005867004395, "loss/hidden": 0.0, "loss/logits": 0.17801924422383308, "loss/reg": 0.3365802466869354, "step": 1391 }, { "epoch": 0.01392, "grad_norm": 0.3541456162929535, "grad_norm_var": 0.0015043357444050683, "learning_rate": 5e-05, "loss": 0.174, "loss/crossentropy": 2.8014222979545593, "loss/hidden": 0.0, "loss/logits": 0.17398259043693542, "loss/reg": 0.3365350663661957, "step": 1392 }, { "epoch": 0.01393, "grad_norm": 0.4250631630420685, "grad_norm_var": 0.0014275033555532317, "learning_rate": 5e-05, "loss": 0.1838, "loss/crossentropy": 2.800377666950226, "loss/hidden": 0.0, "loss/logits": 0.18383284658193588, "loss/reg": 0.33616167306900024, "step": 1393 }, { "epoch": 0.01394, "grad_norm": 0.4328445792198181, "grad_norm_var": 0.0014297795708706337, "learning_rate": 5e-05, "loss": 0.1939, "loss/crossentropy": 2.814865827560425, "loss/hidden": 0.0, "loss/logits": 0.1938939057290554, "loss/reg": 0.3359866142272949, "step": 1394 }, { "epoch": 0.01395, "grad_norm": 0.4183562099933624, "grad_norm_var": 0.0014054324821738925, "learning_rate": 5e-05, "loss": 0.1841, "loss/crossentropy": 2.743912696838379, "loss/hidden": 0.0, "loss/logits": 0.1840612031519413, "loss/reg": 0.3359890580177307, "step": 1395 }, { "epoch": 0.01396, "grad_norm": 0.8850646018981934, "grad_norm_var": 0.014704141406926618, "learning_rate": 5e-05, "loss": 0.2091, "loss/crossentropy": 2.82315331697464, "loss/hidden": 0.0, "loss/logits": 0.2090906910598278, "loss/reg": 0.3356591761112213, "step": 1396 }, { "epoch": 0.01397, "grad_norm": 0.64495849609375, "grad_norm_var": 0.016554509324091287, "learning_rate": 5e-05, "loss": 0.1983, "loss/crossentropy": 2.828568756580353, "loss/hidden": 0.0, "loss/logits": 0.19834281504154205, "loss/reg": 0.33533018827438354, "step": 1397 }, { "epoch": 0.01398, "grad_norm": 0.6836656928062439, "grad_norm_var": 0.019443931585553284, "learning_rate": 5e-05, "loss": 0.1835, "loss/crossentropy": 2.9815375208854675, "loss/hidden": 0.0, "loss/logits": 0.18353765830397606, "loss/reg": 0.33512067794799805, "step": 1398 }, { "epoch": 0.01399, "grad_norm": 0.6029654741287231, "grad_norm_var": 0.019979046395336472, "learning_rate": 5e-05, "loss": 0.1822, "loss/crossentropy": 2.761033535003662, "loss/hidden": 0.0, "loss/logits": 0.18222413212060928, "loss/reg": 0.3350771963596344, "step": 1399 }, { "epoch": 0.014, "grad_norm": 0.47109782695770264, "grad_norm_var": 0.01996124664179851, "learning_rate": 5e-05, "loss": 0.2, "loss/crossentropy": 2.7463910579681396, "loss/hidden": 0.0, "loss/logits": 0.1999712735414505, "loss/reg": 0.3351133465766907, "step": 1400 }, { "epoch": 0.01401, "grad_norm": 0.5447381734848022, "grad_norm_var": 0.019009679499538554, "learning_rate": 5e-05, "loss": 0.1917, "loss/crossentropy": 2.8365535140037537, "loss/hidden": 0.0, "loss/logits": 0.1917371228337288, "loss/reg": 0.33500272035598755, "step": 1401 }, { "epoch": 0.01402, "grad_norm": 0.46354740858078003, "grad_norm_var": 0.018795346212056297, "learning_rate": 5e-05, "loss": 0.178, "loss/crossentropy": 2.7419124841690063, "loss/hidden": 0.0, "loss/logits": 0.178000096231699, "loss/reg": 0.3347359597682953, "step": 1402 }, { "epoch": 0.01403, "grad_norm": 0.491256445646286, "grad_norm_var": 0.018331290897623552, "learning_rate": 5e-05, "loss": 0.1757, "loss/crossentropy": 2.8391591906547546, "loss/hidden": 0.0, "loss/logits": 0.17574496194720268, "loss/reg": 0.3345773220062256, "step": 1403 }, { "epoch": 0.01404, "grad_norm": 0.49702465534210205, "grad_norm_var": 0.018318199820014057, "learning_rate": 5e-05, "loss": 0.1867, "loss/crossentropy": 2.6861689686775208, "loss/hidden": 0.0, "loss/logits": 0.18665464594960213, "loss/reg": 0.3345455229282379, "step": 1404 }, { "epoch": 0.01405, "grad_norm": 0.44650334119796753, "grad_norm_var": 0.017848003802608627, "learning_rate": 5e-05, "loss": 0.186, "loss/crossentropy": 2.756381392478943, "loss/hidden": 0.0, "loss/logits": 0.18604868277907372, "loss/reg": 0.3343205153942108, "step": 1405 }, { "epoch": 0.01406, "grad_norm": 0.401306688785553, "grad_norm_var": 0.017801663309980245, "learning_rate": 5e-05, "loss": 0.1823, "loss/crossentropy": 2.8077072501182556, "loss/hidden": 0.0, "loss/logits": 0.18225860595703125, "loss/reg": 0.3340727686882019, "step": 1406 }, { "epoch": 0.01407, "grad_norm": 0.41499000787734985, "grad_norm_var": 0.01825316018244531, "learning_rate": 5e-05, "loss": 0.1772, "loss/crossentropy": 2.7671008706092834, "loss/hidden": 0.0, "loss/logits": 0.17718957737088203, "loss/reg": 0.33401182293891907, "step": 1407 }, { "epoch": 0.01408, "grad_norm": 1.1966899633407593, "grad_norm_var": 0.04498908691302337, "learning_rate": 5e-05, "loss": 0.2134, "loss/crossentropy": 2.739284873008728, "loss/hidden": 0.0, "loss/logits": 0.21337352693080902, "loss/reg": 0.333707332611084, "step": 1408 }, { "epoch": 0.01409, "grad_norm": 0.38434508442878723, "grad_norm_var": 0.04584567574871746, "learning_rate": 5e-05, "loss": 0.1681, "loss/crossentropy": 2.7987306118011475, "loss/hidden": 0.0, "loss/logits": 0.16806652024388313, "loss/reg": 0.3335602283477783, "step": 1409 }, { "epoch": 0.0141, "grad_norm": 0.4422290325164795, "grad_norm_var": 0.045690561842025475, "learning_rate": 5e-05, "loss": 0.1779, "loss/crossentropy": 2.8423770666122437, "loss/hidden": 0.0, "loss/logits": 0.17789217457175255, "loss/reg": 0.3335031569004059, "step": 1410 }, { "epoch": 0.01411, "grad_norm": 0.5002337098121643, "grad_norm_var": 0.04454362285175512, "learning_rate": 5e-05, "loss": 0.1723, "loss/crossentropy": 2.893128514289856, "loss/hidden": 0.0, "loss/logits": 0.17233337834477425, "loss/reg": 0.33329179883003235, "step": 1411 }, { "epoch": 0.01412, "grad_norm": 0.5137535929679871, "grad_norm_var": 0.037409548750080866, "learning_rate": 5e-05, "loss": 0.1804, "loss/crossentropy": 2.744915783405304, "loss/hidden": 0.0, "loss/logits": 0.18043798580765724, "loss/reg": 0.33311185240745544, "step": 1412 }, { "epoch": 0.01413, "grad_norm": 0.5447583794593811, "grad_norm_var": 0.036684325799484024, "learning_rate": 5e-05, "loss": 0.1927, "loss/crossentropy": 2.7760583758354187, "loss/hidden": 0.0, "loss/logits": 0.19274943321943283, "loss/reg": 0.3327620029449463, "step": 1413 }, { "epoch": 0.01414, "grad_norm": 0.392659455537796, "grad_norm_var": 0.036303595481089045, "learning_rate": 5e-05, "loss": 0.179, "loss/crossentropy": 2.7774463891983032, "loss/hidden": 0.0, "loss/logits": 0.17896684631705284, "loss/reg": 0.3325439393520355, "step": 1414 }, { "epoch": 0.01415, "grad_norm": 0.41512471437454224, "grad_norm_var": 0.03641231982942665, "learning_rate": 5e-05, "loss": 0.1746, "loss/crossentropy": 2.742201328277588, "loss/hidden": 0.0, "loss/logits": 0.17458953335881233, "loss/reg": 0.3325901925563812, "step": 1415 }, { "epoch": 0.01416, "grad_norm": 0.3924265503883362, "grad_norm_var": 0.03718115312046063, "learning_rate": 5e-05, "loss": 0.1775, "loss/crossentropy": 2.9012945890426636, "loss/hidden": 0.0, "loss/logits": 0.17746540158987045, "loss/reg": 0.3325088322162628, "step": 1416 }, { "epoch": 0.01417, "grad_norm": 0.40935465693473816, "grad_norm_var": 0.037566040337381645, "learning_rate": 5e-05, "loss": 0.1774, "loss/crossentropy": 2.635285258293152, "loss/hidden": 0.0, "loss/logits": 0.17738812789320946, "loss/reg": 0.3322452902793884, "step": 1417 }, { "epoch": 0.01418, "grad_norm": 0.4004879593849182, "grad_norm_var": 0.038071772400388545, "learning_rate": 5e-05, "loss": 0.1869, "loss/crossentropy": 2.7103660702705383, "loss/hidden": 0.0, "loss/logits": 0.18693046271800995, "loss/reg": 0.33252954483032227, "step": 1418 }, { "epoch": 0.01419, "grad_norm": 0.4192851185798645, "grad_norm_var": 0.03838534311213927, "learning_rate": 5e-05, "loss": 0.1899, "loss/crossentropy": 2.7931013703346252, "loss/hidden": 0.0, "loss/logits": 0.1899217963218689, "loss/reg": 0.33239296078681946, "step": 1419 }, { "epoch": 0.0142, "grad_norm": 0.43285539746284485, "grad_norm_var": 0.03854579184943554, "learning_rate": 5e-05, "loss": 0.185, "loss/crossentropy": 2.877627670764923, "loss/hidden": 0.0, "loss/logits": 0.1849670521914959, "loss/reg": 0.3321034014225006, "step": 1420 }, { "epoch": 0.01421, "grad_norm": 0.527352511882782, "grad_norm_var": 0.038575044821860145, "learning_rate": 5e-05, "loss": 0.2061, "loss/crossentropy": 2.8799533247947693, "loss/hidden": 0.0, "loss/logits": 0.20612693578004837, "loss/reg": 0.3322935402393341, "step": 1421 }, { "epoch": 0.01422, "grad_norm": 0.4230918884277344, "grad_norm_var": 0.038356547111059076, "learning_rate": 5e-05, "loss": 0.1848, "loss/crossentropy": 2.646717071533203, "loss/hidden": 0.0, "loss/logits": 0.1847631335258484, "loss/reg": 0.332378089427948, "step": 1422 }, { "epoch": 0.01423, "grad_norm": 0.45158520340919495, "grad_norm_var": 0.03808350610884447, "learning_rate": 5e-05, "loss": 0.1885, "loss/crossentropy": 2.7849010825157166, "loss/hidden": 0.0, "loss/logits": 0.18853705003857613, "loss/reg": 0.3325144052505493, "step": 1423 }, { "epoch": 0.01424, "grad_norm": 0.4236609935760498, "grad_norm_var": 0.0026331131141552277, "learning_rate": 5e-05, "loss": 0.191, "loss/crossentropy": 2.865929663181305, "loss/hidden": 0.0, "loss/logits": 0.1910322792828083, "loss/reg": 0.3322553038597107, "step": 1424 }, { "epoch": 0.01425, "grad_norm": 0.44136399030685425, "grad_norm_var": 0.0023974154388894794, "learning_rate": 5e-05, "loss": 0.1796, "loss/crossentropy": 2.716752767562866, "loss/hidden": 0.0, "loss/logits": 0.1796225607395172, "loss/reg": 0.3321329355239868, "step": 1425 }, { "epoch": 0.01426, "grad_norm": 0.41880685091018677, "grad_norm_var": 0.0024423518696018074, "learning_rate": 5e-05, "loss": 0.1804, "loss/crossentropy": 2.8317967653274536, "loss/hidden": 0.0, "loss/logits": 0.1803799755871296, "loss/reg": 0.3319120407104492, "step": 1426 }, { "epoch": 0.01427, "grad_norm": 0.4953824281692505, "grad_norm_var": 0.0024075619636581298, "learning_rate": 5e-05, "loss": 0.2021, "loss/crossentropy": 2.7110130190849304, "loss/hidden": 0.0, "loss/logits": 0.2020895779132843, "loss/reg": 0.3315524458885193, "step": 1427 }, { "epoch": 0.01428, "grad_norm": 0.43826693296432495, "grad_norm_var": 0.002060349845043182, "learning_rate": 5e-05, "loss": 0.1828, "loss/crossentropy": 2.6995094418525696, "loss/hidden": 0.0, "loss/logits": 0.1828344240784645, "loss/reg": 0.33108481764793396, "step": 1428 }, { "epoch": 0.01429, "grad_norm": 0.48634955286979675, "grad_norm_var": 0.0014511434278135504, "learning_rate": 5e-05, "loss": 0.1987, "loss/crossentropy": 2.8200928568840027, "loss/hidden": 0.0, "loss/logits": 0.19867318496108055, "loss/reg": 0.33056724071502686, "step": 1429 }, { "epoch": 0.0143, "grad_norm": 0.42899009585380554, "grad_norm_var": 0.0013260984761105635, "learning_rate": 5e-05, "loss": 0.1931, "loss/crossentropy": 2.751803755760193, "loss/hidden": 0.0, "loss/logits": 0.1930784285068512, "loss/reg": 0.33048540353775024, "step": 1430 }, { "epoch": 0.01431, "grad_norm": 0.41038912534713745, "grad_norm_var": 0.0013418011499895224, "learning_rate": 5e-05, "loss": 0.1889, "loss/crossentropy": 2.675851881504059, "loss/hidden": 0.0, "loss/logits": 0.1889328770339489, "loss/reg": 0.32985273003578186, "step": 1431 }, { "epoch": 0.01432, "grad_norm": 0.41080325841903687, "grad_norm_var": 0.0012525211085030721, "learning_rate": 5e-05, "loss": 0.1749, "loss/crossentropy": 2.768611788749695, "loss/hidden": 0.0, "loss/logits": 0.17488233372569084, "loss/reg": 0.3295130729675293, "step": 1432 }, { "epoch": 0.01433, "grad_norm": 0.4548375904560089, "grad_norm_var": 0.0012042980358920633, "learning_rate": 5e-05, "loss": 0.1889, "loss/crossentropy": 2.832592725753784, "loss/hidden": 0.0, "loss/logits": 0.18888084217905998, "loss/reg": 0.329073041677475, "step": 1433 }, { "epoch": 0.01434, "grad_norm": 0.39353147149086, "grad_norm_var": 0.0012453340801292492, "learning_rate": 5e-05, "loss": 0.1786, "loss/crossentropy": 2.720345437526703, "loss/hidden": 0.0, "loss/logits": 0.17862798646092415, "loss/reg": 0.3285079896450043, "step": 1434 }, { "epoch": 0.01435, "grad_norm": 0.4022473990917206, "grad_norm_var": 0.0013128848624843574, "learning_rate": 5e-05, "loss": 0.1757, "loss/crossentropy": 2.736500561237335, "loss/hidden": 0.0, "loss/logits": 0.17573554441332817, "loss/reg": 0.3281388580799103, "step": 1435 }, { "epoch": 0.01436, "grad_norm": 0.476757287979126, "grad_norm_var": 0.0013917018707874835, "learning_rate": 5e-05, "loss": 0.1821, "loss/crossentropy": 2.707496464252472, "loss/hidden": 0.0, "loss/logits": 0.18208077177405357, "loss/reg": 0.3277563154697418, "step": 1436 }, { "epoch": 0.01437, "grad_norm": 0.5132519602775574, "grad_norm_var": 0.001245000968983175, "learning_rate": 5e-05, "loss": 0.1876, "loss/crossentropy": 2.9361810088157654, "loss/hidden": 0.0, "loss/logits": 0.18756385147571564, "loss/reg": 0.32780468463897705, "step": 1437 }, { "epoch": 0.01438, "grad_norm": 5.252004623413086, "grad_norm_var": 1.4465788113241271, "learning_rate": 5e-05, "loss": 0.3706, "loss/crossentropy": 2.7840746641159058, "loss/hidden": 0.0, "loss/logits": 0.3705543540418148, "loss/reg": 0.32727310061454773, "step": 1438 }, { "epoch": 0.01439, "grad_norm": 0.5200448632240295, "grad_norm_var": 1.4442058751115485, "learning_rate": 5e-05, "loss": 0.1887, "loss/crossentropy": 2.597719967365265, "loss/hidden": 0.0, "loss/logits": 0.1886601708829403, "loss/reg": 0.32672905921936035, "step": 1439 }, { "epoch": 0.0144, "grad_norm": 0.4328918755054474, "grad_norm_var": 1.4438121102339199, "learning_rate": 5e-05, "loss": 0.1826, "loss/crossentropy": 2.904754638671875, "loss/hidden": 0.0, "loss/logits": 0.18260694295167923, "loss/reg": 0.32659485936164856, "step": 1440 }, { "epoch": 0.01441, "grad_norm": 0.43532076478004456, "grad_norm_var": 1.4440618676626207, "learning_rate": 5e-05, "loss": 0.2003, "loss/crossentropy": 2.6880961060523987, "loss/hidden": 0.0, "loss/logits": 0.20031968131661415, "loss/reg": 0.3261314928531647, "step": 1441 }, { "epoch": 0.01442, "grad_norm": 0.5566297769546509, "grad_norm_var": 1.4391975286332512, "learning_rate": 5e-05, "loss": 0.2007, "loss/crossentropy": 2.7995364665985107, "loss/hidden": 0.0, "loss/logits": 0.2006579264998436, "loss/reg": 0.325594425201416, "step": 1442 }, { "epoch": 0.01443, "grad_norm": 0.38898059725761414, "grad_norm_var": 1.443612844523848, "learning_rate": 5e-05, "loss": 0.1681, "loss/crossentropy": 2.795620858669281, "loss/hidden": 0.0, "loss/logits": 0.16808659955859184, "loss/reg": 0.3255971372127533, "step": 1443 }, { "epoch": 0.01444, "grad_norm": 0.41226285696029663, "grad_norm_var": 1.444736232919824, "learning_rate": 5e-05, "loss": 0.1862, "loss/crossentropy": 2.6894450187683105, "loss/hidden": 0.0, "loss/logits": 0.1862075813114643, "loss/reg": 0.324940025806427, "step": 1444 }, { "epoch": 0.01445, "grad_norm": 0.39230281114578247, "grad_norm_var": 1.4485757309770675, "learning_rate": 5e-05, "loss": 0.1674, "loss/crossentropy": 2.6630411744117737, "loss/hidden": 0.0, "loss/logits": 0.16739049926400185, "loss/reg": 0.32446718215942383, "step": 1445 }, { "epoch": 0.01446, "grad_norm": 0.44306260347366333, "grad_norm_var": 1.4479997126304482, "learning_rate": 5e-05, "loss": 0.2032, "loss/crossentropy": 2.6949720978736877, "loss/hidden": 0.0, "loss/logits": 0.2031894102692604, "loss/reg": 0.3243202269077301, "step": 1446 }, { "epoch": 0.01447, "grad_norm": 0.38080981373786926, "grad_norm_var": 1.44936798692938, "learning_rate": 5e-05, "loss": 0.1816, "loss/crossentropy": 2.8046231269836426, "loss/hidden": 0.0, "loss/logits": 0.1815660074353218, "loss/reg": 0.3241801857948303, "step": 1447 }, { "epoch": 0.01448, "grad_norm": 0.41833654046058655, "grad_norm_var": 1.44903926037765, "learning_rate": 5e-05, "loss": 0.1866, "loss/crossentropy": 2.7694694995880127, "loss/hidden": 0.0, "loss/logits": 0.186565563082695, "loss/reg": 0.3241211771965027, "step": 1448 }, { "epoch": 0.01449, "grad_norm": 0.42230433225631714, "grad_norm_var": 1.450351400104452, "learning_rate": 5e-05, "loss": 0.1884, "loss/crossentropy": 2.736854612827301, "loss/hidden": 0.0, "loss/logits": 0.18841412290930748, "loss/reg": 0.32373905181884766, "step": 1449 }, { "epoch": 0.0145, "grad_norm": 0.4756649136543274, "grad_norm_var": 1.4469782924191208, "learning_rate": 5e-05, "loss": 0.1989, "loss/crossentropy": 2.843453288078308, "loss/hidden": 0.0, "loss/logits": 0.19888193905353546, "loss/reg": 0.323408842086792, "step": 1450 }, { "epoch": 0.01451, "grad_norm": 0.40586841106414795, "grad_norm_var": 1.4468135437055694, "learning_rate": 5e-05, "loss": 0.1937, "loss/crossentropy": 2.7633622884750366, "loss/hidden": 0.0, "loss/logits": 0.19368966296315193, "loss/reg": 0.3229864537715912, "step": 1451 }, { "epoch": 0.01452, "grad_norm": 0.39573049545288086, "grad_norm_var": 1.4501262419895353, "learning_rate": 5e-05, "loss": 0.1802, "loss/crossentropy": 2.831085741519928, "loss/hidden": 0.0, "loss/logits": 0.18024462461471558, "loss/reg": 0.3226442039012909, "step": 1452 }, { "epoch": 0.01453, "grad_norm": 0.470074325799942, "grad_norm_var": 1.4515501209646136, "learning_rate": 5e-05, "loss": 0.1913, "loss/crossentropy": 2.713973581790924, "loss/hidden": 0.0, "loss/logits": 0.19133643433451653, "loss/reg": 0.3224208354949951, "step": 1453 }, { "epoch": 0.01454, "grad_norm": 0.43695881962776184, "grad_norm_var": 0.0023440839446212556, "learning_rate": 5e-05, "loss": 0.193, "loss/crossentropy": 2.843654155731201, "loss/hidden": 0.0, "loss/logits": 0.19298715144395828, "loss/reg": 0.3222367763519287, "step": 1454 }, { "epoch": 0.01455, "grad_norm": 0.4292144179344177, "grad_norm_var": 0.0018503859054969578, "learning_rate": 5e-05, "loss": 0.1909, "loss/crossentropy": 2.729588508605957, "loss/hidden": 0.0, "loss/logits": 0.1909087374806404, "loss/reg": 0.32221949100494385, "step": 1455 }, { "epoch": 0.01456, "grad_norm": 0.41604289412498474, "grad_norm_var": 0.0018639367982278092, "learning_rate": 5e-05, "loss": 0.186, "loss/crossentropy": 2.8195661902427673, "loss/hidden": 0.0, "loss/logits": 0.18599796295166016, "loss/reg": 0.32196280360221863, "step": 1456 }, { "epoch": 0.01457, "grad_norm": 0.43275174498558044, "grad_norm_var": 0.001862517410224144, "learning_rate": 5e-05, "loss": 0.1865, "loss/crossentropy": 2.6253294944763184, "loss/hidden": 0.0, "loss/logits": 0.18651529401540756, "loss/reg": 0.3218734562397003, "step": 1457 }, { "epoch": 0.01458, "grad_norm": 0.4816407561302185, "grad_norm_var": 0.0009459869622255016, "learning_rate": 5e-05, "loss": 0.1812, "loss/crossentropy": 2.692448377609253, "loss/hidden": 0.0, "loss/logits": 0.1812092363834381, "loss/reg": 0.3217640817165375, "step": 1458 }, { "epoch": 0.01459, "grad_norm": 0.4057413339614868, "grad_norm_var": 0.0008827694785993619, "learning_rate": 5e-05, "loss": 0.183, "loss/crossentropy": 2.7211183309555054, "loss/hidden": 0.0, "loss/logits": 0.18299619108438492, "loss/reg": 0.3218323588371277, "step": 1459 }, { "epoch": 0.0146, "grad_norm": 0.45075273513793945, "grad_norm_var": 0.0009039750686443989, "learning_rate": 5e-05, "loss": 0.2014, "loss/crossentropy": 2.7342368960380554, "loss/hidden": 0.0, "loss/logits": 0.20144187659025192, "loss/reg": 0.32162514328956604, "step": 1460 }, { "epoch": 0.01461, "grad_norm": 0.38863077759742737, "grad_norm_var": 0.0009225785766095525, "learning_rate": 5e-05, "loss": 0.1782, "loss/crossentropy": 2.7896840572357178, "loss/hidden": 0.0, "loss/logits": 0.1782398670911789, "loss/reg": 0.3213615417480469, "step": 1461 }, { "epoch": 0.01462, "grad_norm": 0.43722450733184814, "grad_norm_var": 0.0009132555739983024, "learning_rate": 5e-05, "loss": 0.1949, "loss/crossentropy": 2.655126214027405, "loss/hidden": 0.0, "loss/logits": 0.1948925107717514, "loss/reg": 0.32134559750556946, "step": 1462 }, { "epoch": 0.01463, "grad_norm": 0.44507652521133423, "grad_norm_var": 0.0007671615595201094, "learning_rate": 5e-05, "loss": 0.1898, "loss/crossentropy": 2.930400013923645, "loss/hidden": 0.0, "loss/logits": 0.18984581902623177, "loss/reg": 0.3215232491493225, "step": 1463 }, { "epoch": 0.01464, "grad_norm": 0.4086571931838989, "grad_norm_var": 0.0007906520438079749, "learning_rate": 5e-05, "loss": 0.183, "loss/crossentropy": 2.8166850805282593, "loss/hidden": 0.0, "loss/logits": 0.18298694863915443, "loss/reg": 0.3212527930736542, "step": 1464 }, { "epoch": 0.01465, "grad_norm": 0.4061455726623535, "grad_norm_var": 0.0008265589034583779, "learning_rate": 5e-05, "loss": 0.1945, "loss/crossentropy": 2.8387283086776733, "loss/hidden": 0.0, "loss/logits": 0.19454342499375343, "loss/reg": 0.3211488127708435, "step": 1465 }, { "epoch": 0.01466, "grad_norm": 0.3903484642505646, "grad_norm_var": 0.0007664180414064248, "learning_rate": 5e-05, "loss": 0.1831, "loss/crossentropy": 2.6449853777885437, "loss/hidden": 0.0, "loss/logits": 0.18307576328516006, "loss/reg": 0.32094722986221313, "step": 1466 }, { "epoch": 0.01467, "grad_norm": 0.4060763418674469, "grad_norm_var": 0.0007658888489930941, "learning_rate": 5e-05, "loss": 0.1913, "loss/crossentropy": 2.8921971321105957, "loss/hidden": 0.0, "loss/logits": 0.19125835224986076, "loss/reg": 0.32103174924850464, "step": 1467 }, { "epoch": 0.01468, "grad_norm": 0.3721385896205902, "grad_norm_var": 0.0008929545140344312, "learning_rate": 5e-05, "loss": 0.1783, "loss/crossentropy": 2.7999913692474365, "loss/hidden": 0.0, "loss/logits": 0.17828701436519623, "loss/reg": 0.32092857360839844, "step": 1468 }, { "epoch": 0.01469, "grad_norm": 0.43893900513648987, "grad_norm_var": 0.0007605776884580824, "learning_rate": 5e-05, "loss": 0.2017, "loss/crossentropy": 2.703360438346863, "loss/hidden": 0.0, "loss/logits": 0.20167458429932594, "loss/reg": 0.32091856002807617, "step": 1469 }, { "epoch": 0.0147, "grad_norm": 0.3954939842224121, "grad_norm_var": 0.0007833781061746851, "learning_rate": 5e-05, "loss": 0.1845, "loss/crossentropy": 2.711470603942871, "loss/hidden": 0.0, "loss/logits": 0.18445312604308128, "loss/reg": 0.3209270238876343, "step": 1470 }, { "epoch": 0.01471, "grad_norm": 0.4242197275161743, "grad_norm_var": 0.0007781713218279251, "learning_rate": 5e-05, "loss": 0.1784, "loss/crossentropy": 2.800477921962738, "loss/hidden": 0.0, "loss/logits": 0.17844953760504723, "loss/reg": 0.3208945393562317, "step": 1471 }, { "epoch": 0.01472, "grad_norm": 0.5070668458938599, "grad_norm_var": 0.0012632423537154215, "learning_rate": 5e-05, "loss": 0.217, "loss/crossentropy": 2.6673134565353394, "loss/hidden": 0.0, "loss/logits": 0.217046070843935, "loss/reg": 0.3206978142261505, "step": 1472 }, { "epoch": 0.01473, "grad_norm": 0.6107890009880066, "grad_norm_var": 0.003441829709368316, "learning_rate": 5e-05, "loss": 0.1978, "loss/crossentropy": 2.7472557425498962, "loss/hidden": 0.0, "loss/logits": 0.1977555900812149, "loss/reg": 0.32051151990890503, "step": 1473 }, { "epoch": 0.01474, "grad_norm": 0.4118281602859497, "grad_norm_var": 0.0033174956470275797, "learning_rate": 5e-05, "loss": 0.1722, "loss/crossentropy": 2.9128127098083496, "loss/hidden": 0.0, "loss/logits": 0.17218394204974174, "loss/reg": 0.32021796703338623, "step": 1474 }, { "epoch": 0.01475, "grad_norm": 0.41620731353759766, "grad_norm_var": 0.0032888212549308294, "learning_rate": 5e-05, "loss": 0.181, "loss/crossentropy": 2.5322376489639282, "loss/hidden": 0.0, "loss/logits": 0.18099921196699142, "loss/reg": 0.3202148973941803, "step": 1475 }, { "epoch": 0.01476, "grad_norm": 0.4206521213054657, "grad_norm_var": 0.003269583347101568, "learning_rate": 5e-05, "loss": 0.1991, "loss/crossentropy": 2.9256187677383423, "loss/hidden": 0.0, "loss/logits": 0.19911302253603935, "loss/reg": 0.31978023052215576, "step": 1476 }, { "epoch": 0.01477, "grad_norm": 0.402765154838562, "grad_norm_var": 0.00320416546100963, "learning_rate": 5e-05, "loss": 0.1739, "loss/crossentropy": 2.8196975588798523, "loss/hidden": 0.0, "loss/logits": 0.1738714836537838, "loss/reg": 0.3198913335800171, "step": 1477 }, { "epoch": 0.01478, "grad_norm": 0.4302436411380768, "grad_norm_var": 0.003201279622525144, "learning_rate": 5e-05, "loss": 0.1975, "loss/crossentropy": 2.8199687600135803, "loss/hidden": 0.0, "loss/logits": 0.19751255586743355, "loss/reg": 0.3197462856769562, "step": 1478 }, { "epoch": 0.01479, "grad_norm": 0.3871346414089203, "grad_norm_var": 0.0032978433289319597, "learning_rate": 5e-05, "loss": 0.1809, "loss/crossentropy": 2.894818365573883, "loss/hidden": 0.0, "loss/logits": 0.18093406781554222, "loss/reg": 0.31995436549186707, "step": 1479 }, { "epoch": 0.0148, "grad_norm": 0.4253666400909424, "grad_norm_var": 0.00327488596708051, "learning_rate": 5e-05, "loss": 0.2238, "loss/crossentropy": 2.6779895424842834, "loss/hidden": 0.0, "loss/logits": 0.22382857650518417, "loss/reg": 0.32006698846817017, "step": 1480 }, { "epoch": 0.01481, "grad_norm": 0.3774517774581909, "grad_norm_var": 0.003409337799882724, "learning_rate": 5e-05, "loss": 0.185, "loss/crossentropy": 2.9879230856895447, "loss/hidden": 0.0, "loss/logits": 0.18503187596797943, "loss/reg": 0.3200933635234833, "step": 1481 }, { "epoch": 0.01482, "grad_norm": 0.3996993899345398, "grad_norm_var": 0.0033702965911779495, "learning_rate": 5e-05, "loss": 0.1777, "loss/crossentropy": 2.8133975863456726, "loss/hidden": 0.0, "loss/logits": 0.17768998816609383, "loss/reg": 0.3199408948421478, "step": 1482 }, { "epoch": 0.01483, "grad_norm": 0.39417120814323425, "grad_norm_var": 0.003411779966968931, "learning_rate": 5e-05, "loss": 0.1743, "loss/crossentropy": 2.8280540704727173, "loss/hidden": 0.0, "loss/logits": 0.17427058145403862, "loss/reg": 0.31996890902519226, "step": 1483 }, { "epoch": 0.01484, "grad_norm": 0.38665926456451416, "grad_norm_var": 0.003320899333326699, "learning_rate": 5e-05, "loss": 0.1667, "loss/crossentropy": 2.8020249605178833, "loss/hidden": 0.0, "loss/logits": 0.16668550670146942, "loss/reg": 0.32019588351249695, "step": 1484 }, { "epoch": 0.01485, "grad_norm": 0.37642037868499756, "grad_norm_var": 0.0034639385604008145, "learning_rate": 5e-05, "loss": 0.1777, "loss/crossentropy": 2.76164311170578, "loss/hidden": 0.0, "loss/logits": 0.17771953716874123, "loss/reg": 0.31992995738983154, "step": 1485 }, { "epoch": 0.01486, "grad_norm": 0.37894299626350403, "grad_norm_var": 0.003541507236000992, "learning_rate": 5e-05, "loss": 0.1796, "loss/crossentropy": 2.8712287545204163, "loss/hidden": 0.0, "loss/logits": 0.1796080395579338, "loss/reg": 0.3199402987957001, "step": 1486 }, { "epoch": 0.01487, "grad_norm": 0.4233422875404358, "grad_norm_var": 0.003541278249186149, "learning_rate": 5e-05, "loss": 0.1848, "loss/crossentropy": 2.6870920658111572, "loss/hidden": 0.0, "loss/logits": 0.18476100638508797, "loss/reg": 0.32012447714805603, "step": 1487 }, { "epoch": 0.01488, "grad_norm": 0.40877363085746765, "grad_norm_var": 0.0030275900443976172, "learning_rate": 5e-05, "loss": 0.1839, "loss/crossentropy": 2.7137195467948914, "loss/hidden": 0.0, "loss/logits": 0.18385036662220955, "loss/reg": 0.3199234902858734, "step": 1488 }, { "epoch": 0.01489, "grad_norm": 0.3917202651500702, "grad_norm_var": 0.00032727415881818783, "learning_rate": 5e-05, "loss": 0.1822, "loss/crossentropy": 2.790957808494568, "loss/hidden": 0.0, "loss/logits": 0.1822296306490898, "loss/reg": 0.31979694962501526, "step": 1489 }, { "epoch": 0.0149, "grad_norm": 0.3916443884372711, "grad_norm_var": 0.0003261819805256601, "learning_rate": 5e-05, "loss": 0.1835, "loss/crossentropy": 2.8087695240974426, "loss/hidden": 0.0, "loss/logits": 0.18354440852999687, "loss/reg": 0.31964197754859924, "step": 1490 }, { "epoch": 0.01491, "grad_norm": 0.5379776358604431, "grad_norm_var": 0.0015047150749272948, "learning_rate": 5e-05, "loss": 0.1972, "loss/crossentropy": 2.773056209087372, "loss/hidden": 0.0, "loss/logits": 0.19723474234342575, "loss/reg": 0.31959447264671326, "step": 1491 }, { "epoch": 0.01492, "grad_norm": 0.3563421368598938, "grad_norm_var": 0.001657374311148004, "learning_rate": 5e-05, "loss": 0.1671, "loss/crossentropy": 2.7765135765075684, "loss/hidden": 0.0, "loss/logits": 0.16713140532374382, "loss/reg": 0.31944823265075684, "step": 1492 }, { "epoch": 0.01493, "grad_norm": 2.004755973815918, "grad_norm_var": 0.16172987467091335, "learning_rate": 5e-05, "loss": 0.2782, "loss/crossentropy": 2.8666852712631226, "loss/hidden": 0.0, "loss/logits": 0.2781948372721672, "loss/reg": 0.3195701837539673, "step": 1493 }, { "epoch": 0.01494, "grad_norm": 0.44003087282180786, "grad_norm_var": 0.1616390700598601, "learning_rate": 5e-05, "loss": 0.1854, "loss/crossentropy": 2.8815794587135315, "loss/hidden": 0.0, "loss/logits": 0.18543830513954163, "loss/reg": 0.319428026676178, "step": 1494 }, { "epoch": 0.01495, "grad_norm": 0.5326512455940247, "grad_norm_var": 0.16067513812889322, "learning_rate": 5e-05, "loss": 0.1911, "loss/crossentropy": 2.8269136548042297, "loss/hidden": 0.0, "loss/logits": 0.1911088153719902, "loss/reg": 0.31969374418258667, "step": 1495 }, { "epoch": 0.01496, "grad_norm": 0.561933159828186, "grad_norm_var": 0.16022465644667688, "learning_rate": 5e-05, "loss": 0.1945, "loss/crossentropy": 2.7607564330101013, "loss/hidden": 0.0, "loss/logits": 0.1944938227534294, "loss/reg": 0.31940361857414246, "step": 1496 }, { "epoch": 0.01497, "grad_norm": 0.40673887729644775, "grad_norm_var": 0.159711245063064, "learning_rate": 5e-05, "loss": 0.1876, "loss/crossentropy": 2.820189416408539, "loss/hidden": 0.0, "loss/logits": 0.18759127333760262, "loss/reg": 0.3194156289100647, "step": 1497 }, { "epoch": 0.01498, "grad_norm": 0.4590235650539398, "grad_norm_var": 0.1589441428618287, "learning_rate": 5e-05, "loss": 0.1852, "loss/crossentropy": 2.7224557995796204, "loss/hidden": 0.0, "loss/logits": 0.1851620338857174, "loss/reg": 0.31923454999923706, "step": 1498 }, { "epoch": 0.01499, "grad_norm": 0.38967129588127136, "grad_norm_var": 0.1590258214404742, "learning_rate": 5e-05, "loss": 0.1734, "loss/crossentropy": 2.8817696571350098, "loss/hidden": 0.0, "loss/logits": 0.1734498143196106, "loss/reg": 0.31931790709495544, "step": 1499 }, { "epoch": 0.015, "grad_norm": 0.39763343334198, "grad_norm_var": 0.1588266609931339, "learning_rate": 5e-05, "loss": 0.1837, "loss/crossentropy": 2.7269198298454285, "loss/hidden": 0.0, "loss/logits": 0.18366099521517754, "loss/reg": 0.31902599334716797, "step": 1500 }, { "epoch": 0.01501, "grad_norm": 0.39096909761428833, "grad_norm_var": 0.1585446873380513, "learning_rate": 5e-05, "loss": 0.1763, "loss/crossentropy": 2.6625503301620483, "loss/hidden": 0.0, "loss/logits": 0.1763039343059063, "loss/reg": 0.31906858086586, "step": 1501 }, { "epoch": 0.01502, "grad_norm": 0.41807821393013, "grad_norm_var": 0.1578547501517055, "learning_rate": 5e-05, "loss": 0.1719, "loss/crossentropy": 2.8757044076919556, "loss/hidden": 0.0, "loss/logits": 0.17192861065268517, "loss/reg": 0.31935369968414307, "step": 1502 }, { "epoch": 0.01503, "grad_norm": 0.41775619983673096, "grad_norm_var": 0.1579375967265563, "learning_rate": 5e-05, "loss": 0.1683, "loss/crossentropy": 2.9289286136627197, "loss/hidden": 0.0, "loss/logits": 0.16832125186920166, "loss/reg": 0.3191233277320862, "step": 1503 }, { "epoch": 0.01504, "grad_norm": 0.39584484696388245, "grad_norm_var": 0.15815978733492111, "learning_rate": 5e-05, "loss": 0.1662, "loss/crossentropy": 2.938508689403534, "loss/hidden": 0.0, "loss/logits": 0.16621734574437141, "loss/reg": 0.31895801424980164, "step": 1504 }, { "epoch": 0.01505, "grad_norm": 0.45554959774017334, "grad_norm_var": 0.1572307902437639, "learning_rate": 5e-05, "loss": 0.1933, "loss/crossentropy": 2.700239896774292, "loss/hidden": 0.0, "loss/logits": 0.1932990886271, "loss/reg": 0.3190600872039795, "step": 1505 }, { "epoch": 0.01506, "grad_norm": 0.3987845778465271, "grad_norm_var": 0.15709770074840582, "learning_rate": 5e-05, "loss": 0.1853, "loss/crossentropy": 2.8826014399528503, "loss/hidden": 0.0, "loss/logits": 0.1852746531367302, "loss/reg": 0.31903666257858276, "step": 1506 }, { "epoch": 0.01507, "grad_norm": 0.43090760707855225, "grad_norm_var": 0.15777502911638946, "learning_rate": 5e-05, "loss": 0.1897, "loss/crossentropy": 2.8248300552368164, "loss/hidden": 0.0, "loss/logits": 0.18968594819307327, "loss/reg": 0.3190354108810425, "step": 1507 }, { "epoch": 0.01508, "grad_norm": 0.4077306091785431, "grad_norm_var": 0.15676019972870667, "learning_rate": 5e-05, "loss": 0.1776, "loss/crossentropy": 2.803373098373413, "loss/hidden": 0.0, "loss/logits": 0.17760509997606277, "loss/reg": 0.31906116008758545, "step": 1508 }, { "epoch": 0.01509, "grad_norm": 0.3773565888404846, "grad_norm_var": 0.0026652641656208175, "learning_rate": 5e-05, "loss": 0.1779, "loss/crossentropy": 2.786570191383362, "loss/hidden": 0.0, "loss/logits": 0.17787037789821625, "loss/reg": 0.31900233030319214, "step": 1509 }, { "epoch": 0.0151, "grad_norm": 0.5807658433914185, "grad_norm_var": 0.004090612062763255, "learning_rate": 5e-05, "loss": 0.1988, "loss/crossentropy": 2.9518463611602783, "loss/hidden": 0.0, "loss/logits": 0.1988038308918476, "loss/reg": 0.31916406750679016, "step": 1510 }, { "epoch": 0.01511, "grad_norm": 0.41419580578804016, "grad_norm_var": 0.0034858877916350414, "learning_rate": 5e-05, "loss": 0.1796, "loss/crossentropy": 2.862063944339752, "loss/hidden": 0.0, "loss/logits": 0.17959555611014366, "loss/reg": 0.3191160261631012, "step": 1511 }, { "epoch": 0.01512, "grad_norm": 0.4236568808555603, "grad_norm_var": 0.002274911217305927, "learning_rate": 5e-05, "loss": 0.1805, "loss/crossentropy": 2.842565596103668, "loss/hidden": 0.0, "loss/logits": 0.18049268424510956, "loss/reg": 0.3188512921333313, "step": 1512 }, { "epoch": 0.01513, "grad_norm": 0.4198692739009857, "grad_norm_var": 0.002257583139114235, "learning_rate": 5e-05, "loss": 0.1821, "loss/crossentropy": 2.8672720789909363, "loss/hidden": 0.0, "loss/logits": 0.18206734582781792, "loss/reg": 0.31882813572883606, "step": 1513 }, { "epoch": 0.01514, "grad_norm": 0.39526188373565674, "grad_norm_var": 0.0022106274462791566, "learning_rate": 5e-05, "loss": 0.1776, "loss/crossentropy": 2.7634100317955017, "loss/hidden": 0.0, "loss/logits": 0.17760274186730385, "loss/reg": 0.3191683292388916, "step": 1514 }, { "epoch": 0.01515, "grad_norm": 0.3696017861366272, "grad_norm_var": 0.0023159609878911783, "learning_rate": 5e-05, "loss": 0.1673, "loss/crossentropy": 2.8186591267585754, "loss/hidden": 0.0, "loss/logits": 0.16728117316961288, "loss/reg": 0.31897711753845215, "step": 1515 }, { "epoch": 0.01516, "grad_norm": 0.6076210737228394, "grad_norm_var": 0.004491222937357468, "learning_rate": 5e-05, "loss": 0.2058, "loss/crossentropy": 2.740968942642212, "loss/hidden": 0.0, "loss/logits": 0.2058316096663475, "loss/reg": 0.31920987367630005, "step": 1516 }, { "epoch": 0.01517, "grad_norm": 0.63004469871521, "grad_norm_var": 0.006771650904836716, "learning_rate": 5e-05, "loss": 0.2211, "loss/crossentropy": 2.8083962202072144, "loss/hidden": 0.0, "loss/logits": 0.22105122730135918, "loss/reg": 0.3194941580295563, "step": 1517 }, { "epoch": 0.01518, "grad_norm": 0.4568759500980377, "grad_norm_var": 0.006719018205195163, "learning_rate": 5e-05, "loss": 0.1837, "loss/crossentropy": 2.689901053905487, "loss/hidden": 0.0, "loss/logits": 0.1836688332259655, "loss/reg": 0.3191361129283905, "step": 1518 }, { "epoch": 0.01519, "grad_norm": 0.4169345498085022, "grad_norm_var": 0.006722468357496351, "learning_rate": 5e-05, "loss": 0.181, "loss/crossentropy": 2.9693748354911804, "loss/hidden": 0.0, "loss/logits": 0.1809803619980812, "loss/reg": 0.31907111406326294, "step": 1519 }, { "epoch": 0.0152, "grad_norm": 0.39265206456184387, "grad_norm_var": 0.006745654074318526, "learning_rate": 5e-05, "loss": 0.1679, "loss/crossentropy": 2.7168623208999634, "loss/hidden": 0.0, "loss/logits": 0.16789568215608597, "loss/reg": 0.3189171552658081, "step": 1520 }, { "epoch": 0.01521, "grad_norm": 0.4453774690628052, "grad_norm_var": 0.006742713158711781, "learning_rate": 5e-05, "loss": 0.1896, "loss/crossentropy": 2.722808003425598, "loss/hidden": 0.0, "loss/logits": 0.18961326777935028, "loss/reg": 0.3186608850955963, "step": 1521 }, { "epoch": 0.01522, "grad_norm": 0.46020424365997314, "grad_norm_var": 0.006575633280915419, "learning_rate": 5e-05, "loss": 0.1802, "loss/crossentropy": 2.84376323223114, "loss/hidden": 0.0, "loss/logits": 0.180158082395792, "loss/reg": 0.3183158338069916, "step": 1522 }, { "epoch": 0.01523, "grad_norm": 0.3949492573738098, "grad_norm_var": 0.006756690235379425, "learning_rate": 5e-05, "loss": 0.1727, "loss/crossentropy": 2.7806177735328674, "loss/hidden": 0.0, "loss/logits": 0.1727108433842659, "loss/reg": 0.3180563449859619, "step": 1523 }, { "epoch": 0.01524, "grad_norm": 0.47901901602745056, "grad_norm_var": 0.006676642158245842, "learning_rate": 5e-05, "loss": 0.194, "loss/crossentropy": 2.852499783039093, "loss/hidden": 0.0, "loss/logits": 0.19404323399066925, "loss/reg": 0.31775060296058655, "step": 1524 }, { "epoch": 0.01525, "grad_norm": 0.4576343297958374, "grad_norm_var": 0.006258797916706445, "learning_rate": 5e-05, "loss": 0.1948, "loss/crossentropy": 2.712754011154175, "loss/hidden": 0.0, "loss/logits": 0.19483289867639542, "loss/reg": 0.3175033926963806, "step": 1525 }, { "epoch": 0.01526, "grad_norm": 0.38728490471839905, "grad_norm_var": 0.0054582990269161835, "learning_rate": 5e-05, "loss": 0.1762, "loss/crossentropy": 2.691767692565918, "loss/hidden": 0.0, "loss/logits": 0.17616674676537514, "loss/reg": 0.31736278533935547, "step": 1526 }, { "epoch": 0.01527, "grad_norm": 0.41456514596939087, "grad_norm_var": 0.0054566946124174264, "learning_rate": 5e-05, "loss": 0.1857, "loss/crossentropy": 2.791685700416565, "loss/hidden": 0.0, "loss/logits": 0.1857350431382656, "loss/reg": 0.31689000129699707, "step": 1527 }, { "epoch": 0.01528, "grad_norm": 0.5231043100357056, "grad_norm_var": 0.005765655634896598, "learning_rate": 5e-05, "loss": 0.1988, "loss/crossentropy": 2.8516095876693726, "loss/hidden": 0.0, "loss/logits": 0.19882458820939064, "loss/reg": 0.31651976704597473, "step": 1528 }, { "epoch": 0.01529, "grad_norm": 0.4133322536945343, "grad_norm_var": 0.005797366677779137, "learning_rate": 5e-05, "loss": 0.184, "loss/crossentropy": 2.648034989833832, "loss/hidden": 0.0, "loss/logits": 0.1840059570968151, "loss/reg": 0.3160998225212097, "step": 1529 }, { "epoch": 0.0153, "grad_norm": 0.41592681407928467, "grad_norm_var": 0.005665578526458381, "learning_rate": 5e-05, "loss": 0.19, "loss/crossentropy": 2.808839738368988, "loss/hidden": 0.0, "loss/logits": 0.19004080072045326, "loss/reg": 0.3158363699913025, "step": 1530 }, { "epoch": 0.01531, "grad_norm": 0.3932291567325592, "grad_norm_var": 0.005434366195139721, "learning_rate": 5e-05, "loss": 0.19, "loss/crossentropy": 2.8083248138427734, "loss/hidden": 0.0, "loss/logits": 0.19004608318209648, "loss/reg": 0.31548190116882324, "step": 1531 }, { "epoch": 0.01532, "grad_norm": 0.40512368083000183, "grad_norm_var": 0.0038912491649181905, "learning_rate": 5e-05, "loss": 0.1843, "loss/crossentropy": 2.8550087809562683, "loss/hidden": 0.0, "loss/logits": 0.1842578835785389, "loss/reg": 0.3152494728565216, "step": 1532 }, { "epoch": 0.01533, "grad_norm": 0.4335882067680359, "grad_norm_var": 0.0014011080485289208, "learning_rate": 5e-05, "loss": 0.1853, "loss/crossentropy": 2.7314233779907227, "loss/hidden": 0.0, "loss/logits": 0.18525658175349236, "loss/reg": 0.31498342752456665, "step": 1533 }, { "epoch": 0.01534, "grad_norm": 0.5044191479682922, "grad_norm_var": 0.00170886619534573, "learning_rate": 5e-05, "loss": 0.213, "loss/crossentropy": 2.749406576156616, "loss/hidden": 0.0, "loss/logits": 0.21298989281058311, "loss/reg": 0.31500688195228577, "step": 1534 }, { "epoch": 0.01535, "grad_norm": 0.42915329337120056, "grad_norm_var": 0.0016910725301432412, "learning_rate": 5e-05, "loss": 0.202, "loss/crossentropy": 2.7210662364959717, "loss/hidden": 0.0, "loss/logits": 0.2020144984126091, "loss/reg": 0.3147103786468506, "step": 1535 }, { "epoch": 0.01536, "grad_norm": 0.49651697278022766, "grad_norm_var": 0.00178788894928588, "learning_rate": 5e-05, "loss": 0.1925, "loss/crossentropy": 2.95786452293396, "loss/hidden": 0.0, "loss/logits": 0.19249118492007256, "loss/reg": 0.3142845034599304, "step": 1536 }, { "epoch": 0.01537, "grad_norm": 0.43308401107788086, "grad_norm_var": 0.0017898958186787699, "learning_rate": 5e-05, "loss": 0.1925, "loss/crossentropy": 2.7552489042282104, "loss/hidden": 0.0, "loss/logits": 0.19254972785711288, "loss/reg": 0.31401926279067993, "step": 1537 }, { "epoch": 0.01538, "grad_norm": 0.4031049311161041, "grad_norm_var": 0.0018403866901356351, "learning_rate": 5e-05, "loss": 0.174, "loss/crossentropy": 2.8317719101905823, "loss/hidden": 0.0, "loss/logits": 0.1740109920501709, "loss/reg": 0.31365281343460083, "step": 1538 }, { "epoch": 0.01539, "grad_norm": 0.538381814956665, "grad_norm_var": 0.00233151992855453, "learning_rate": 5e-05, "loss": 0.1936, "loss/crossentropy": 2.6280346512794495, "loss/hidden": 0.0, "loss/logits": 0.1936146542429924, "loss/reg": 0.3132340610027313, "step": 1539 }, { "epoch": 0.0154, "grad_norm": 0.47338828444480896, "grad_norm_var": 0.0023083116586843627, "learning_rate": 5e-05, "loss": 0.1827, "loss/crossentropy": 2.6494054198265076, "loss/hidden": 0.0, "loss/logits": 0.18273716419935226, "loss/reg": 0.3130173087120056, "step": 1540 }, { "epoch": 0.01541, "grad_norm": 0.46660441160202026, "grad_norm_var": 0.0023283140165912966, "learning_rate": 5e-05, "loss": 0.2051, "loss/crossentropy": 2.8252177238464355, "loss/hidden": 0.0, "loss/logits": 0.20507887378335, "loss/reg": 0.3125694990158081, "step": 1541 }, { "epoch": 0.01542, "grad_norm": 0.4307655990123749, "grad_norm_var": 0.0021079597794560812, "learning_rate": 5e-05, "loss": 0.1915, "loss/crossentropy": 2.794115960597992, "loss/hidden": 0.0, "loss/logits": 0.19150134176015854, "loss/reg": 0.3117649257183075, "step": 1542 }, { "epoch": 0.01543, "grad_norm": 0.4114488959312439, "grad_norm_var": 0.0021226221922659625, "learning_rate": 5e-05, "loss": 0.1849, "loss/crossentropy": 2.7724334597587585, "loss/hidden": 0.0, "loss/logits": 0.18493615090847015, "loss/reg": 0.3115786612033844, "step": 1543 }, { "epoch": 0.01544, "grad_norm": 0.3810760974884033, "grad_norm_var": 0.001964869584106784, "learning_rate": 5e-05, "loss": 0.175, "loss/crossentropy": 2.9771867990493774, "loss/hidden": 0.0, "loss/logits": 0.17503635585308075, "loss/reg": 0.31136539578437805, "step": 1544 }, { "epoch": 0.01545, "grad_norm": 0.37936171889305115, "grad_norm_var": 0.002154710102883669, "learning_rate": 5e-05, "loss": 0.1667, "loss/crossentropy": 2.7613800168037415, "loss/hidden": 0.0, "loss/logits": 0.16672071442008018, "loss/reg": 0.31108558177948, "step": 1545 }, { "epoch": 0.01546, "grad_norm": 0.4082939326763153, "grad_norm_var": 0.002179999786775281, "learning_rate": 5e-05, "loss": 0.1861, "loss/crossentropy": 2.5991764664649963, "loss/hidden": 0.0, "loss/logits": 0.18612245097756386, "loss/reg": 0.3107950687408447, "step": 1546 }, { "epoch": 0.01547, "grad_norm": 0.3777691423892975, "grad_norm_var": 0.002284589844310884, "learning_rate": 5e-05, "loss": 0.185, "loss/crossentropy": 2.9355428814888, "loss/hidden": 0.0, "loss/logits": 0.1849907599389553, "loss/reg": 0.31046029925346375, "step": 1547 }, { "epoch": 0.01548, "grad_norm": 0.3555006682872772, "grad_norm_var": 0.002641161724990963, "learning_rate": 5e-05, "loss": 0.1773, "loss/crossentropy": 2.7723390460014343, "loss/hidden": 0.0, "loss/logits": 0.17729327082633972, "loss/reg": 0.31025782227516174, "step": 1548 }, { "epoch": 0.01549, "grad_norm": 0.40138089656829834, "grad_norm_var": 0.002701980036061876, "learning_rate": 5e-05, "loss": 0.1885, "loss/crossentropy": 3.0421932339668274, "loss/hidden": 0.0, "loss/logits": 0.18854095041751862, "loss/reg": 0.30984851717948914, "step": 1549 }, { "epoch": 0.0155, "grad_norm": 0.6220811009407043, "grad_norm_var": 0.004724707842526051, "learning_rate": 5e-05, "loss": 0.2058, "loss/crossentropy": 2.7495766282081604, "loss/hidden": 0.0, "loss/logits": 0.20575930178165436, "loss/reg": 0.309490829706192, "step": 1550 }, { "epoch": 0.01551, "grad_norm": 0.45552000403404236, "grad_norm_var": 0.004737076302227813, "learning_rate": 5e-05, "loss": 0.2028, "loss/crossentropy": 2.9180603623390198, "loss/hidden": 0.0, "loss/logits": 0.20275843515992165, "loss/reg": 0.3089672327041626, "step": 1551 }, { "epoch": 0.01552, "grad_norm": 0.40026915073394775, "grad_norm_var": 0.004586180236542707, "learning_rate": 5e-05, "loss": 0.1756, "loss/crossentropy": 2.859548509120941, "loss/hidden": 0.0, "loss/logits": 0.17561225965619087, "loss/reg": 0.30855605006217957, "step": 1552 }, { "epoch": 0.01553, "grad_norm": 0.41607484221458435, "grad_norm_var": 0.004605493474419649, "learning_rate": 5e-05, "loss": 0.1787, "loss/crossentropy": 2.72252756357193, "loss/hidden": 0.0, "loss/logits": 0.17868969216942787, "loss/reg": 0.3083699643611908, "step": 1553 }, { "epoch": 0.01554, "grad_norm": 0.8096429705619812, "grad_norm_var": 0.013338244620526025, "learning_rate": 5e-05, "loss": 0.261, "loss/crossentropy": 3.0023024678230286, "loss/hidden": 0.0, "loss/logits": 0.26104675978422165, "loss/reg": 0.3080141544342041, "step": 1554 }, { "epoch": 0.01555, "grad_norm": 0.39629507064819336, "grad_norm_var": 0.013076687876748953, "learning_rate": 5e-05, "loss": 0.1759, "loss/crossentropy": 2.842751920223236, "loss/hidden": 0.0, "loss/logits": 0.17593872919678688, "loss/reg": 0.3080020248889923, "step": 1555 }, { "epoch": 0.01556, "grad_norm": 0.41566234827041626, "grad_norm_var": 0.013097952551408195, "learning_rate": 5e-05, "loss": 0.1902, "loss/crossentropy": 2.830057978630066, "loss/hidden": 0.0, "loss/logits": 0.1902330256998539, "loss/reg": 0.3078528344631195, "step": 1556 }, { "epoch": 0.01557, "grad_norm": 0.41090479493141174, "grad_norm_var": 0.013135003653115025, "learning_rate": 5e-05, "loss": 0.1917, "loss/crossentropy": 2.8096529841423035, "loss/hidden": 0.0, "loss/logits": 0.1916906014084816, "loss/reg": 0.30745574831962585, "step": 1557 }, { "epoch": 0.01558, "grad_norm": 0.41976118087768555, "grad_norm_var": 0.01315906030021564, "learning_rate": 5e-05, "loss": 0.1888, "loss/crossentropy": 2.8282365202903748, "loss/hidden": 0.0, "loss/logits": 0.18881917372345924, "loss/reg": 0.30757883191108704, "step": 1558 }, { "epoch": 0.01559, "grad_norm": 0.6313467025756836, "grad_norm_var": 0.015305580039369248, "learning_rate": 5e-05, "loss": 0.2056, "loss/crossentropy": 2.9385411739349365, "loss/hidden": 0.0, "loss/logits": 0.2055722437798977, "loss/reg": 0.3074854910373688, "step": 1559 }, { "epoch": 0.0156, "grad_norm": 0.4025796353816986, "grad_norm_var": 0.015122361558866126, "learning_rate": 5e-05, "loss": 0.1767, "loss/crossentropy": 2.727729558944702, "loss/hidden": 0.0, "loss/logits": 0.17670691385865211, "loss/reg": 0.3076530694961548, "step": 1560 }, { "epoch": 0.01561, "grad_norm": 0.40675556659698486, "grad_norm_var": 0.014887869583349788, "learning_rate": 5e-05, "loss": 0.1788, "loss/crossentropy": 2.817299008369446, "loss/hidden": 0.0, "loss/logits": 0.1787610538303852, "loss/reg": 0.30783405900001526, "step": 1561 }, { "epoch": 0.01562, "grad_norm": 0.45717841386795044, "grad_norm_var": 0.014712495905824107, "learning_rate": 5e-05, "loss": 0.1901, "loss/crossentropy": 2.8390182852745056, "loss/hidden": 0.0, "loss/logits": 0.19009235501289368, "loss/reg": 0.3075363337993622, "step": 1562 }, { "epoch": 0.01563, "grad_norm": 0.4487801790237427, "grad_norm_var": 0.014238004043399732, "learning_rate": 5e-05, "loss": 0.1836, "loss/crossentropy": 2.7976795434951782, "loss/hidden": 0.0, "loss/logits": 0.18359609693288803, "loss/reg": 0.30752602219581604, "step": 1563 }, { "epoch": 0.01564, "grad_norm": 0.5654500722885132, "grad_norm_var": 0.013910653901874884, "learning_rate": 5e-05, "loss": 0.1919, "loss/crossentropy": 2.8910155296325684, "loss/hidden": 0.0, "loss/logits": 0.19191483035683632, "loss/reg": 0.3076559901237488, "step": 1564 }, { "epoch": 0.01565, "grad_norm": 0.4396404027938843, "grad_norm_var": 0.013607561364160251, "learning_rate": 5e-05, "loss": 0.1927, "loss/crossentropy": 2.675153970718384, "loss/hidden": 0.0, "loss/logits": 0.19274790585041046, "loss/reg": 0.307650089263916, "step": 1565 }, { "epoch": 0.01566, "grad_norm": 0.40461117029190063, "grad_norm_var": 0.012476118414910534, "learning_rate": 5e-05, "loss": 0.1839, "loss/crossentropy": 2.6574506759643555, "loss/hidden": 0.0, "loss/logits": 0.1839245744049549, "loss/reg": 0.3075932562351227, "step": 1566 }, { "epoch": 0.01567, "grad_norm": 0.3962284326553345, "grad_norm_var": 0.012790778401731488, "learning_rate": 5e-05, "loss": 0.1762, "loss/crossentropy": 2.7790459990501404, "loss/hidden": 0.0, "loss/logits": 0.1761659011244774, "loss/reg": 0.3076193630695343, "step": 1567 }, { "epoch": 0.01568, "grad_norm": 0.4472375512123108, "grad_norm_var": 0.012530647235415264, "learning_rate": 5e-05, "loss": 0.1953, "loss/crossentropy": 2.667806565761566, "loss/hidden": 0.0, "loss/logits": 0.1952761895954609, "loss/reg": 0.30732592940330505, "step": 1568 }, { "epoch": 0.01569, "grad_norm": 0.4162098467350006, "grad_norm_var": 0.012529736023164277, "learning_rate": 5e-05, "loss": 0.1874, "loss/crossentropy": 2.519023060798645, "loss/hidden": 0.0, "loss/logits": 0.1873750165104866, "loss/reg": 0.30730780959129333, "step": 1569 }, { "epoch": 0.0157, "grad_norm": 0.4547649919986725, "grad_norm_var": 0.004177037064504067, "learning_rate": 5e-05, "loss": 0.1808, "loss/crossentropy": 2.7990105152130127, "loss/hidden": 0.0, "loss/logits": 0.18080955743789673, "loss/reg": 0.30702728033065796, "step": 1570 }, { "epoch": 0.01571, "grad_norm": 0.4508178234100342, "grad_norm_var": 0.004011758343833553, "learning_rate": 5e-05, "loss": 0.1869, "loss/crossentropy": 2.7788299322128296, "loss/hidden": 0.0, "loss/logits": 0.18691060319542885, "loss/reg": 0.3069750666618347, "step": 1571 }, { "epoch": 0.01572, "grad_norm": 0.4282020628452301, "grad_norm_var": 0.003967526205939005, "learning_rate": 5e-05, "loss": 0.1985, "loss/crossentropy": 2.544512450695038, "loss/hidden": 0.0, "loss/logits": 0.19851427152752876, "loss/reg": 0.3065212368965149, "step": 1572 }, { "epoch": 0.01573, "grad_norm": 0.5207678079605103, "grad_norm_var": 0.004167092816921543, "learning_rate": 5e-05, "loss": 0.2417, "loss/crossentropy": 2.9696783423423767, "loss/hidden": 0.0, "loss/logits": 0.2416580319404602, "loss/reg": 0.30631935596466064, "step": 1573 }, { "epoch": 0.01574, "grad_norm": 0.5364368557929993, "grad_norm_var": 0.0044596712822784065, "learning_rate": 5e-05, "loss": 0.1882, "loss/crossentropy": 2.8673795461654663, "loss/hidden": 0.0, "loss/logits": 0.1882382556796074, "loss/reg": 0.3061702847480774, "step": 1574 }, { "epoch": 0.01575, "grad_norm": 0.5118915438652039, "grad_norm_var": 0.0026692116278975203, "learning_rate": 5e-05, "loss": 0.193, "loss/crossentropy": 2.9447904229164124, "loss/hidden": 0.0, "loss/logits": 0.19304672628641129, "loss/reg": 0.3062281012535095, "step": 1575 }, { "epoch": 0.01576, "grad_norm": 0.47952836751937866, "grad_norm_var": 0.0024966138906354278, "learning_rate": 5e-05, "loss": 0.1859, "loss/crossentropy": 2.7698771357536316, "loss/hidden": 0.0, "loss/logits": 0.18586242571473122, "loss/reg": 0.30612289905548096, "step": 1576 }, { "epoch": 0.01577, "grad_norm": 0.397810161113739, "grad_norm_var": 0.002565456431499143, "learning_rate": 5e-05, "loss": 0.1747, "loss/crossentropy": 2.769404709339142, "loss/hidden": 0.0, "loss/logits": 0.174674890935421, "loss/reg": 0.30621200799942017, "step": 1577 }, { "epoch": 0.01578, "grad_norm": 0.4634721577167511, "grad_norm_var": 0.002565797448431499, "learning_rate": 5e-05, "loss": 0.1986, "loss/crossentropy": 2.8350581526756287, "loss/hidden": 0.0, "loss/logits": 0.19861549139022827, "loss/reg": 0.30604487657546997, "step": 1578 }, { "epoch": 0.01579, "grad_norm": 0.4619215726852417, "grad_norm_var": 0.0025567292176472615, "learning_rate": 5e-05, "loss": 0.1917, "loss/crossentropy": 2.649975001811981, "loss/hidden": 0.0, "loss/logits": 0.19172656536102295, "loss/reg": 0.3063375651836395, "step": 1579 }, { "epoch": 0.0158, "grad_norm": 0.4163503348827362, "grad_norm_var": 0.0018684322656784583, "learning_rate": 5e-05, "loss": 0.1903, "loss/crossentropy": 2.705292522907257, "loss/hidden": 0.0, "loss/logits": 0.19033200293779373, "loss/reg": 0.30640220642089844, "step": 1580 }, { "epoch": 0.01581, "grad_norm": 0.3746154308319092, "grad_norm_var": 0.002236545095703794, "learning_rate": 5e-05, "loss": 0.1795, "loss/crossentropy": 2.7222800850868225, "loss/hidden": 0.0, "loss/logits": 0.17946995422244072, "loss/reg": 0.30664968490600586, "step": 1581 }, { "epoch": 0.01582, "grad_norm": 0.3900749385356903, "grad_norm_var": 0.0023329819852412705, "learning_rate": 5e-05, "loss": 0.1841, "loss/crossentropy": 2.7695385217666626, "loss/hidden": 0.0, "loss/logits": 0.18413609266281128, "loss/reg": 0.3065512180328369, "step": 1582 }, { "epoch": 0.01583, "grad_norm": 0.41534125804901123, "grad_norm_var": 0.002227331261083876, "learning_rate": 5e-05, "loss": 0.1829, "loss/crossentropy": 2.9322189688682556, "loss/hidden": 0.0, "loss/logits": 0.18287482485175133, "loss/reg": 0.30627167224884033, "step": 1583 }, { "epoch": 0.01584, "grad_norm": 0.44304996728897095, "grad_norm_var": 0.0022287637206601357, "learning_rate": 5e-05, "loss": 0.1869, "loss/crossentropy": 2.8015982508659363, "loss/hidden": 0.0, "loss/logits": 0.18688656389713287, "loss/reg": 0.306031197309494, "step": 1584 }, { "epoch": 0.01585, "grad_norm": 0.4375084936618805, "grad_norm_var": 0.002168034583127228, "learning_rate": 5e-05, "loss": 0.1995, "loss/crossentropy": 2.7860718369483948, "loss/hidden": 0.0, "loss/logits": 0.19949697330594063, "loss/reg": 0.3057941198348999, "step": 1585 }, { "epoch": 0.01586, "grad_norm": 0.38727009296417236, "grad_norm_var": 0.002400062719371058, "learning_rate": 5e-05, "loss": 0.1711, "loss/crossentropy": 2.7247307300567627, "loss/hidden": 0.0, "loss/logits": 0.1710893139243126, "loss/reg": 0.30573582649230957, "step": 1586 }, { "epoch": 0.01587, "grad_norm": 0.42968758940696716, "grad_norm_var": 0.0024107071539155663, "learning_rate": 5e-05, "loss": 0.177, "loss/crossentropy": 2.8963180780410767, "loss/hidden": 0.0, "loss/logits": 0.17700784653425217, "loss/reg": 0.30560967326164246, "step": 1587 }, { "epoch": 0.01588, "grad_norm": 0.537462592124939, "grad_norm_var": 0.0029358481663739593, "learning_rate": 5e-05, "loss": 0.2092, "loss/crossentropy": 2.8367822766304016, "loss/hidden": 0.0, "loss/logits": 0.20920675992965698, "loss/reg": 0.3054641783237457, "step": 1588 }, { "epoch": 0.01589, "grad_norm": 0.381846159696579, "grad_norm_var": 0.002834917651652387, "learning_rate": 5e-05, "loss": 0.1733, "loss/crossentropy": 2.7925211787223816, "loss/hidden": 0.0, "loss/logits": 0.1732960045337677, "loss/reg": 0.3053727447986603, "step": 1589 }, { "epoch": 0.0159, "grad_norm": 0.4224330186843872, "grad_norm_var": 0.0022043877011299023, "learning_rate": 5e-05, "loss": 0.1768, "loss/crossentropy": 2.7238301038742065, "loss/hidden": 0.0, "loss/logits": 0.17680301517248154, "loss/reg": 0.305414617061615, "step": 1590 }, { "epoch": 0.01591, "grad_norm": 0.37531012296676636, "grad_norm_var": 0.0019589504711139552, "learning_rate": 5e-05, "loss": 0.1709, "loss/crossentropy": 2.669899344444275, "loss/hidden": 0.0, "loss/logits": 0.17094431445002556, "loss/reg": 0.3054427206516266, "step": 1591 }, { "epoch": 0.01592, "grad_norm": 0.39835357666015625, "grad_norm_var": 0.0017898629145007541, "learning_rate": 5e-05, "loss": 0.1736, "loss/crossentropy": 2.818868100643158, "loss/hidden": 0.0, "loss/logits": 0.17355035990476608, "loss/reg": 0.3056863844394684, "step": 1592 }, { "epoch": 0.01593, "grad_norm": 0.43223661184310913, "grad_norm_var": 0.0017584928128022406, "learning_rate": 5e-05, "loss": 0.1903, "loss/crossentropy": 2.8398038744926453, "loss/hidden": 0.0, "loss/logits": 0.1902666576206684, "loss/reg": 0.30567666888237, "step": 1593 }, { "epoch": 0.01594, "grad_norm": 0.38109663128852844, "grad_norm_var": 0.001737346907216993, "learning_rate": 5e-05, "loss": 0.1834, "loss/crossentropy": 2.8384387493133545, "loss/hidden": 0.0, "loss/logits": 0.18341000378131866, "loss/reg": 0.3056263029575348, "step": 1594 }, { "epoch": 0.01595, "grad_norm": 0.4238858222961426, "grad_norm_var": 0.0016039306034060156, "learning_rate": 5e-05, "loss": 0.1985, "loss/crossentropy": 2.7792044281959534, "loss/hidden": 0.0, "loss/logits": 0.1985013298690319, "loss/reg": 0.3056620955467224, "step": 1595 }, { "epoch": 0.01596, "grad_norm": 0.4234940707683563, "grad_norm_var": 0.0016080180547751511, "learning_rate": 5e-05, "loss": 0.1763, "loss/crossentropy": 2.9728158712387085, "loss/hidden": 0.0, "loss/logits": 0.1763019636273384, "loss/reg": 0.3058704137802124, "step": 1596 }, { "epoch": 0.01597, "grad_norm": 0.422718346118927, "grad_norm_var": 0.0014881425357037958, "learning_rate": 5e-05, "loss": 0.1939, "loss/crossentropy": 2.6792216897010803, "loss/hidden": 0.0, "loss/logits": 0.19394095614552498, "loss/reg": 0.30603963136672974, "step": 1597 }, { "epoch": 0.01598, "grad_norm": 0.4095570743083954, "grad_norm_var": 0.001437090531214702, "learning_rate": 5e-05, "loss": 0.1814, "loss/crossentropy": 2.671915113925934, "loss/hidden": 0.0, "loss/logits": 0.18137168511748314, "loss/reg": 0.30596035718917847, "step": 1598 }, { "epoch": 0.01599, "grad_norm": 0.3701878488063812, "grad_norm_var": 0.0015930360587398126, "learning_rate": 5e-05, "loss": 0.167, "loss/crossentropy": 2.7310194969177246, "loss/hidden": 0.0, "loss/logits": 0.16697857901453972, "loss/reg": 0.3062557578086853, "step": 1599 }, { "epoch": 0.016, "grad_norm": 0.38828393816947937, "grad_norm_var": 0.0015921432632330923, "learning_rate": 5e-05, "loss": 0.1822, "loss/crossentropy": 2.7151002287864685, "loss/hidden": 0.0, "loss/logits": 0.1822204738855362, "loss/reg": 0.3064274191856384, "step": 1600 }, { "epoch": 0.01601, "grad_norm": 0.43839284777641296, "grad_norm_var": 0.0015949837833320046, "learning_rate": 5e-05, "loss": 0.1947, "loss/crossentropy": 2.837165594100952, "loss/hidden": 0.0, "loss/logits": 0.1947314366698265, "loss/reg": 0.3068501055240631, "step": 1601 }, { "epoch": 0.01602, "grad_norm": 0.4174271523952484, "grad_norm_var": 0.0015447931604489207, "learning_rate": 5e-05, "loss": 0.1958, "loss/crossentropy": 2.839165449142456, "loss/hidden": 0.0, "loss/logits": 0.19575729593634605, "loss/reg": 0.30712273716926575, "step": 1602 }, { "epoch": 0.01603, "grad_norm": 0.38109201192855835, "grad_norm_var": 0.0016022326486468655, "learning_rate": 5e-05, "loss": 0.1786, "loss/crossentropy": 2.900722026824951, "loss/hidden": 0.0, "loss/logits": 0.1785622052848339, "loss/reg": 0.30706262588500977, "step": 1603 }, { "epoch": 0.01604, "grad_norm": 0.4136604964733124, "grad_norm_var": 0.0005013143310054684, "learning_rate": 5e-05, "loss": 0.1907, "loss/crossentropy": 2.8320196866989136, "loss/hidden": 0.0, "loss/logits": 0.19070453941822052, "loss/reg": 0.3072315752506256, "step": 1604 }, { "epoch": 0.01605, "grad_norm": 0.4881981909275055, "grad_norm_var": 0.000879930273452223, "learning_rate": 5e-05, "loss": 0.1862, "loss/crossentropy": 2.7946428060531616, "loss/hidden": 0.0, "loss/logits": 0.18617677316069603, "loss/reg": 0.3070087134838104, "step": 1605 }, { "epoch": 0.01606, "grad_norm": 0.40416595339775085, "grad_norm_var": 0.0008745114173578224, "learning_rate": 5e-05, "loss": 0.1871, "loss/crossentropy": 2.6720356941223145, "loss/hidden": 0.0, "loss/logits": 0.18705860525369644, "loss/reg": 0.30686888098716736, "step": 1606 }, { "epoch": 0.01607, "grad_norm": 0.40146052837371826, "grad_norm_var": 0.0007945411484711211, "learning_rate": 5e-05, "loss": 0.1824, "loss/crossentropy": 2.777036488056183, "loss/hidden": 0.0, "loss/logits": 0.1824381723999977, "loss/reg": 0.3069646954536438, "step": 1607 }, { "epoch": 0.01608, "grad_norm": 0.4910094738006592, "grad_norm_var": 0.001160814043543823, "learning_rate": 5e-05, "loss": 0.2089, "loss/crossentropy": 2.7680267095565796, "loss/hidden": 0.0, "loss/logits": 0.20890551805496216, "loss/reg": 0.3071206212043762, "step": 1608 }, { "epoch": 0.01609, "grad_norm": 0.3999772071838379, "grad_norm_var": 0.0011643160019621105, "learning_rate": 5e-05, "loss": 0.1815, "loss/crossentropy": 2.763466775417328, "loss/hidden": 0.0, "loss/logits": 0.1814812235534191, "loss/reg": 0.30683812499046326, "step": 1609 }, { "epoch": 0.0161, "grad_norm": 0.41103607416152954, "grad_norm_var": 0.001081354885614542, "learning_rate": 5e-05, "loss": 0.1831, "loss/crossentropy": 2.879312515258789, "loss/hidden": 0.0, "loss/logits": 0.1831214651465416, "loss/reg": 0.3067988157272339, "step": 1610 }, { "epoch": 0.01611, "grad_norm": 0.3813370168209076, "grad_norm_var": 0.0011598893153843993, "learning_rate": 5e-05, "loss": 0.1835, "loss/crossentropy": 2.8331557512283325, "loss/hidden": 0.0, "loss/logits": 0.18348579853773117, "loss/reg": 0.30691415071487427, "step": 1611 }, { "epoch": 0.01612, "grad_norm": 0.3793767988681793, "grad_norm_var": 0.0012323051107494246, "learning_rate": 5e-05, "loss": 0.1843, "loss/crossentropy": 2.896154284477234, "loss/hidden": 0.0, "loss/logits": 0.1843285858631134, "loss/reg": 0.30699262022972107, "step": 1612 }, { "epoch": 0.01613, "grad_norm": 0.4155321717262268, "grad_norm_var": 0.0012256150057333196, "learning_rate": 5e-05, "loss": 0.1987, "loss/crossentropy": 2.879794716835022, "loss/hidden": 0.0, "loss/logits": 0.1987392157316208, "loss/reg": 0.3068220019340515, "step": 1613 }, { "epoch": 0.01614, "grad_norm": 0.3880569338798523, "grad_norm_var": 0.0012612752549993093, "learning_rate": 5e-05, "loss": 0.1844, "loss/crossentropy": 2.878888189792633, "loss/hidden": 0.0, "loss/logits": 0.18443862348794937, "loss/reg": 0.30660489201545715, "step": 1614 }, { "epoch": 0.01615, "grad_norm": 0.4117138385772705, "grad_norm_var": 0.0011454370737117377, "learning_rate": 5e-05, "loss": 0.1783, "loss/crossentropy": 2.838645339012146, "loss/hidden": 0.0, "loss/logits": 0.17832265049219131, "loss/reg": 0.306822806596756, "step": 1615 }, { "epoch": 0.01616, "grad_norm": 0.4495355486869812, "grad_norm_var": 0.0011766802212597664, "learning_rate": 5e-05, "loss": 0.1903, "loss/crossentropy": 2.914682447910309, "loss/hidden": 0.0, "loss/logits": 0.19028585776686668, "loss/reg": 0.3067680597305298, "step": 1616 }, { "epoch": 0.01617, "grad_norm": 0.4152105450630188, "grad_norm_var": 0.001144138827084958, "learning_rate": 5e-05, "loss": 0.1883, "loss/crossentropy": 2.8278183937072754, "loss/hidden": 0.0, "loss/logits": 0.18834519013762474, "loss/reg": 0.30672961473464966, "step": 1617 }, { "epoch": 0.01618, "grad_norm": 0.38920125365257263, "grad_norm_var": 0.001186865721211546, "learning_rate": 5e-05, "loss": 0.1917, "loss/crossentropy": 2.720738708972931, "loss/hidden": 0.0, "loss/logits": 0.1916758418083191, "loss/reg": 0.3069232702255249, "step": 1618 }, { "epoch": 0.01619, "grad_norm": 0.39140528440475464, "grad_norm_var": 0.00114855687214613, "learning_rate": 5e-05, "loss": 0.1755, "loss/crossentropy": 2.978335916996002, "loss/hidden": 0.0, "loss/logits": 0.17554296553134918, "loss/reg": 0.30702608823776245, "step": 1619 }, { "epoch": 0.0162, "grad_norm": 0.389161616563797, "grad_norm_var": 0.0011885821155874629, "learning_rate": 5e-05, "loss": 0.1733, "loss/crossentropy": 2.6545186638832092, "loss/hidden": 0.0, "loss/logits": 0.17330938205122948, "loss/reg": 0.30729639530181885, "step": 1620 }, { "epoch": 0.01621, "grad_norm": 0.9429404139518738, "grad_norm_var": 0.01867857165094581, "learning_rate": 5e-05, "loss": 0.2488, "loss/crossentropy": 2.814428150653839, "loss/hidden": 0.0, "loss/logits": 0.24882221221923828, "loss/reg": 0.3072620928287506, "step": 1621 }, { "epoch": 0.01622, "grad_norm": 0.3707922101020813, "grad_norm_var": 0.018913514037429024, "learning_rate": 5e-05, "loss": 0.1744, "loss/crossentropy": 2.756153166294098, "loss/hidden": 0.0, "loss/logits": 0.17437605187296867, "loss/reg": 0.30766981840133667, "step": 1622 }, { "epoch": 0.01623, "grad_norm": 0.4563125669956207, "grad_norm_var": 0.01882529908181543, "learning_rate": 5e-05, "loss": 0.1974, "loss/crossentropy": 2.751705527305603, "loss/hidden": 0.0, "loss/logits": 0.19736479595303535, "loss/reg": 0.30792200565338135, "step": 1623 }, { "epoch": 0.01624, "grad_norm": 0.5763250589370728, "grad_norm_var": 0.019830188356452505, "learning_rate": 5e-05, "loss": 0.1837, "loss/crossentropy": 2.887412667274475, "loss/hidden": 0.0, "loss/logits": 0.18372977524995804, "loss/reg": 0.30813470482826233, "step": 1624 }, { "epoch": 0.01625, "grad_norm": 0.46158668398857117, "grad_norm_var": 0.019672977324548254, "learning_rate": 5e-05, "loss": 0.1865, "loss/crossentropy": 2.630948841571808, "loss/hidden": 0.0, "loss/logits": 0.1865311898291111, "loss/reg": 0.30826428532600403, "step": 1625 }, { "epoch": 0.01626, "grad_norm": 0.3987314701080322, "grad_norm_var": 0.01974939213134387, "learning_rate": 5e-05, "loss": 0.179, "loss/crossentropy": 2.8407875299453735, "loss/hidden": 0.0, "loss/logits": 0.1790435127913952, "loss/reg": 0.3087776005268097, "step": 1626 }, { "epoch": 0.01627, "grad_norm": 0.3927339017391205, "grad_norm_var": 0.019651535580128967, "learning_rate": 5e-05, "loss": 0.1692, "loss/crossentropy": 2.8696249127388, "loss/hidden": 0.0, "loss/logits": 0.1691587045788765, "loss/reg": 0.3087894320487976, "step": 1627 }, { "epoch": 0.01628, "grad_norm": 0.3900286555290222, "grad_norm_var": 0.01955578439041921, "learning_rate": 5e-05, "loss": 0.1806, "loss/crossentropy": 2.835869312286377, "loss/hidden": 0.0, "loss/logits": 0.180639810860157, "loss/reg": 0.3088276982307434, "step": 1628 }, { "epoch": 0.01629, "grad_norm": 0.5516893863677979, "grad_norm_var": 0.02004416409793735, "learning_rate": 5e-05, "loss": 0.2306, "loss/crossentropy": 3.049050509929657, "loss/hidden": 0.0, "loss/logits": 0.23063744604587555, "loss/reg": 0.3093978762626648, "step": 1629 }, { "epoch": 0.0163, "grad_norm": 0.43755584955215454, "grad_norm_var": 0.019716121353513597, "learning_rate": 5e-05, "loss": 0.1984, "loss/crossentropy": 2.770651161670685, "loss/hidden": 0.0, "loss/logits": 0.19839532673358917, "loss/reg": 0.309800386428833, "step": 1630 }, { "epoch": 0.01631, "grad_norm": 0.4202122092247009, "grad_norm_var": 0.019661323499309828, "learning_rate": 5e-05, "loss": 0.1806, "loss/crossentropy": 2.7499778270721436, "loss/hidden": 0.0, "loss/logits": 0.1806310974061489, "loss/reg": 0.3094925284385681, "step": 1631 }, { "epoch": 0.01632, "grad_norm": 0.4182007610797882, "grad_norm_var": 0.01978558284850013, "learning_rate": 5e-05, "loss": 0.2007, "loss/crossentropy": 2.7593899965286255, "loss/hidden": 0.0, "loss/logits": 0.20066531747579575, "loss/reg": 0.30977171659469604, "step": 1632 }, { "epoch": 0.01633, "grad_norm": 0.3889821469783783, "grad_norm_var": 0.019994411634316592, "learning_rate": 5e-05, "loss": 0.1776, "loss/crossentropy": 2.7616063952445984, "loss/hidden": 0.0, "loss/logits": 0.17755092307925224, "loss/reg": 0.30979904532432556, "step": 1633 }, { "epoch": 0.01634, "grad_norm": 0.4680616557598114, "grad_norm_var": 0.019628245441376146, "learning_rate": 5e-05, "loss": 0.2114, "loss/crossentropy": 2.7517603635787964, "loss/hidden": 0.0, "loss/logits": 0.2114482782781124, "loss/reg": 0.3103366494178772, "step": 1634 }, { "epoch": 0.01635, "grad_norm": 0.4042542576789856, "grad_norm_var": 0.019510905617066792, "learning_rate": 5e-05, "loss": 0.1741, "loss/crossentropy": 2.797895848751068, "loss/hidden": 0.0, "loss/logits": 0.1741146557033062, "loss/reg": 0.31056225299835205, "step": 1635 }, { "epoch": 0.01636, "grad_norm": 0.38947105407714844, "grad_norm_var": 0.019507711545812715, "learning_rate": 5e-05, "loss": 0.1719, "loss/crossentropy": 2.8696359992027283, "loss/hidden": 0.0, "loss/logits": 0.17186293378472328, "loss/reg": 0.3104727864265442, "step": 1636 }, { "epoch": 0.01637, "grad_norm": 0.4464292526245117, "grad_norm_var": 0.003390402199995298, "learning_rate": 5e-05, "loss": 0.1809, "loss/crossentropy": 2.950844466686249, "loss/hidden": 0.0, "loss/logits": 0.1808536872267723, "loss/reg": 0.31077489256858826, "step": 1637 }, { "epoch": 0.01638, "grad_norm": 0.4987088739871979, "grad_norm_var": 0.003305852717628499, "learning_rate": 5e-05, "loss": 0.1987, "loss/crossentropy": 2.833077311515808, "loss/hidden": 0.0, "loss/logits": 0.19870679825544357, "loss/reg": 0.31085842847824097, "step": 1638 }, { "epoch": 0.01639, "grad_norm": 0.44467684626579285, "grad_norm_var": 0.0032947552089230355, "learning_rate": 5e-05, "loss": 0.1808, "loss/crossentropy": 2.7020366191864014, "loss/hidden": 0.0, "loss/logits": 0.18078051880002022, "loss/reg": 0.31109049916267395, "step": 1639 }, { "epoch": 0.0164, "grad_norm": 0.41029319167137146, "grad_norm_var": 0.0020656851146042617, "learning_rate": 5e-05, "loss": 0.1969, "loss/crossentropy": 2.646275222301483, "loss/hidden": 0.0, "loss/logits": 0.19690000265836716, "loss/reg": 0.3109976649284363, "step": 1640 }, { "epoch": 0.01641, "grad_norm": 0.42425641417503357, "grad_norm_var": 0.002008509537175958, "learning_rate": 5e-05, "loss": 0.1914, "loss/crossentropy": 2.8337140679359436, "loss/hidden": 0.0, "loss/logits": 0.19142041727900505, "loss/reg": 0.31090450286865234, "step": 1641 }, { "epoch": 0.01642, "grad_norm": 0.42708146572113037, "grad_norm_var": 0.0019395346031235642, "learning_rate": 5e-05, "loss": 0.1803, "loss/crossentropy": 2.87061607837677, "loss/hidden": 0.0, "loss/logits": 0.18034549057483673, "loss/reg": 0.31067630648612976, "step": 1642 }, { "epoch": 0.01643, "grad_norm": 0.4371202290058136, "grad_norm_var": 0.0018300497939087131, "learning_rate": 5e-05, "loss": 0.1948, "loss/crossentropy": 2.9504263401031494, "loss/hidden": 0.0, "loss/logits": 0.1947750262916088, "loss/reg": 0.3104950189590454, "step": 1643 }, { "epoch": 0.01644, "grad_norm": 0.3731367886066437, "grad_norm_var": 0.0019487507393844935, "learning_rate": 5e-05, "loss": 0.1734, "loss/crossentropy": 2.7138015031814575, "loss/hidden": 0.0, "loss/logits": 0.17338746413588524, "loss/reg": 0.3102791905403137, "step": 1644 }, { "epoch": 0.01645, "grad_norm": 0.41540318727493286, "grad_norm_var": 0.0009666345625866731, "learning_rate": 5e-05, "loss": 0.1883, "loss/crossentropy": 2.689002275466919, "loss/hidden": 0.0, "loss/logits": 0.1882849633693695, "loss/reg": 0.31019479036331177, "step": 1645 }, { "epoch": 0.01646, "grad_norm": 0.39533117413520813, "grad_norm_var": 0.001008731035611617, "learning_rate": 5e-05, "loss": 0.1774, "loss/crossentropy": 2.777493178844452, "loss/hidden": 0.0, "loss/logits": 0.17739306017756462, "loss/reg": 0.31009015440940857, "step": 1646 }, { "epoch": 0.01647, "grad_norm": 1.0455763339996338, "grad_norm_var": 0.025252048913802956, "learning_rate": 5e-05, "loss": 0.2137, "loss/crossentropy": 2.842250645160675, "loss/hidden": 0.0, "loss/logits": 0.21366163343191147, "loss/reg": 0.310110867023468, "step": 1647 }, { "epoch": 0.01648, "grad_norm": 0.5187786817550659, "grad_norm_var": 0.02530113341104618, "learning_rate": 5e-05, "loss": 0.188, "loss/crossentropy": 2.6376081109046936, "loss/hidden": 0.0, "loss/logits": 0.18798857182264328, "loss/reg": 0.31031233072280884, "step": 1648 }, { "epoch": 0.01649, "grad_norm": 0.41704630851745605, "grad_norm_var": 0.025054784800065257, "learning_rate": 5e-05, "loss": 0.1863, "loss/crossentropy": 2.7398990392684937, "loss/hidden": 0.0, "loss/logits": 0.18625252693891525, "loss/reg": 0.31006503105163574, "step": 1649 }, { "epoch": 0.0165, "grad_norm": 0.4378436207771301, "grad_norm_var": 0.025118563610582914, "learning_rate": 5e-05, "loss": 0.1853, "loss/crossentropy": 2.8916249871253967, "loss/hidden": 0.0, "loss/logits": 0.18525099754333496, "loss/reg": 0.31010475754737854, "step": 1650 }, { "epoch": 0.01651, "grad_norm": 0.5064277648925781, "grad_norm_var": 0.02490481812070637, "learning_rate": 5e-05, "loss": 0.1906, "loss/crossentropy": 3.0163946747779846, "loss/hidden": 0.0, "loss/logits": 0.19058984890580177, "loss/reg": 0.3098670244216919, "step": 1651 }, { "epoch": 0.01652, "grad_norm": 0.45759156346321106, "grad_norm_var": 0.024425056441629053, "learning_rate": 5e-05, "loss": 0.1914, "loss/crossentropy": 2.7099347710609436, "loss/hidden": 0.0, "loss/logits": 0.19142314791679382, "loss/reg": 0.30977174639701843, "step": 1652 }, { "epoch": 0.01653, "grad_norm": 0.4375245273113251, "grad_norm_var": 0.02446806768183608, "learning_rate": 5e-05, "loss": 0.197, "loss/crossentropy": 2.604577958583832, "loss/hidden": 0.0, "loss/logits": 0.1969580128788948, "loss/reg": 0.3096407949924469, "step": 1653 }, { "epoch": 0.01654, "grad_norm": 0.4219309389591217, "grad_norm_var": 0.024623728227340057, "learning_rate": 5e-05, "loss": 0.1794, "loss/crossentropy": 2.793237090110779, "loss/hidden": 0.0, "loss/logits": 0.17940423637628555, "loss/reg": 0.3096158504486084, "step": 1654 }, { "epoch": 0.01655, "grad_norm": 0.5084498524665833, "grad_norm_var": 0.024636008809260267, "learning_rate": 5e-05, "loss": 0.1908, "loss/crossentropy": 2.7083000540733337, "loss/hidden": 0.0, "loss/logits": 0.19082609564065933, "loss/reg": 0.30940037965774536, "step": 1655 }, { "epoch": 0.01656, "grad_norm": 0.536689817905426, "grad_norm_var": 0.02450842586723049, "learning_rate": 5e-05, "loss": 0.1826, "loss/crossentropy": 3.065483033657074, "loss/hidden": 0.0, "loss/logits": 0.1826135590672493, "loss/reg": 0.30936428904533386, "step": 1656 }, { "epoch": 0.01657, "grad_norm": 0.491369366645813, "grad_norm_var": 0.02424627210535307, "learning_rate": 5e-05, "loss": 0.1854, "loss/crossentropy": 2.739288330078125, "loss/hidden": 0.0, "loss/logits": 0.18541349470615387, "loss/reg": 0.30928856134414673, "step": 1657 }, { "epoch": 0.01658, "grad_norm": 0.4541490077972412, "grad_norm_var": 0.024067853784253045, "learning_rate": 5e-05, "loss": 0.1841, "loss/crossentropy": 2.749044418334961, "loss/hidden": 0.0, "loss/logits": 0.18408801034092903, "loss/reg": 0.30905503034591675, "step": 1658 }, { "epoch": 0.01659, "grad_norm": 0.7109642624855042, "grad_norm_var": 0.02679119790002412, "learning_rate": 5e-05, "loss": 0.1916, "loss/crossentropy": 2.744619846343994, "loss/hidden": 0.0, "loss/logits": 0.19158120453357697, "loss/reg": 0.30922454595565796, "step": 1659 }, { "epoch": 0.0166, "grad_norm": 0.4283425211906433, "grad_norm_var": 0.025988883058292684, "learning_rate": 5e-05, "loss": 0.1888, "loss/crossentropy": 2.680475890636444, "loss/hidden": 0.0, "loss/logits": 0.18882188946008682, "loss/reg": 0.3092501759529114, "step": 1660 }, { "epoch": 0.01661, "grad_norm": 0.397672563791275, "grad_norm_var": 0.026235626494109588, "learning_rate": 5e-05, "loss": 0.1808, "loss/crossentropy": 2.787123918533325, "loss/hidden": 0.0, "loss/logits": 0.1807740405201912, "loss/reg": 0.3093108832836151, "step": 1661 }, { "epoch": 0.01662, "grad_norm": 0.4122120141983032, "grad_norm_var": 0.02599454232385572, "learning_rate": 5e-05, "loss": 0.1955, "loss/crossentropy": 2.853621184825897, "loss/hidden": 0.0, "loss/logits": 0.19553126394748688, "loss/reg": 0.3097609877586365, "step": 1662 }, { "epoch": 0.01663, "grad_norm": 0.4292946457862854, "grad_norm_var": 0.00583936023341729, "learning_rate": 5e-05, "loss": 0.1871, "loss/crossentropy": 2.911811053752899, "loss/hidden": 0.0, "loss/logits": 0.18709555640816689, "loss/reg": 0.3098617494106293, "step": 1663 }, { "epoch": 0.01664, "grad_norm": 0.5035573840141296, "grad_norm_var": 0.005760715375512622, "learning_rate": 5e-05, "loss": 0.1967, "loss/crossentropy": 2.717349052429199, "loss/hidden": 0.0, "loss/logits": 0.19669797271490097, "loss/reg": 0.3097706437110901, "step": 1664 }, { "epoch": 0.01665, "grad_norm": 0.41146397590637207, "grad_norm_var": 0.005803522224020933, "learning_rate": 5e-05, "loss": 0.187, "loss/crossentropy": 2.886099100112915, "loss/hidden": 0.0, "loss/logits": 0.1869964376091957, "loss/reg": 0.30980947613716125, "step": 1665 }, { "epoch": 0.01666, "grad_norm": 0.4190480709075928, "grad_norm_var": 0.005910179532629304, "learning_rate": 5e-05, "loss": 0.1849, "loss/crossentropy": 2.918531060218811, "loss/hidden": 0.0, "loss/logits": 0.18494650721549988, "loss/reg": 0.30957937240600586, "step": 1666 }, { "epoch": 0.01667, "grad_norm": 0.4139748215675354, "grad_norm_var": 0.006000506916820569, "learning_rate": 5e-05, "loss": 0.194, "loss/crossentropy": 2.6986467242240906, "loss/hidden": 0.0, "loss/logits": 0.19399984553456306, "loss/reg": 0.30962032079696655, "step": 1667 }, { "epoch": 0.01668, "grad_norm": 0.44614142179489136, "grad_norm_var": 0.00601946132690427, "learning_rate": 5e-05, "loss": 0.1965, "loss/crossentropy": 2.778202712535858, "loss/hidden": 0.0, "loss/logits": 0.19651244580745697, "loss/reg": 0.3093251883983612, "step": 1668 }, { "epoch": 0.01669, "grad_norm": 0.4579530954360962, "grad_norm_var": 0.005973636900079512, "learning_rate": 5e-05, "loss": 0.2247, "loss/crossentropy": 2.9327465891838074, "loss/hidden": 0.0, "loss/logits": 0.22473404556512833, "loss/reg": 0.3091830611228943, "step": 1669 }, { "epoch": 0.0167, "grad_norm": 0.47435206174850464, "grad_norm_var": 0.005842950902667084, "learning_rate": 5e-05, "loss": 0.1996, "loss/crossentropy": 2.7255457043647766, "loss/hidden": 0.0, "loss/logits": 0.19959543645381927, "loss/reg": 0.30895188450813293, "step": 1670 }, { "epoch": 0.01671, "grad_norm": 0.45027223229408264, "grad_norm_var": 0.0057444219616859344, "learning_rate": 5e-05, "loss": 0.1905, "loss/crossentropy": 2.7964431643486023, "loss/hidden": 0.0, "loss/logits": 0.19045410305261612, "loss/reg": 0.30852365493774414, "step": 1671 }, { "epoch": 0.01672, "grad_norm": 0.44329389929771423, "grad_norm_var": 0.00539487961594229, "learning_rate": 5e-05, "loss": 0.2026, "loss/crossentropy": 2.668112576007843, "loss/hidden": 0.0, "loss/logits": 0.202640600502491, "loss/reg": 0.30818429589271545, "step": 1672 }, { "epoch": 0.01673, "grad_norm": 0.4275861084461212, "grad_norm_var": 0.005373898067991867, "learning_rate": 5e-05, "loss": 0.193, "loss/crossentropy": 2.875367522239685, "loss/hidden": 0.0, "loss/logits": 0.19296974688768387, "loss/reg": 0.30799129605293274, "step": 1673 }, { "epoch": 0.01674, "grad_norm": 0.3906836211681366, "grad_norm_var": 0.005632987238109074, "learning_rate": 5e-05, "loss": 0.1874, "loss/crossentropy": 2.733620345592499, "loss/hidden": 0.0, "loss/logits": 0.18738483637571335, "loss/reg": 0.3078615665435791, "step": 1674 }, { "epoch": 0.01675, "grad_norm": 0.4052586555480957, "grad_norm_var": 0.0008797148254549432, "learning_rate": 5e-05, "loss": 0.192, "loss/crossentropy": 2.840841293334961, "loss/hidden": 0.0, "loss/logits": 0.19200164079666138, "loss/reg": 0.30786243081092834, "step": 1675 }, { "epoch": 0.01676, "grad_norm": 0.44211894273757935, "grad_norm_var": 0.0008849609335689383, "learning_rate": 5e-05, "loss": 0.1885, "loss/crossentropy": 2.902799367904663, "loss/hidden": 0.0, "loss/logits": 0.18851850554347038, "loss/reg": 0.3076139986515045, "step": 1676 }, { "epoch": 0.01677, "grad_norm": 0.4148205816745758, "grad_norm_var": 0.0008230119527400271, "learning_rate": 5e-05, "loss": 0.1804, "loss/crossentropy": 2.8570794463157654, "loss/hidden": 0.0, "loss/logits": 0.18036120384931564, "loss/reg": 0.3074818253517151, "step": 1677 }, { "epoch": 0.01678, "grad_norm": 0.4282993972301483, "grad_norm_var": 0.0007927162020482826, "learning_rate": 5e-05, "loss": 0.1854, "loss/crossentropy": 2.7639008164405823, "loss/hidden": 0.0, "loss/logits": 0.18544172495603561, "loss/reg": 0.30771613121032715, "step": 1678 }, { "epoch": 0.01679, "grad_norm": 0.39564332365989685, "grad_norm_var": 0.0008885634397795254, "learning_rate": 5e-05, "loss": 0.1809, "loss/crossentropy": 2.74172043800354, "loss/hidden": 0.0, "loss/logits": 0.18087028712034225, "loss/reg": 0.30753421783447266, "step": 1679 }, { "epoch": 0.0168, "grad_norm": 0.40890124440193176, "grad_norm_var": 0.0005552716756729007, "learning_rate": 5e-05, "loss": 0.1874, "loss/crossentropy": 2.790540933609009, "loss/hidden": 0.0, "loss/logits": 0.18737027421593666, "loss/reg": 0.3078846037387848, "step": 1680 }, { "epoch": 0.01681, "grad_norm": 0.404003769159317, "grad_norm_var": 0.0005740676286444294, "learning_rate": 5e-05, "loss": 0.1889, "loss/crossentropy": 2.6701492071151733, "loss/hidden": 0.0, "loss/logits": 0.18892518430948257, "loss/reg": 0.3078761100769043, "step": 1681 }, { "epoch": 0.01682, "grad_norm": 0.41149359941482544, "grad_norm_var": 0.0005850367620930682, "learning_rate": 5e-05, "loss": 0.1978, "loss/crossentropy": 2.7879398465156555, "loss/hidden": 0.0, "loss/logits": 0.19776083156466484, "loss/reg": 0.3079061806201935, "step": 1682 }, { "epoch": 0.01683, "grad_norm": 0.40107202529907227, "grad_norm_var": 0.0006160003122224311, "learning_rate": 5e-05, "loss": 0.1877, "loss/crossentropy": 2.7811286449432373, "loss/hidden": 0.0, "loss/logits": 0.1876610890030861, "loss/reg": 0.3082219660282135, "step": 1683 }, { "epoch": 0.01684, "grad_norm": 0.373333215713501, "grad_norm_var": 0.0007432282694189163, "learning_rate": 5e-05, "loss": 0.1774, "loss/crossentropy": 2.8956187963485718, "loss/hidden": 0.0, "loss/logits": 0.17740336060523987, "loss/reg": 0.3079228699207306, "step": 1684 }, { "epoch": 0.01685, "grad_norm": 0.4067177474498749, "grad_norm_var": 0.0006519018406564067, "learning_rate": 5e-05, "loss": 0.1921, "loss/crossentropy": 2.8503127098083496, "loss/hidden": 0.0, "loss/logits": 0.19205302372574806, "loss/reg": 0.3080797493457794, "step": 1685 }, { "epoch": 0.01686, "grad_norm": 0.3976154327392578, "grad_norm_var": 0.0004368743946918367, "learning_rate": 5e-05, "loss": 0.187, "loss/crossentropy": 2.7111610174179077, "loss/hidden": 0.0, "loss/logits": 0.18699976056814194, "loss/reg": 0.30818602442741394, "step": 1686 }, { "epoch": 0.01687, "grad_norm": 0.3696020543575287, "grad_norm_var": 0.00043807396968487865, "learning_rate": 5e-05, "loss": 0.1721, "loss/crossentropy": 2.804477870464325, "loss/hidden": 0.0, "loss/logits": 0.17212367802858353, "loss/reg": 0.3081195056438446, "step": 1687 }, { "epoch": 0.01688, "grad_norm": 0.37137407064437866, "grad_norm_var": 0.00041837988996959636, "learning_rate": 5e-05, "loss": 0.173, "loss/crossentropy": 2.7993348240852356, "loss/hidden": 0.0, "loss/logits": 0.1729583851993084, "loss/reg": 0.3081451654434204, "step": 1688 }, { "epoch": 0.01689, "grad_norm": 0.45647454261779785, "grad_norm_var": 0.0005651132029709534, "learning_rate": 5e-05, "loss": 0.1897, "loss/crossentropy": 2.8094905614852905, "loss/hidden": 0.0, "loss/logits": 0.1896502636373043, "loss/reg": 0.3082219064235687, "step": 1689 }, { "epoch": 0.0169, "grad_norm": 0.44392576813697815, "grad_norm_var": 0.0006418004482256231, "learning_rate": 5e-05, "loss": 0.1973, "loss/crossentropy": 2.8073139786720276, "loss/hidden": 0.0, "loss/logits": 0.19732685759663582, "loss/reg": 0.30785444378852844, "step": 1690 }, { "epoch": 0.01691, "grad_norm": 0.5088895559310913, "grad_norm_var": 0.001272839978401293, "learning_rate": 5e-05, "loss": 0.204, "loss/crossentropy": 2.7897263169288635, "loss/hidden": 0.0, "loss/logits": 0.2040284164249897, "loss/reg": 0.30745962262153625, "step": 1691 }, { "epoch": 0.01692, "grad_norm": 0.4105668365955353, "grad_norm_var": 0.0012194703753083366, "learning_rate": 5e-05, "loss": 0.1842, "loss/crossentropy": 2.8705750703811646, "loss/hidden": 0.0, "loss/logits": 0.18419185653328896, "loss/reg": 0.30764514207839966, "step": 1692 }, { "epoch": 0.01693, "grad_norm": 0.4335708022117615, "grad_norm_var": 0.0012468180088101466, "learning_rate": 5e-05, "loss": 0.1849, "loss/crossentropy": 2.9459662437438965, "loss/hidden": 0.0, "loss/logits": 0.1849401630461216, "loss/reg": 0.307472288608551, "step": 1693 }, { "epoch": 0.01694, "grad_norm": 0.3927754759788513, "grad_norm_var": 0.0012572153985319153, "learning_rate": 5e-05, "loss": 0.1931, "loss/crossentropy": 2.7950392961502075, "loss/hidden": 0.0, "loss/logits": 0.1930950991809368, "loss/reg": 0.3074984550476074, "step": 1694 }, { "epoch": 0.01695, "grad_norm": 0.4470040798187256, "grad_norm_var": 0.0013126590717689476, "learning_rate": 5e-05, "loss": 0.1832, "loss/crossentropy": 2.757478952407837, "loss/hidden": 0.0, "loss/logits": 0.183172095566988, "loss/reg": 0.3072901666164398, "step": 1695 }, { "epoch": 0.01696, "grad_norm": 0.3924000859260559, "grad_norm_var": 0.0013427267950040862, "learning_rate": 5e-05, "loss": 0.1891, "loss/crossentropy": 2.670287072658539, "loss/hidden": 0.0, "loss/logits": 0.18911606818437576, "loss/reg": 0.30714932084083557, "step": 1696 }, { "epoch": 0.01697, "grad_norm": 0.4226417541503906, "grad_norm_var": 0.0013400904725269522, "learning_rate": 5e-05, "loss": 0.1967, "loss/crossentropy": 2.80754816532135, "loss/hidden": 0.0, "loss/logits": 0.19667508080601692, "loss/reg": 0.3068749010562897, "step": 1697 }, { "epoch": 0.01698, "grad_norm": 0.3954371213912964, "grad_norm_var": 0.0013636377045166033, "learning_rate": 5e-05, "loss": 0.1777, "loss/crossentropy": 2.745996832847595, "loss/hidden": 0.0, "loss/logits": 0.17765112221240997, "loss/reg": 0.30665123462677, "step": 1698 }, { "epoch": 0.01699, "grad_norm": 0.40876221656799316, "grad_norm_var": 0.0013541164960312695, "learning_rate": 5e-05, "loss": 0.1807, "loss/crossentropy": 2.8506519198417664, "loss/hidden": 0.0, "loss/logits": 0.18068614602088928, "loss/reg": 0.30618026852607727, "step": 1699 }, { "epoch": 0.017, "grad_norm": 0.4228490889072418, "grad_norm_var": 0.0012359426311042143, "learning_rate": 5e-05, "loss": 0.1953, "loss/crossentropy": 2.7214561104774475, "loss/hidden": 0.0, "loss/logits": 0.19528314471244812, "loss/reg": 0.3061833083629608, "step": 1700 }, { "epoch": 0.01701, "grad_norm": 0.4368031322956085, "grad_norm_var": 0.001249109427606169, "learning_rate": 5e-05, "loss": 0.2004, "loss/crossentropy": 2.7446228861808777, "loss/hidden": 0.0, "loss/logits": 0.2003537118434906, "loss/reg": 0.3060120940208435, "step": 1701 }, { "epoch": 0.01702, "grad_norm": 0.4164392948150635, "grad_norm_var": 0.0012165337728750243, "learning_rate": 5e-05, "loss": 0.1869, "loss/crossentropy": 2.8693122267723083, "loss/hidden": 0.0, "loss/logits": 0.18686944618821144, "loss/reg": 0.30598264932632446, "step": 1702 }, { "epoch": 0.01703, "grad_norm": 0.3722231984138489, "grad_norm_var": 0.0011991419484356522, "learning_rate": 5e-05, "loss": 0.1806, "loss/crossentropy": 2.70352703332901, "loss/hidden": 0.0, "loss/logits": 0.18055832013487816, "loss/reg": 0.3057422637939453, "step": 1703 }, { "epoch": 0.01704, "grad_norm": 0.38578420877456665, "grad_norm_var": 0.0011172352206796502, "learning_rate": 5e-05, "loss": 0.1764, "loss/crossentropy": 2.707695186138153, "loss/hidden": 0.0, "loss/logits": 0.1763889454305172, "loss/reg": 0.3055722415447235, "step": 1704 }, { "epoch": 0.01705, "grad_norm": 0.3832581043243408, "grad_norm_var": 0.0011124015738343798, "learning_rate": 5e-05, "loss": 0.1713, "loss/crossentropy": 2.6583611965179443, "loss/hidden": 0.0, "loss/logits": 0.171316497027874, "loss/reg": 0.30531615018844604, "step": 1705 }, { "epoch": 0.01706, "grad_norm": 0.3905664086341858, "grad_norm_var": 0.0010993790577647192, "learning_rate": 5e-05, "loss": 0.1805, "loss/crossentropy": 2.851792097091675, "loss/hidden": 0.0, "loss/logits": 0.18052047863602638, "loss/reg": 0.3050723075866699, "step": 1706 }, { "epoch": 0.01707, "grad_norm": 0.3747948110103607, "grad_norm_var": 0.0005221559996447282, "learning_rate": 5e-05, "loss": 0.1764, "loss/crossentropy": 2.7359138131141663, "loss/hidden": 0.0, "loss/logits": 0.17639316990971565, "loss/reg": 0.30480334162712097, "step": 1707 }, { "epoch": 0.01708, "grad_norm": 0.39186209440231323, "grad_norm_var": 0.0005310552173019271, "learning_rate": 5e-05, "loss": 0.1819, "loss/crossentropy": 2.81093692779541, "loss/hidden": 0.0, "loss/logits": 0.18185490369796753, "loss/reg": 0.30453255772590637, "step": 1708 }, { "epoch": 0.01709, "grad_norm": 0.455247700214386, "grad_norm_var": 0.0006453173427129403, "learning_rate": 5e-05, "loss": 0.1902, "loss/crossentropy": 2.737622320652008, "loss/hidden": 0.0, "loss/logits": 0.19024287164211273, "loss/reg": 0.30428019165992737, "step": 1709 }, { "epoch": 0.0171, "grad_norm": 0.40103429555892944, "grad_norm_var": 0.0006355099935591135, "learning_rate": 5e-05, "loss": 0.179, "loss/crossentropy": 2.859184503555298, "loss/hidden": 0.0, "loss/logits": 0.17904967442154884, "loss/reg": 0.3040787875652313, "step": 1710 }, { "epoch": 0.01711, "grad_norm": 0.3853871822357178, "grad_norm_var": 0.0005364962860730812, "learning_rate": 5e-05, "loss": 0.1833, "loss/crossentropy": 2.918987989425659, "loss/hidden": 0.0, "loss/logits": 0.18325665593147278, "loss/reg": 0.3038725256919861, "step": 1711 }, { "epoch": 0.01712, "grad_norm": 0.3928311765193939, "grad_norm_var": 0.0005359435699386832, "learning_rate": 5e-05, "loss": 0.1802, "loss/crossentropy": 2.9536839723587036, "loss/hidden": 0.0, "loss/logits": 0.18017259240150452, "loss/reg": 0.3035624027252197, "step": 1712 }, { "epoch": 0.01713, "grad_norm": 0.3927394449710846, "grad_norm_var": 0.000510506931579846, "learning_rate": 5e-05, "loss": 0.17, "loss/crossentropy": 2.783134341239929, "loss/hidden": 0.0, "loss/logits": 0.17001546174287796, "loss/reg": 0.30310800671577454, "step": 1713 }, { "epoch": 0.01714, "grad_norm": 0.4554060399532318, "grad_norm_var": 0.000695781581346111, "learning_rate": 5e-05, "loss": 0.1988, "loss/crossentropy": 2.8036736845970154, "loss/hidden": 0.0, "loss/logits": 0.19879141822457314, "loss/reg": 0.3033056855201721, "step": 1714 }, { "epoch": 0.01715, "grad_norm": 0.39158546924591064, "grad_norm_var": 0.0007035996548166778, "learning_rate": 5e-05, "loss": 0.1849, "loss/crossentropy": 2.8976573944091797, "loss/hidden": 0.0, "loss/logits": 0.18494173511862755, "loss/reg": 0.30299001932144165, "step": 1715 }, { "epoch": 0.01716, "grad_norm": 0.4292289912700653, "grad_norm_var": 0.0007229851497682294, "learning_rate": 5e-05, "loss": 0.1932, "loss/crossentropy": 2.7815128564834595, "loss/hidden": 0.0, "loss/logits": 0.19315115362405777, "loss/reg": 0.3031367063522339, "step": 1716 }, { "epoch": 0.01717, "grad_norm": 0.37825852632522583, "grad_norm_var": 0.0006768451606373405, "learning_rate": 5e-05, "loss": 0.1739, "loss/crossentropy": 2.7039105892181396, "loss/hidden": 0.0, "loss/logits": 0.173922348767519, "loss/reg": 0.3029789328575134, "step": 1717 }, { "epoch": 0.01718, "grad_norm": 0.39798057079315186, "grad_norm_var": 0.0006571648782215574, "learning_rate": 5e-05, "loss": 0.181, "loss/crossentropy": 2.898083209991455, "loss/hidden": 0.0, "loss/logits": 0.18097640201449394, "loss/reg": 0.30289024114608765, "step": 1718 }, { "epoch": 0.01719, "grad_norm": 0.37748467922210693, "grad_norm_var": 0.0006403651479281458, "learning_rate": 5e-05, "loss": 0.1791, "loss/crossentropy": 2.6626601219177246, "loss/hidden": 0.0, "loss/logits": 0.17911962792277336, "loss/reg": 0.302725613117218, "step": 1719 }, { "epoch": 0.0172, "grad_norm": 0.3813312351703644, "grad_norm_var": 0.000649430647596283, "learning_rate": 5e-05, "loss": 0.1766, "loss/crossentropy": 2.7424102425575256, "loss/hidden": 0.0, "loss/logits": 0.17663609609007835, "loss/reg": 0.3027529716491699, "step": 1720 }, { "epoch": 0.01721, "grad_norm": 0.4177798330783844, "grad_norm_var": 0.0006528960264696182, "learning_rate": 5e-05, "loss": 0.1945, "loss/crossentropy": 2.7866047024726868, "loss/hidden": 0.0, "loss/logits": 0.19452189654111862, "loss/reg": 0.30244576930999756, "step": 1721 }, { "epoch": 0.01722, "grad_norm": 0.4319918751716614, "grad_norm_var": 0.0007033781627532876, "learning_rate": 5e-05, "loss": 0.207, "loss/crossentropy": 2.8035479187965393, "loss/hidden": 0.0, "loss/logits": 0.20697124302387238, "loss/reg": 0.3022805452346802, "step": 1722 }, { "epoch": 0.01723, "grad_norm": 0.38785526156425476, "grad_norm_var": 0.0006641670365487943, "learning_rate": 5e-05, "loss": 0.1783, "loss/crossentropy": 2.690383553504944, "loss/hidden": 0.0, "loss/logits": 0.17828065901994705, "loss/reg": 0.3022075295448303, "step": 1723 }, { "epoch": 0.01724, "grad_norm": 0.40128472447395325, "grad_norm_var": 0.0006541522617278399, "learning_rate": 5e-05, "loss": 0.1824, "loss/crossentropy": 2.8421429991722107, "loss/hidden": 0.0, "loss/logits": 0.1823546215891838, "loss/reg": 0.3021862506866455, "step": 1724 }, { "epoch": 0.01725, "grad_norm": 0.40270885825157166, "grad_norm_var": 0.00047355223285017547, "learning_rate": 5e-05, "loss": 0.1839, "loss/crossentropy": 2.833739936351776, "loss/hidden": 0.0, "loss/logits": 0.1839047335088253, "loss/reg": 0.30199679732322693, "step": 1725 }, { "epoch": 0.01726, "grad_norm": 0.4189218580722809, "grad_norm_var": 0.0004923069372884707, "learning_rate": 5e-05, "loss": 0.1805, "loss/crossentropy": 2.6183314323425293, "loss/hidden": 0.0, "loss/logits": 0.18052782863378525, "loss/reg": 0.3019511103630066, "step": 1726 }, { "epoch": 0.01727, "grad_norm": 0.3918308615684509, "grad_norm_var": 0.0004800503495578183, "learning_rate": 5e-05, "loss": 0.175, "loss/crossentropy": 2.722986876964569, "loss/hidden": 0.0, "loss/logits": 0.17500554025173187, "loss/reg": 0.301822304725647, "step": 1727 }, { "epoch": 0.01728, "grad_norm": 0.3899526298046112, "grad_norm_var": 0.0004845003352646741, "learning_rate": 5e-05, "loss": 0.1851, "loss/crossentropy": 2.6688209772109985, "loss/hidden": 0.0, "loss/logits": 0.18510791286826134, "loss/reg": 0.3017856776714325, "step": 1728 }, { "epoch": 0.01729, "grad_norm": 0.40600088238716125, "grad_norm_var": 0.0004775326700897716, "learning_rate": 5e-05, "loss": 0.1848, "loss/crossentropy": 2.7467674016952515, "loss/hidden": 0.0, "loss/logits": 0.18481990694999695, "loss/reg": 0.30173367261886597, "step": 1729 }, { "epoch": 0.0173, "grad_norm": 0.4132924973964691, "grad_norm_var": 0.0002981841567603135, "learning_rate": 5e-05, "loss": 0.1837, "loss/crossentropy": 2.8023144006729126, "loss/hidden": 0.0, "loss/logits": 0.18365807086229324, "loss/reg": 0.30136561393737793, "step": 1730 }, { "epoch": 0.01731, "grad_norm": 0.4205559194087982, "grad_norm_var": 0.0003139144184092187, "learning_rate": 5e-05, "loss": 0.1843, "loss/crossentropy": 2.8707889914512634, "loss/hidden": 0.0, "loss/logits": 0.1842528097331524, "loss/reg": 0.30133485794067383, "step": 1731 }, { "epoch": 0.01732, "grad_norm": 0.38152876496315, "grad_norm_var": 0.00028869174751724086, "learning_rate": 5e-05, "loss": 0.1704, "loss/crossentropy": 2.8792667388916016, "loss/hidden": 0.0, "loss/logits": 0.17037741467356682, "loss/reg": 0.3010103702545166, "step": 1732 }, { "epoch": 0.01733, "grad_norm": 0.4108543395996094, "grad_norm_var": 0.0002609434866503298, "learning_rate": 5e-05, "loss": 0.1849, "loss/crossentropy": 2.767768085002899, "loss/hidden": 0.0, "loss/logits": 0.18487627431750298, "loss/reg": 0.3008021414279938, "step": 1733 }, { "epoch": 0.01734, "grad_norm": 0.3998468816280365, "grad_norm_var": 0.00026017101551047974, "learning_rate": 5e-05, "loss": 0.1873, "loss/crossentropy": 2.688480496406555, "loss/hidden": 0.0, "loss/logits": 0.18730635941028595, "loss/reg": 0.3003675639629364, "step": 1734 }, { "epoch": 0.01735, "grad_norm": 0.4100005030632019, "grad_norm_var": 0.00021963528419039296, "learning_rate": 5e-05, "loss": 0.1906, "loss/crossentropy": 2.8193799257278442, "loss/hidden": 0.0, "loss/logits": 0.19060558825731277, "loss/reg": 0.3000779151916504, "step": 1735 }, { "epoch": 0.01736, "grad_norm": 0.41830891370773315, "grad_norm_var": 0.000192794243100336, "learning_rate": 5e-05, "loss": 0.191, "loss/crossentropy": 2.887315809726715, "loss/hidden": 0.0, "loss/logits": 0.1910369023680687, "loss/reg": 0.30000731348991394, "step": 1736 }, { "epoch": 0.01737, "grad_norm": 0.3981500267982483, "grad_norm_var": 0.00018714426498795216, "learning_rate": 5e-05, "loss": 0.1846, "loss/crossentropy": 2.8737019300460815, "loss/hidden": 0.0, "loss/logits": 0.18461866676807404, "loss/reg": 0.2995445132255554, "step": 1737 }, { "epoch": 0.01738, "grad_norm": 0.37010127305984497, "grad_norm_var": 0.000205399058455491, "learning_rate": 5e-05, "loss": 0.1723, "loss/crossentropy": 2.8804875016212463, "loss/hidden": 0.0, "loss/logits": 0.17233756929636002, "loss/reg": 0.2993263602256775, "step": 1738 }, { "epoch": 0.01739, "grad_norm": 0.40156883001327515, "grad_norm_var": 0.0001925245035687954, "learning_rate": 5e-05, "loss": 0.1907, "loss/crossentropy": 2.794457733631134, "loss/hidden": 0.0, "loss/logits": 0.19071513041853905, "loss/reg": 0.2993166446685791, "step": 1739 }, { "epoch": 0.0174, "grad_norm": 0.36473914980888367, "grad_norm_var": 0.00028036909609246985, "learning_rate": 5e-05, "loss": 0.1729, "loss/crossentropy": 2.874186158180237, "loss/hidden": 0.0, "loss/logits": 0.1728925257921219, "loss/reg": 0.29882875084877014, "step": 1740 }, { "epoch": 0.01741, "grad_norm": 0.402413934469223, "grad_norm_var": 0.0002802639862048021, "learning_rate": 5e-05, "loss": 0.1881, "loss/crossentropy": 2.859429895877838, "loss/hidden": 0.0, "loss/logits": 0.18807627633213997, "loss/reg": 0.29876694083213806, "step": 1741 }, { "epoch": 0.01742, "grad_norm": 0.39315420389175415, "grad_norm_var": 0.00025633763339478756, "learning_rate": 5e-05, "loss": 0.1789, "loss/crossentropy": 2.681659996509552, "loss/hidden": 0.0, "loss/logits": 0.17889350280165672, "loss/reg": 0.29850339889526367, "step": 1742 }, { "epoch": 0.01743, "grad_norm": 0.38321396708488464, "grad_norm_var": 0.00026837489895286483, "learning_rate": 5e-05, "loss": 0.1773, "loss/crossentropy": 2.7574509382247925, "loss/hidden": 0.0, "loss/logits": 0.17731642350554466, "loss/reg": 0.2982405126094818, "step": 1743 }, { "epoch": 0.01744, "grad_norm": 0.41650208830833435, "grad_norm_var": 0.0002848975780182431, "learning_rate": 5e-05, "loss": 0.1959, "loss/crossentropy": 2.8237980008125305, "loss/hidden": 0.0, "loss/logits": 0.19589094817638397, "loss/reg": 0.2980102002620697, "step": 1744 }, { "epoch": 0.01745, "grad_norm": 0.3941941559314728, "grad_norm_var": 0.0002832021818657086, "learning_rate": 5e-05, "loss": 0.1849, "loss/crossentropy": 2.741166055202484, "loss/hidden": 0.0, "loss/logits": 0.18494422733783722, "loss/reg": 0.2979772686958313, "step": 1745 }, { "epoch": 0.01746, "grad_norm": 0.37175920605659485, "grad_norm_var": 0.00030993756847153366, "learning_rate": 5e-05, "loss": 0.1792, "loss/crossentropy": 2.730659544467926, "loss/hidden": 0.0, "loss/logits": 0.1791798248887062, "loss/reg": 0.29750317335128784, "step": 1746 }, { "epoch": 0.01747, "grad_norm": 0.7653926014900208, "grad_norm_var": 0.008868432480744795, "learning_rate": 5e-05, "loss": 0.2, "loss/crossentropy": 2.839723527431488, "loss/hidden": 0.0, "loss/logits": 0.20002424716949463, "loss/reg": 0.29731935262680054, "step": 1747 }, { "epoch": 0.01748, "grad_norm": 0.4967210292816162, "grad_norm_var": 0.009143620447275883, "learning_rate": 5e-05, "loss": 0.1928, "loss/crossentropy": 2.8431330919265747, "loss/hidden": 0.0, "loss/logits": 0.1928383931517601, "loss/reg": 0.2970081567764282, "step": 1748 }, { "epoch": 0.01749, "grad_norm": 0.5016075372695923, "grad_norm_var": 0.00948953935280029, "learning_rate": 5e-05, "loss": 0.1991, "loss/crossentropy": 3.0068525075912476, "loss/hidden": 0.0, "loss/logits": 0.19914891943335533, "loss/reg": 0.29689136147499084, "step": 1749 }, { "epoch": 0.0175, "grad_norm": 0.5093101263046265, "grad_norm_var": 0.009791338767343068, "learning_rate": 5e-05, "loss": 0.1869, "loss/crossentropy": 2.7702261209487915, "loss/hidden": 0.0, "loss/logits": 0.18693333491683006, "loss/reg": 0.29673805832862854, "step": 1750 }, { "epoch": 0.01751, "grad_norm": 0.5022183656692505, "grad_norm_var": 0.009986920920952361, "learning_rate": 5e-05, "loss": 0.198, "loss/crossentropy": 2.7430413961410522, "loss/hidden": 0.0, "loss/logits": 0.1980058178305626, "loss/reg": 0.2966318130493164, "step": 1751 }, { "epoch": 0.01752, "grad_norm": 0.4671650826931, "grad_norm_var": 0.009974710330218732, "learning_rate": 5e-05, "loss": 0.1876, "loss/crossentropy": 2.834463596343994, "loss/hidden": 0.0, "loss/logits": 0.18755127489566803, "loss/reg": 0.29621079564094543, "step": 1752 }, { "epoch": 0.01753, "grad_norm": 0.5265049934387207, "grad_norm_var": 0.010183127884364155, "learning_rate": 5e-05, "loss": 0.1898, "loss/crossentropy": 2.933722198009491, "loss/hidden": 0.0, "loss/logits": 0.18977096304297447, "loss/reg": 0.29570338129997253, "step": 1753 }, { "epoch": 0.01754, "grad_norm": 0.508196234703064, "grad_norm_var": 0.009827264114973592, "learning_rate": 5e-05, "loss": 0.2023, "loss/crossentropy": 2.7292739748954773, "loss/hidden": 0.0, "loss/logits": 0.2023284174501896, "loss/reg": 0.29535311460494995, "step": 1754 }, { "epoch": 0.01755, "grad_norm": 0.44522297382354736, "grad_norm_var": 0.009590020523749543, "learning_rate": 5e-05, "loss": 0.1721, "loss/crossentropy": 2.8829163312911987, "loss/hidden": 0.0, "loss/logits": 0.17206500098109245, "loss/reg": 0.29472070932388306, "step": 1755 }, { "epoch": 0.01756, "grad_norm": 0.4023776054382324, "grad_norm_var": 0.009172797980864057, "learning_rate": 5e-05, "loss": 0.1852, "loss/crossentropy": 2.713328242301941, "loss/hidden": 0.0, "loss/logits": 0.18521857261657715, "loss/reg": 0.2945430874824524, "step": 1756 }, { "epoch": 0.01757, "grad_norm": 0.40760013461112976, "grad_norm_var": 0.009129215114681092, "learning_rate": 5e-05, "loss": 0.1908, "loss/crossentropy": 2.902981758117676, "loss/hidden": 0.0, "loss/logits": 0.190819401293993, "loss/reg": 0.294501394033432, "step": 1757 }, { "epoch": 0.01758, "grad_norm": 0.4122978746891022, "grad_norm_var": 0.008960576043869451, "learning_rate": 5e-05, "loss": 0.2002, "loss/crossentropy": 2.7262309193611145, "loss/hidden": 0.0, "loss/logits": 0.20020971819758415, "loss/reg": 0.2944103181362152, "step": 1758 }, { "epoch": 0.01759, "grad_norm": 0.40353283286094666, "grad_norm_var": 0.00875290555057762, "learning_rate": 5e-05, "loss": 0.1773, "loss/crossentropy": 2.7971346974372864, "loss/hidden": 0.0, "loss/logits": 0.1773214302957058, "loss/reg": 0.29427704215049744, "step": 1759 }, { "epoch": 0.0176, "grad_norm": 0.40881532430648804, "grad_norm_var": 0.008812107736019096, "learning_rate": 5e-05, "loss": 0.1822, "loss/crossentropy": 2.8461914658546448, "loss/hidden": 0.0, "loss/logits": 0.18221157416701317, "loss/reg": 0.2941247224807739, "step": 1760 }, { "epoch": 0.01761, "grad_norm": 0.4345194101333618, "grad_norm_var": 0.008505175364569188, "learning_rate": 5e-05, "loss": 0.173, "loss/crossentropy": 2.893059730529785, "loss/hidden": 0.0, "loss/logits": 0.1730005219578743, "loss/reg": 0.2944605052471161, "step": 1761 }, { "epoch": 0.01762, "grad_norm": 0.4975241422653198, "grad_norm_var": 0.007801041576697291, "learning_rate": 5e-05, "loss": 0.1845, "loss/crossentropy": 2.862951099872589, "loss/hidden": 0.0, "loss/logits": 0.18445667251944542, "loss/reg": 0.29462262988090515, "step": 1762 }, { "epoch": 0.01763, "grad_norm": 0.4073471128940582, "grad_norm_var": 0.0022157283115425567, "learning_rate": 5e-05, "loss": 0.1772, "loss/crossentropy": 2.8038280606269836, "loss/hidden": 0.0, "loss/logits": 0.17718364670872688, "loss/reg": 0.29482150077819824, "step": 1763 }, { "epoch": 0.01764, "grad_norm": 0.3986457288265228, "grad_norm_var": 0.0023129773809520415, "learning_rate": 5e-05, "loss": 0.1811, "loss/crossentropy": 2.7405189275741577, "loss/hidden": 0.0, "loss/logits": 0.1810799427330494, "loss/reg": 0.2950429618358612, "step": 1764 }, { "epoch": 0.01765, "grad_norm": 0.4012739956378937, "grad_norm_var": 0.002279253978165955, "learning_rate": 5e-05, "loss": 0.1815, "loss/crossentropy": 2.7286781668663025, "loss/hidden": 0.0, "loss/logits": 0.18146860972046852, "loss/reg": 0.29542869329452515, "step": 1765 }, { "epoch": 0.01766, "grad_norm": 0.40899789333343506, "grad_norm_var": 0.002058509941745964, "learning_rate": 5e-05, "loss": 0.1808, "loss/crossentropy": 2.7478153705596924, "loss/hidden": 0.0, "loss/logits": 0.1807897500693798, "loss/reg": 0.2954123616218567, "step": 1766 }, { "epoch": 0.01767, "grad_norm": 0.39125075936317444, "grad_norm_var": 0.001900383786244797, "learning_rate": 5e-05, "loss": 0.184, "loss/crossentropy": 2.6805031299591064, "loss/hidden": 0.0, "loss/logits": 0.18396606668829918, "loss/reg": 0.2954240143299103, "step": 1767 }, { "epoch": 0.01768, "grad_norm": 0.3843337893486023, "grad_norm_var": 0.0019472286625075719, "learning_rate": 5e-05, "loss": 0.185, "loss/crossentropy": 2.766793668270111, "loss/hidden": 0.0, "loss/logits": 0.18502875044941902, "loss/reg": 0.29545697569847107, "step": 1768 }, { "epoch": 0.01769, "grad_norm": 0.40911319851875305, "grad_norm_var": 0.0012573556005526232, "learning_rate": 5e-05, "loss": 0.1913, "loss/crossentropy": 2.728518068790436, "loss/hidden": 0.0, "loss/logits": 0.19132709503173828, "loss/reg": 0.2956134080886841, "step": 1769 }, { "epoch": 0.0177, "grad_norm": 0.36639800667762756, "grad_norm_var": 0.0008477902847954355, "learning_rate": 5e-05, "loss": 0.1688, "loss/crossentropy": 2.6968125700950623, "loss/hidden": 0.0, "loss/logits": 0.1687924973666668, "loss/reg": 0.29546263813972473, "step": 1770 }, { "epoch": 0.01771, "grad_norm": 0.381981760263443, "grad_norm_var": 0.0008108955206214575, "learning_rate": 5e-05, "loss": 0.174, "loss/crossentropy": 2.682013690471649, "loss/hidden": 0.0, "loss/logits": 0.17397115752100945, "loss/reg": 0.2952303886413574, "step": 1771 }, { "epoch": 0.01772, "grad_norm": 0.9064951539039612, "grad_norm_var": 0.01636676045746719, "learning_rate": 5e-05, "loss": 0.1953, "loss/crossentropy": 2.8549567461013794, "loss/hidden": 0.0, "loss/logits": 0.19533607363700867, "loss/reg": 0.29505717754364014, "step": 1772 }, { "epoch": 0.01773, "grad_norm": 0.46348196268081665, "grad_norm_var": 0.016329780074087537, "learning_rate": 5e-05, "loss": 0.189, "loss/crossentropy": 2.933027684688568, "loss/hidden": 0.0, "loss/logits": 0.1890040598809719, "loss/reg": 0.2950621247291565, "step": 1773 }, { "epoch": 0.01774, "grad_norm": 0.47669729590415955, "grad_norm_var": 0.016331794009514258, "learning_rate": 5e-05, "loss": 0.1798, "loss/crossentropy": 2.788936674594879, "loss/hidden": 0.0, "loss/logits": 0.17976482585072517, "loss/reg": 0.2948567569255829, "step": 1774 }, { "epoch": 0.01775, "grad_norm": 0.4476977288722992, "grad_norm_var": 0.016202005775361555, "learning_rate": 5e-05, "loss": 0.184, "loss/crossentropy": 2.8527714610099792, "loss/hidden": 0.0, "loss/logits": 0.1840391792356968, "loss/reg": 0.29487699270248413, "step": 1775 }, { "epoch": 0.01776, "grad_norm": 0.43799036741256714, "grad_norm_var": 0.016098746727462622, "learning_rate": 5e-05, "loss": 0.1707, "loss/crossentropy": 2.7468485832214355, "loss/hidden": 0.0, "loss/logits": 0.1707058995962143, "loss/reg": 0.29452523589134216, "step": 1776 }, { "epoch": 0.01777, "grad_norm": 0.463224858045578, "grad_norm_var": 0.016087707835870414, "learning_rate": 5e-05, "loss": 0.185, "loss/crossentropy": 2.995372712612152, "loss/hidden": 0.0, "loss/logits": 0.18500499054789543, "loss/reg": 0.294184148311615, "step": 1777 }, { "epoch": 0.01778, "grad_norm": 0.42039719223976135, "grad_norm_var": 0.015998060355728087, "learning_rate": 5e-05, "loss": 0.1773, "loss/crossentropy": 2.766482412815094, "loss/hidden": 0.0, "loss/logits": 0.17731116712093353, "loss/reg": 0.2940358519554138, "step": 1778 }, { "epoch": 0.01779, "grad_norm": 0.39360329508781433, "grad_norm_var": 0.016084056755880143, "learning_rate": 5e-05, "loss": 0.1814, "loss/crossentropy": 2.754012882709503, "loss/hidden": 0.0, "loss/logits": 0.18136395514011383, "loss/reg": 0.29391273856163025, "step": 1779 }, { "epoch": 0.0178, "grad_norm": 0.3945219814777374, "grad_norm_var": 0.016111692029150502, "learning_rate": 5e-05, "loss": 0.1798, "loss/crossentropy": 2.832540452480316, "loss/hidden": 0.0, "loss/logits": 0.17983149737119675, "loss/reg": 0.2937617599964142, "step": 1780 }, { "epoch": 0.01781, "grad_norm": 0.3858507573604584, "grad_norm_var": 0.016220008094332168, "learning_rate": 5e-05, "loss": 0.1677, "loss/crossentropy": 2.827308773994446, "loss/hidden": 0.0, "loss/logits": 0.1676756888628006, "loss/reg": 0.2933359742164612, "step": 1781 }, { "epoch": 0.01782, "grad_norm": 0.42543327808380127, "grad_norm_var": 0.016156347778849358, "learning_rate": 5e-05, "loss": 0.1772, "loss/crossentropy": 2.7960240840911865, "loss/hidden": 0.0, "loss/logits": 0.17720023542642593, "loss/reg": 0.29312464594841003, "step": 1782 }, { "epoch": 0.01783, "grad_norm": 2.5133259296417236, "grad_norm_var": 0.28189505968747713, "learning_rate": 5e-05, "loss": 0.2226, "loss/crossentropy": 2.629977583885193, "loss/hidden": 0.0, "loss/logits": 0.22264550626277924, "loss/reg": 0.29290342330932617, "step": 1783 }, { "epoch": 0.01784, "grad_norm": 0.5351274013519287, "grad_norm_var": 0.2793940799814661, "learning_rate": 5e-05, "loss": 0.1831, "loss/crossentropy": 2.8067432641983032, "loss/hidden": 0.0, "loss/logits": 0.18308651447296143, "loss/reg": 0.29280903935432434, "step": 1784 }, { "epoch": 0.01785, "grad_norm": 0.5454636216163635, "grad_norm_var": 0.27728871489404205, "learning_rate": 5e-05, "loss": 0.2106, "loss/crossentropy": 2.8134812116622925, "loss/hidden": 0.0, "loss/logits": 0.21064979955554008, "loss/reg": 0.2926865518093109, "step": 1785 }, { "epoch": 0.01786, "grad_norm": 0.5155285596847534, "grad_norm_var": 0.2740863309628429, "learning_rate": 5e-05, "loss": 0.187, "loss/crossentropy": 2.734460711479187, "loss/hidden": 0.0, "loss/logits": 0.18702712282538414, "loss/reg": 0.2925674319267273, "step": 1786 }, { "epoch": 0.01787, "grad_norm": 0.47069650888442993, "grad_norm_var": 0.27192039559150166, "learning_rate": 5e-05, "loss": 0.1909, "loss/crossentropy": 2.716682016849518, "loss/hidden": 0.0, "loss/logits": 0.19088491797447205, "loss/reg": 0.29232555627822876, "step": 1787 }, { "epoch": 0.01788, "grad_norm": 0.43045687675476074, "grad_norm_var": 0.26740557124813624, "learning_rate": 5e-05, "loss": 0.1905, "loss/crossentropy": 2.764638066291809, "loss/hidden": 0.0, "loss/logits": 0.19054853543639183, "loss/reg": 0.2923974394798279, "step": 1788 }, { "epoch": 0.01789, "grad_norm": 0.4076257348060608, "grad_norm_var": 0.2684867187726093, "learning_rate": 5e-05, "loss": 0.1823, "loss/crossentropy": 2.700261890888214, "loss/hidden": 0.0, "loss/logits": 0.18229813501238823, "loss/reg": 0.29219070076942444, "step": 1789 }, { "epoch": 0.0179, "grad_norm": 0.5000810623168945, "grad_norm_var": 0.26820200068201955, "learning_rate": 5e-05, "loss": 0.173, "loss/crossentropy": 2.8919987082481384, "loss/hidden": 0.0, "loss/logits": 0.17296933755278587, "loss/reg": 0.29219815135002136, "step": 1790 }, { "epoch": 0.01791, "grad_norm": 0.5624118447303772, "grad_norm_var": 0.26699415126206244, "learning_rate": 5e-05, "loss": 0.2137, "loss/crossentropy": 2.803538739681244, "loss/hidden": 0.0, "loss/logits": 0.21373149007558823, "loss/reg": 0.2922021746635437, "step": 1791 }, { "epoch": 0.01792, "grad_norm": 0.45283082127571106, "grad_norm_var": 0.2667118623338177, "learning_rate": 5e-05, "loss": 0.1883, "loss/crossentropy": 2.8267714977264404, "loss/hidden": 0.0, "loss/logits": 0.1882677674293518, "loss/reg": 0.2917059659957886, "step": 1792 }, { "epoch": 0.01793, "grad_norm": 0.4044612944126129, "grad_norm_var": 0.26790951700136495, "learning_rate": 5e-05, "loss": 0.1886, "loss/crossentropy": 2.830375552177429, "loss/hidden": 0.0, "loss/logits": 0.18858004733920097, "loss/reg": 0.2916744351387024, "step": 1793 }, { "epoch": 0.01794, "grad_norm": 0.429433673620224, "grad_norm_var": 0.26771646105036556, "learning_rate": 5e-05, "loss": 0.1926, "loss/crossentropy": 2.7985280752182007, "loss/hidden": 0.0, "loss/logits": 0.19264116510748863, "loss/reg": 0.2913740277290344, "step": 1794 }, { "epoch": 0.01795, "grad_norm": 0.4507838487625122, "grad_norm_var": 0.26645832410988446, "learning_rate": 5e-05, "loss": 0.1869, "loss/crossentropy": 2.9009050130844116, "loss/hidden": 0.0, "loss/logits": 0.18690499290823936, "loss/reg": 0.29099005460739136, "step": 1795 }, { "epoch": 0.01796, "grad_norm": 0.39600175619125366, "grad_norm_var": 0.2664200894049838, "learning_rate": 5e-05, "loss": 0.1856, "loss/crossentropy": 2.6873438954353333, "loss/hidden": 0.0, "loss/logits": 0.18564420193433762, "loss/reg": 0.2907087206840515, "step": 1796 }, { "epoch": 0.01797, "grad_norm": 0.3983283042907715, "grad_norm_var": 0.2660916887661533, "learning_rate": 5e-05, "loss": 0.1832, "loss/crossentropy": 2.8260722756385803, "loss/hidden": 0.0, "loss/logits": 0.18320371583104134, "loss/reg": 0.29045289754867554, "step": 1797 }, { "epoch": 0.01798, "grad_norm": 0.4237523674964905, "grad_norm_var": 0.26612872013543504, "learning_rate": 5e-05, "loss": 0.1892, "loss/crossentropy": 2.8483822345733643, "loss/hidden": 0.0, "loss/logits": 0.18923310190439224, "loss/reg": 0.2902470827102661, "step": 1798 }, { "epoch": 0.01799, "grad_norm": 0.4298318326473236, "grad_norm_var": 0.003075444644164982, "learning_rate": 5e-05, "loss": 0.1918, "loss/crossentropy": 2.7422080636024475, "loss/hidden": 0.0, "loss/logits": 0.19181104004383087, "loss/reg": 0.29007354378700256, "step": 1799 }, { "epoch": 0.018, "grad_norm": 0.4445195496082306, "grad_norm_var": 0.002675513648957036, "learning_rate": 5e-05, "loss": 0.1789, "loss/crossentropy": 2.80060213804245, "loss/hidden": 0.0, "loss/logits": 0.1789444498717785, "loss/reg": 0.29011183977127075, "step": 1800 }, { "epoch": 0.01801, "grad_norm": 0.4318268895149231, "grad_norm_var": 0.002095081086988441, "learning_rate": 5e-05, "loss": 0.1875, "loss/crossentropy": 2.8809146881103516, "loss/hidden": 0.0, "loss/logits": 0.1875174529850483, "loss/reg": 0.2897431254386902, "step": 1801 }, { "epoch": 0.01802, "grad_norm": 0.4308742880821228, "grad_norm_var": 0.0017670606040069796, "learning_rate": 5e-05, "loss": 0.1992, "loss/crossentropy": 2.7047346234321594, "loss/hidden": 0.0, "loss/logits": 0.19920287653803825, "loss/reg": 0.2892533540725708, "step": 1802 }, { "epoch": 0.01803, "grad_norm": 0.3983348309993744, "grad_norm_var": 0.0018125791719831124, "learning_rate": 5e-05, "loss": 0.1824, "loss/crossentropy": 2.7841886281967163, "loss/hidden": 0.0, "loss/logits": 0.18243521451950073, "loss/reg": 0.2888230085372925, "step": 1803 }, { "epoch": 0.01804, "grad_norm": 0.3934668004512787, "grad_norm_var": 0.0019302293523735063, "learning_rate": 5e-05, "loss": 0.1822, "loss/crossentropy": 2.8564711213111877, "loss/hidden": 0.0, "loss/logits": 0.18224915117025375, "loss/reg": 0.2886658012866974, "step": 1804 }, { "epoch": 0.01805, "grad_norm": 0.3787465989589691, "grad_norm_var": 0.002086452640810125, "learning_rate": 5e-05, "loss": 0.1782, "loss/crossentropy": 2.7245002388954163, "loss/hidden": 0.0, "loss/logits": 0.17817376554012299, "loss/reg": 0.28839489817619324, "step": 1805 }, { "epoch": 0.01806, "grad_norm": 0.3995632827281952, "grad_norm_var": 0.0018169578673726408, "learning_rate": 5e-05, "loss": 0.1752, "loss/crossentropy": 2.7673813700675964, "loss/hidden": 0.0, "loss/logits": 0.1752001866698265, "loss/reg": 0.2880570888519287, "step": 1806 }, { "epoch": 0.01807, "grad_norm": 0.4311852753162384, "grad_norm_var": 0.0005164782205177539, "learning_rate": 5e-05, "loss": 0.1984, "loss/crossentropy": 2.71965491771698, "loss/hidden": 0.0, "loss/logits": 0.19843441992998123, "loss/reg": 0.28791508078575134, "step": 1807 }, { "epoch": 0.01808, "grad_norm": 0.3885496258735657, "grad_norm_var": 0.0004793864920144627, "learning_rate": 5e-05, "loss": 0.184, "loss/crossentropy": 2.7418529391288757, "loss/hidden": 0.0, "loss/logits": 0.18398427590727806, "loss/reg": 0.2879105508327484, "step": 1808 }, { "epoch": 0.01809, "grad_norm": 0.4516981840133667, "grad_norm_var": 0.0005565389618607567, "learning_rate": 5e-05, "loss": 0.2031, "loss/crossentropy": 2.8339706659317017, "loss/hidden": 0.0, "loss/logits": 0.2030727006494999, "loss/reg": 0.28796589374542236, "step": 1809 }, { "epoch": 0.0181, "grad_norm": 0.43539348244667053, "grad_norm_var": 0.0005683960132204291, "learning_rate": 5e-05, "loss": 0.195, "loss/crossentropy": 2.7498282194137573, "loss/hidden": 0.0, "loss/logits": 0.19498739764094353, "loss/reg": 0.28785622119903564, "step": 1810 }, { "epoch": 0.01811, "grad_norm": 0.4696669280529022, "grad_norm_var": 0.000674032326662651, "learning_rate": 5e-05, "loss": 0.2118, "loss/crossentropy": 2.677243411540985, "loss/hidden": 0.0, "loss/logits": 0.2117900289595127, "loss/reg": 0.28758811950683594, "step": 1811 }, { "epoch": 0.01812, "grad_norm": 0.4130764901638031, "grad_norm_var": 0.0006402170407412792, "learning_rate": 5e-05, "loss": 0.1909, "loss/crossentropy": 2.676861882209778, "loss/hidden": 0.0, "loss/logits": 0.19092785194516182, "loss/reg": 0.28740450739860535, "step": 1812 }, { "epoch": 0.01813, "grad_norm": 0.35778436064720154, "grad_norm_var": 0.0008597089232768066, "learning_rate": 5e-05, "loss": 0.176, "loss/crossentropy": 2.7366560101509094, "loss/hidden": 0.0, "loss/logits": 0.17595980316400528, "loss/reg": 0.28695616126060486, "step": 1813 }, { "epoch": 0.01814, "grad_norm": 0.4103051424026489, "grad_norm_var": 0.0008596066229808094, "learning_rate": 5e-05, "loss": 0.1926, "loss/crossentropy": 2.8703550696372986, "loss/hidden": 0.0, "loss/logits": 0.19264446943998337, "loss/reg": 0.2869335114955902, "step": 1814 }, { "epoch": 0.01815, "grad_norm": 0.41271740198135376, "grad_norm_var": 0.000847608333088464, "learning_rate": 5e-05, "loss": 0.1804, "loss/crossentropy": 3.024072229862213, "loss/hidden": 0.0, "loss/logits": 0.1803882196545601, "loss/reg": 0.2870616912841797, "step": 1815 }, { "epoch": 0.01816, "grad_norm": 0.3776957392692566, "grad_norm_var": 0.0008679756263098212, "learning_rate": 5e-05, "loss": 0.1939, "loss/crossentropy": 2.664080262184143, "loss/hidden": 0.0, "loss/logits": 0.1939038522541523, "loss/reg": 0.2867726981639862, "step": 1816 }, { "epoch": 0.01817, "grad_norm": 0.4147520661354065, "grad_norm_var": 0.0008394772144945884, "learning_rate": 5e-05, "loss": 0.1787, "loss/crossentropy": 2.689032733440399, "loss/hidden": 0.0, "loss/logits": 0.1787305288016796, "loss/reg": 0.2868281602859497, "step": 1817 }, { "epoch": 0.01818, "grad_norm": 0.39301252365112305, "grad_norm_var": 0.0008248957407844852, "learning_rate": 5e-05, "loss": 0.1809, "loss/crossentropy": 2.655192196369171, "loss/hidden": 0.0, "loss/logits": 0.18089300021529198, "loss/reg": 0.28639689087867737, "step": 1818 }, { "epoch": 0.01819, "grad_norm": 0.4132724404335022, "grad_norm_var": 0.0008198469076731237, "learning_rate": 5e-05, "loss": 0.183, "loss/crossentropy": 2.6796045899391174, "loss/hidden": 0.0, "loss/logits": 0.18301057815551758, "loss/reg": 0.2865252196788788, "step": 1819 }, { "epoch": 0.0182, "grad_norm": 0.40159738063812256, "grad_norm_var": 0.00080735032897525, "learning_rate": 5e-05, "loss": 0.188, "loss/crossentropy": 2.8699146509170532, "loss/hidden": 0.0, "loss/logits": 0.18802717328071594, "loss/reg": 0.28615501523017883, "step": 1820 }, { "epoch": 0.01821, "grad_norm": 0.709716260433197, "grad_norm_var": 0.006304759499767683, "learning_rate": 5e-05, "loss": 0.1876, "loss/crossentropy": 2.7997403144836426, "loss/hidden": 0.0, "loss/logits": 0.18760208413004875, "loss/reg": 0.2859438955783844, "step": 1821 }, { "epoch": 0.01822, "grad_norm": 0.5454288721084595, "grad_norm_var": 0.007042617982155082, "learning_rate": 5e-05, "loss": 0.2003, "loss/crossentropy": 2.6838915944099426, "loss/hidden": 0.0, "loss/logits": 0.2002948448061943, "loss/reg": 0.28592756390571594, "step": 1822 }, { "epoch": 0.01823, "grad_norm": 0.4290259778499603, "grad_norm_var": 0.007045192629477148, "learning_rate": 5e-05, "loss": 0.1889, "loss/crossentropy": 2.711521089076996, "loss/hidden": 0.0, "loss/logits": 0.18885130062699318, "loss/reg": 0.28590691089630127, "step": 1823 }, { "epoch": 0.01824, "grad_norm": 0.38728779554367065, "grad_norm_var": 0.007053776888441698, "learning_rate": 5e-05, "loss": 0.1799, "loss/crossentropy": 2.7658904790878296, "loss/hidden": 0.0, "loss/logits": 0.179933063685894, "loss/reg": 0.2859230637550354, "step": 1824 }, { "epoch": 0.01825, "grad_norm": 0.42048969864845276, "grad_norm_var": 0.0070614031348301244, "learning_rate": 5e-05, "loss": 0.1825, "loss/crossentropy": 2.6710073351860046, "loss/hidden": 0.0, "loss/logits": 0.18247348442673683, "loss/reg": 0.28594648838043213, "step": 1825 }, { "epoch": 0.01826, "grad_norm": 0.4239570200443268, "grad_norm_var": 0.007071953300871537, "learning_rate": 5e-05, "loss": 0.2069, "loss/crossentropy": 2.8063488602638245, "loss/hidden": 0.0, "loss/logits": 0.2069377452135086, "loss/reg": 0.28589680790901184, "step": 1826 }, { "epoch": 0.01827, "grad_norm": 0.3886289596557617, "grad_norm_var": 0.007121183874868106, "learning_rate": 5e-05, "loss": 0.1785, "loss/crossentropy": 2.767171561717987, "loss/hidden": 0.0, "loss/logits": 0.17853541299700737, "loss/reg": 0.28593909740448, "step": 1827 }, { "epoch": 0.01828, "grad_norm": 0.48891201615333557, "grad_norm_var": 0.007297654507580905, "learning_rate": 5e-05, "loss": 0.1895, "loss/crossentropy": 2.8321388959884644, "loss/hidden": 0.0, "loss/logits": 0.1895015761256218, "loss/reg": 0.28601542115211487, "step": 1828 }, { "epoch": 0.01829, "grad_norm": 0.4100649654865265, "grad_norm_var": 0.006923878963645195, "learning_rate": 5e-05, "loss": 0.1943, "loss/crossentropy": 2.829595446586609, "loss/hidden": 0.0, "loss/logits": 0.1943066529929638, "loss/reg": 0.2862739562988281, "step": 1829 }, { "epoch": 0.0183, "grad_norm": 0.4316028356552124, "grad_norm_var": 0.006870235526090277, "learning_rate": 5e-05, "loss": 0.1831, "loss/crossentropy": 2.9004554748535156, "loss/hidden": 0.0, "loss/logits": 0.1830880083143711, "loss/reg": 0.2859758138656616, "step": 1830 }, { "epoch": 0.01831, "grad_norm": 0.4355422258377075, "grad_norm_var": 0.006818214453751929, "learning_rate": 5e-05, "loss": 0.1865, "loss/crossentropy": 2.7657106518745422, "loss/hidden": 0.0, "loss/logits": 0.18653302639722824, "loss/reg": 0.28651851415634155, "step": 1831 }, { "epoch": 0.01832, "grad_norm": 0.43425384163856506, "grad_norm_var": 0.006533694592397291, "learning_rate": 5e-05, "loss": 0.196, "loss/crossentropy": 2.7590981125831604, "loss/hidden": 0.0, "loss/logits": 0.1959908865392208, "loss/reg": 0.2869378924369812, "step": 1832 }, { "epoch": 0.01833, "grad_norm": 0.39966991543769836, "grad_norm_var": 0.0066096870081289874, "learning_rate": 5e-05, "loss": 0.1857, "loss/crossentropy": 2.661256492137909, "loss/hidden": 0.0, "loss/logits": 0.1856737956404686, "loss/reg": 0.28677791357040405, "step": 1833 }, { "epoch": 0.01834, "grad_norm": 0.3755083382129669, "grad_norm_var": 0.006749070465186927, "learning_rate": 5e-05, "loss": 0.1732, "loss/crossentropy": 2.7987326979637146, "loss/hidden": 0.0, "loss/logits": 0.1732139103114605, "loss/reg": 0.286994993686676, "step": 1834 }, { "epoch": 0.01835, "grad_norm": 0.41761380434036255, "grad_norm_var": 0.006732788929913861, "learning_rate": 5e-05, "loss": 0.1939, "loss/crossentropy": 2.8797848224639893, "loss/hidden": 0.0, "loss/logits": 0.19392692670226097, "loss/reg": 0.28666937351226807, "step": 1835 }, { "epoch": 0.01836, "grad_norm": 0.42143893241882324, "grad_norm_var": 0.006645993685495159, "learning_rate": 5e-05, "loss": 0.1877, "loss/crossentropy": 2.8627849221229553, "loss/hidden": 0.0, "loss/logits": 0.18769410997629166, "loss/reg": 0.2864522933959961, "step": 1836 }, { "epoch": 0.01837, "grad_norm": 0.4315630793571472, "grad_norm_var": 0.0016620221566348312, "learning_rate": 5e-05, "loss": 0.1923, "loss/crossentropy": 3.107477605342865, "loss/hidden": 0.0, "loss/logits": 0.19227750226855278, "loss/reg": 0.286095529794693, "step": 1837 }, { "epoch": 0.01838, "grad_norm": 0.4683064818382263, "grad_norm_var": 0.0008217378859434484, "learning_rate": 5e-05, "loss": 0.1972, "loss/crossentropy": 2.489143431186676, "loss/hidden": 0.0, "loss/logits": 0.19717977195978165, "loss/reg": 0.2857098877429962, "step": 1838 }, { "epoch": 0.01839, "grad_norm": 0.4195990562438965, "grad_norm_var": 0.0008193931084515567, "learning_rate": 5e-05, "loss": 0.1951, "loss/crossentropy": 2.8369109630584717, "loss/hidden": 0.0, "loss/logits": 0.19511422514915466, "loss/reg": 0.28555694222450256, "step": 1839 }, { "epoch": 0.0184, "grad_norm": 0.37779441475868225, "grad_norm_var": 0.0008691569828260359, "learning_rate": 5e-05, "loss": 0.1704, "loss/crossentropy": 2.6417484879493713, "loss/hidden": 0.0, "loss/logits": 0.1704365611076355, "loss/reg": 0.2853318750858307, "step": 1840 }, { "epoch": 0.01841, "grad_norm": 0.4020920395851135, "grad_norm_var": 0.0008929348610217584, "learning_rate": 5e-05, "loss": 0.1828, "loss/crossentropy": 2.7724534273147583, "loss/hidden": 0.0, "loss/logits": 0.18278496339917183, "loss/reg": 0.2851176857948303, "step": 1841 }, { "epoch": 0.01842, "grad_norm": 0.4984844923019409, "grad_norm_var": 0.0012753355919444302, "learning_rate": 5e-05, "loss": 0.2054, "loss/crossentropy": 2.9141579270362854, "loss/hidden": 0.0, "loss/logits": 0.20539025217294693, "loss/reg": 0.2849942445755005, "step": 1842 }, { "epoch": 0.01843, "grad_norm": 0.40646782517433167, "grad_norm_var": 0.0012085557166622038, "learning_rate": 5e-05, "loss": 0.1811, "loss/crossentropy": 2.7275006771087646, "loss/hidden": 0.0, "loss/logits": 0.18112165480852127, "loss/reg": 0.2849220037460327, "step": 1843 }, { "epoch": 0.01844, "grad_norm": 0.3913916349411011, "grad_norm_var": 0.000987285925394573, "learning_rate": 5e-05, "loss": 0.1869, "loss/crossentropy": 2.9048088788986206, "loss/hidden": 0.0, "loss/logits": 0.1868954598903656, "loss/reg": 0.284819096326828, "step": 1844 }, { "epoch": 0.01845, "grad_norm": 0.4697960913181305, "grad_norm_var": 0.0011304559627877205, "learning_rate": 5e-05, "loss": 0.1971, "loss/crossentropy": 2.733458638191223, "loss/hidden": 0.0, "loss/logits": 0.1971411257982254, "loss/reg": 0.2844506502151489, "step": 1845 }, { "epoch": 0.01846, "grad_norm": 0.4240216016769409, "grad_norm_var": 0.0011261813404522148, "learning_rate": 5e-05, "loss": 0.1728, "loss/crossentropy": 2.8301478028297424, "loss/hidden": 0.0, "loss/logits": 0.17284953594207764, "loss/reg": 0.28430625796318054, "step": 1846 }, { "epoch": 0.01847, "grad_norm": 0.4012894332408905, "grad_norm_var": 0.0011438112831025139, "learning_rate": 5e-05, "loss": 0.1709, "loss/crossentropy": 2.836243987083435, "loss/hidden": 0.0, "loss/logits": 0.17090784013271332, "loss/reg": 0.2841241657733917, "step": 1847 }, { "epoch": 0.01848, "grad_norm": 0.3927440047264099, "grad_norm_var": 0.0011792860370338184, "learning_rate": 5e-05, "loss": 0.1862, "loss/crossentropy": 2.8864346742630005, "loss/hidden": 0.0, "loss/logits": 0.1862446777522564, "loss/reg": 0.28361642360687256, "step": 1848 }, { "epoch": 0.01849, "grad_norm": 0.4403141736984253, "grad_norm_var": 0.0011798853496232812, "learning_rate": 5e-05, "loss": 0.1902, "loss/crossentropy": 2.79424124956131, "loss/hidden": 0.0, "loss/logits": 0.19023774191737175, "loss/reg": 0.28345903754234314, "step": 1849 }, { "epoch": 0.0185, "grad_norm": 0.40912845730781555, "grad_norm_var": 0.0010459256771254883, "learning_rate": 5e-05, "loss": 0.1909, "loss/crossentropy": 2.7717979550361633, "loss/hidden": 0.0, "loss/logits": 0.19086836278438568, "loss/reg": 0.2831501066684723, "step": 1850 }, { "epoch": 0.01851, "grad_norm": 0.578755795955658, "grad_norm_var": 0.0025476888488598195, "learning_rate": 5e-05, "loss": 0.1921, "loss/crossentropy": 2.6009531021118164, "loss/hidden": 0.0, "loss/logits": 0.19208815693855286, "loss/reg": 0.28307682275772095, "step": 1851 }, { "epoch": 0.01852, "grad_norm": 0.45383477210998535, "grad_norm_var": 0.002561944152675422, "learning_rate": 5e-05, "loss": 0.1931, "loss/crossentropy": 2.7235918045043945, "loss/hidden": 0.0, "loss/logits": 0.19309358671307564, "loss/reg": 0.28260084986686707, "step": 1852 }, { "epoch": 0.01853, "grad_norm": 0.4577047824859619, "grad_norm_var": 0.002591460028373194, "learning_rate": 5e-05, "loss": 0.2236, "loss/crossentropy": 2.7366344928741455, "loss/hidden": 0.0, "loss/logits": 0.22360356152057648, "loss/reg": 0.2821139097213745, "step": 1853 }, { "epoch": 0.01854, "grad_norm": 0.4365105628967285, "grad_norm_var": 0.00252185100890883, "learning_rate": 5e-05, "loss": 0.1947, "loss/crossentropy": 2.8033213019371033, "loss/hidden": 0.0, "loss/logits": 0.19469663873314857, "loss/reg": 0.28183096647262573, "step": 1854 }, { "epoch": 0.01855, "grad_norm": 0.4712860584259033, "grad_norm_var": 0.002582716019453246, "learning_rate": 5e-05, "loss": 0.1998, "loss/crossentropy": 2.758160352706909, "loss/hidden": 0.0, "loss/logits": 0.199786227196455, "loss/reg": 0.2815592288970947, "step": 1855 }, { "epoch": 0.01856, "grad_norm": 0.4058321416378021, "grad_norm_var": 0.002405932882053252, "learning_rate": 5e-05, "loss": 0.1815, "loss/crossentropy": 2.8052437901496887, "loss/hidden": 0.0, "loss/logits": 0.18153153732419014, "loss/reg": 0.28127020597457886, "step": 1856 }, { "epoch": 0.01857, "grad_norm": 0.42083102464675903, "grad_norm_var": 0.0023332195538674607, "learning_rate": 5e-05, "loss": 0.1992, "loss/crossentropy": 2.59348601102829, "loss/hidden": 0.0, "loss/logits": 0.19922863319516182, "loss/reg": 0.28071773052215576, "step": 1857 }, { "epoch": 0.01858, "grad_norm": 0.47982022166252136, "grad_norm_var": 0.0022123097654171975, "learning_rate": 5e-05, "loss": 0.2089, "loss/crossentropy": 2.681568682193756, "loss/hidden": 0.0, "loss/logits": 0.20894664525985718, "loss/reg": 0.28021734952926636, "step": 1858 }, { "epoch": 0.01859, "grad_norm": 0.5198923945426941, "grad_norm_var": 0.0025095207876681476, "learning_rate": 5e-05, "loss": 0.1985, "loss/crossentropy": 2.7710816860198975, "loss/hidden": 0.0, "loss/logits": 0.19853570312261581, "loss/reg": 0.27992287278175354, "step": 1859 }, { "epoch": 0.0186, "grad_norm": 0.45905089378356934, "grad_norm_var": 0.00229332546431484, "learning_rate": 5e-05, "loss": 0.1774, "loss/crossentropy": 2.6997227668762207, "loss/hidden": 0.0, "loss/logits": 0.177433829754591, "loss/reg": 0.2796526849269867, "step": 1860 }, { "epoch": 0.01861, "grad_norm": 0.42669880390167236, "grad_norm_var": 0.0023031317509346247, "learning_rate": 5e-05, "loss": 0.1789, "loss/crossentropy": 2.842024505138397, "loss/hidden": 0.0, "loss/logits": 0.17894761636853218, "loss/reg": 0.2793560326099396, "step": 1861 }, { "epoch": 0.01862, "grad_norm": 0.4246305823326111, "grad_norm_var": 0.0023011586427452713, "learning_rate": 5e-05, "loss": 0.1814, "loss/crossentropy": 2.6766517758369446, "loss/hidden": 0.0, "loss/logits": 0.18142684549093246, "loss/reg": 0.27923518419265747, "step": 1862 }, { "epoch": 0.01863, "grad_norm": 0.39616042375564575, "grad_norm_var": 0.002335187942751793, "learning_rate": 5e-05, "loss": 0.1835, "loss/crossentropy": 2.8127864599227905, "loss/hidden": 0.0, "loss/logits": 0.18350589647889137, "loss/reg": 0.2786794900894165, "step": 1863 }, { "epoch": 0.01864, "grad_norm": 0.43346020579338074, "grad_norm_var": 0.0021370630745344956, "learning_rate": 5e-05, "loss": 0.1898, "loss/crossentropy": 2.86939400434494, "loss/hidden": 0.0, "loss/logits": 0.1898091360926628, "loss/reg": 0.2781749665737152, "step": 1864 }, { "epoch": 0.01865, "grad_norm": 0.39947545528411865, "grad_norm_var": 0.0022987758586734703, "learning_rate": 5e-05, "loss": 0.1791, "loss/crossentropy": 2.901140809059143, "loss/hidden": 0.0, "loss/logits": 0.17914026230573654, "loss/reg": 0.2776210308074951, "step": 1865 }, { "epoch": 0.01866, "grad_norm": 0.40816158056259155, "grad_norm_var": 0.002303886356898159, "learning_rate": 5e-05, "loss": 0.1991, "loss/crossentropy": 2.946498155593872, "loss/hidden": 0.0, "loss/logits": 0.19913273304700851, "loss/reg": 0.2769990563392639, "step": 1866 }, { "epoch": 0.01867, "grad_norm": 0.4050334393978119, "grad_norm_var": 0.0011673521943027362, "learning_rate": 5e-05, "loss": 0.1799, "loss/crossentropy": 2.846403658390045, "loss/hidden": 0.0, "loss/logits": 0.17989295348525047, "loss/reg": 0.2766755521297455, "step": 1867 }, { "epoch": 0.01868, "grad_norm": 0.4267737567424774, "grad_norm_var": 0.0011538182148522424, "learning_rate": 5e-05, "loss": 0.1866, "loss/crossentropy": 2.851919412612915, "loss/hidden": 0.0, "loss/logits": 0.18656672164797783, "loss/reg": 0.2762860059738159, "step": 1868 }, { "epoch": 0.01869, "grad_norm": 0.44364508986473083, "grad_norm_var": 0.0011249365016800183, "learning_rate": 5e-05, "loss": 0.1934, "loss/crossentropy": 2.6769986748695374, "loss/hidden": 0.0, "loss/logits": 0.1933923326432705, "loss/reg": 0.27565231919288635, "step": 1869 }, { "epoch": 0.0187, "grad_norm": 0.402190238237381, "grad_norm_var": 0.0011908589995408077, "learning_rate": 5e-05, "loss": 0.1785, "loss/crossentropy": 2.936454713344574, "loss/hidden": 0.0, "loss/logits": 0.17851043865084648, "loss/reg": 0.2749617099761963, "step": 1870 }, { "epoch": 0.01871, "grad_norm": 0.4893626272678375, "grad_norm_var": 0.0013043209358951054, "learning_rate": 5e-05, "loss": 0.1904, "loss/crossentropy": 2.803850769996643, "loss/hidden": 0.0, "loss/logits": 0.19039420410990715, "loss/reg": 0.27434176206588745, "step": 1871 }, { "epoch": 0.01872, "grad_norm": 0.44162189960479736, "grad_norm_var": 0.0012508506114582022, "learning_rate": 5e-05, "loss": 0.1967, "loss/crossentropy": 2.7133684754371643, "loss/hidden": 0.0, "loss/logits": 0.1966903693974018, "loss/reg": 0.2741237282752991, "step": 1872 }, { "epoch": 0.01873, "grad_norm": 0.4145720899105072, "grad_norm_var": 0.001266000063265397, "learning_rate": 5e-05, "loss": 0.192, "loss/crossentropy": 2.747166335582733, "loss/hidden": 0.0, "loss/logits": 0.1920209936797619, "loss/reg": 0.2738998234272003, "step": 1873 }, { "epoch": 0.01874, "grad_norm": 0.416154146194458, "grad_norm_var": 0.0011444624388339522, "learning_rate": 5e-05, "loss": 0.1902, "loss/crossentropy": 2.8506762981414795, "loss/hidden": 0.0, "loss/logits": 0.1902056373655796, "loss/reg": 0.2732773423194885, "step": 1874 }, { "epoch": 0.01875, "grad_norm": 0.3881121277809143, "grad_norm_var": 0.0006798901233864248, "learning_rate": 5e-05, "loss": 0.1793, "loss/crossentropy": 2.650409758090973, "loss/hidden": 0.0, "loss/logits": 0.17934256419539452, "loss/reg": 0.2730664610862732, "step": 1875 }, { "epoch": 0.01876, "grad_norm": 0.38840100169181824, "grad_norm_var": 0.0006564362729794955, "learning_rate": 5e-05, "loss": 0.1819, "loss/crossentropy": 2.863765001296997, "loss/hidden": 0.0, "loss/logits": 0.1818632073700428, "loss/reg": 0.2727936804294586, "step": 1876 }, { "epoch": 0.01877, "grad_norm": 0.42450594902038574, "grad_norm_var": 0.0006544941165524065, "learning_rate": 5e-05, "loss": 0.1926, "loss/crossentropy": 2.6840816140174866, "loss/hidden": 0.0, "loss/logits": 0.19261902570724487, "loss/reg": 0.27240055799484253, "step": 1877 }, { "epoch": 0.01878, "grad_norm": 0.4350152015686035, "grad_norm_var": 0.0006691808540507882, "learning_rate": 5e-05, "loss": 0.2014, "loss/crossentropy": 2.756488621234894, "loss/hidden": 0.0, "loss/logits": 0.2013804353773594, "loss/reg": 0.2722313106060028, "step": 1878 }, { "epoch": 0.01879, "grad_norm": 0.40308424830436707, "grad_norm_var": 0.0006505932834232101, "learning_rate": 5e-05, "loss": 0.1931, "loss/crossentropy": 2.6871517300605774, "loss/hidden": 0.0, "loss/logits": 0.19308482855558395, "loss/reg": 0.271931916475296, "step": 1879 }, { "epoch": 0.0188, "grad_norm": 0.38713574409484863, "grad_norm_var": 0.000701410919604406, "learning_rate": 5e-05, "loss": 0.1867, "loss/crossentropy": 2.697225272655487, "loss/hidden": 0.0, "loss/logits": 0.18666525930166245, "loss/reg": 0.2716715931892395, "step": 1880 }, { "epoch": 0.01881, "grad_norm": 0.401745080947876, "grad_norm_var": 0.0006964061090712903, "learning_rate": 5e-05, "loss": 0.1831, "loss/crossentropy": 2.497227907180786, "loss/hidden": 0.0, "loss/logits": 0.1831391677260399, "loss/reg": 0.2715659439563751, "step": 1881 }, { "epoch": 0.01882, "grad_norm": 0.39038655161857605, "grad_norm_var": 0.0007376207204813393, "learning_rate": 5e-05, "loss": 0.1842, "loss/crossentropy": 2.729752480983734, "loss/hidden": 0.0, "loss/logits": 0.18423685058951378, "loss/reg": 0.2712423503398895, "step": 1882 }, { "epoch": 0.01883, "grad_norm": 0.36909186840057373, "grad_norm_var": 0.0008714329697182036, "learning_rate": 5e-05, "loss": 0.1734, "loss/crossentropy": 2.758967339992523, "loss/hidden": 0.0, "loss/logits": 0.17344772443175316, "loss/reg": 0.27110445499420166, "step": 1883 }, { "epoch": 0.01884, "grad_norm": 0.3961617350578308, "grad_norm_var": 0.0008773022320247829, "learning_rate": 5e-05, "loss": 0.1835, "loss/crossentropy": 2.7413728833198547, "loss/hidden": 0.0, "loss/logits": 0.18351080641150475, "loss/reg": 0.27082017064094543, "step": 1884 }, { "epoch": 0.01885, "grad_norm": 0.35756000876426697, "grad_norm_var": 0.0009766603915590329, "learning_rate": 5e-05, "loss": 0.1692, "loss/crossentropy": 2.7662473320961, "loss/hidden": 0.0, "loss/logits": 0.1691870205104351, "loss/reg": 0.2704273462295532, "step": 1885 }, { "epoch": 0.01886, "grad_norm": 0.3999634087085724, "grad_norm_var": 0.0009782703508609066, "learning_rate": 5e-05, "loss": 0.182, "loss/crossentropy": 2.8441994190216064, "loss/hidden": 0.0, "loss/logits": 0.1819535531103611, "loss/reg": 0.27025607228279114, "step": 1886 }, { "epoch": 0.01887, "grad_norm": 0.4377036690711975, "grad_norm_var": 0.0005738297149364839, "learning_rate": 5e-05, "loss": 0.1904, "loss/crossentropy": 2.710397183895111, "loss/hidden": 0.0, "loss/logits": 0.19038205221295357, "loss/reg": 0.2703438699245453, "step": 1887 }, { "epoch": 0.01888, "grad_norm": 0.44405144453048706, "grad_norm_var": 0.0005866446988916844, "learning_rate": 5e-05, "loss": 0.1841, "loss/crossentropy": 2.958820402622223, "loss/hidden": 0.0, "loss/logits": 0.1840880662202835, "loss/reg": 0.27023187279701233, "step": 1888 }, { "epoch": 0.01889, "grad_norm": 0.4236486554145813, "grad_norm_var": 0.0006053714237732858, "learning_rate": 5e-05, "loss": 0.1878, "loss/crossentropy": 2.810550570487976, "loss/hidden": 0.0, "loss/logits": 0.18775535374879837, "loss/reg": 0.2701106369495392, "step": 1889 }, { "epoch": 0.0189, "grad_norm": 0.42999762296676636, "grad_norm_var": 0.0006399306914619059, "learning_rate": 5e-05, "loss": 0.1904, "loss/crossentropy": 2.7731493711471558, "loss/hidden": 0.0, "loss/logits": 0.19040461257100105, "loss/reg": 0.2700783312320709, "step": 1890 }, { "epoch": 0.01891, "grad_norm": 0.4342472553253174, "grad_norm_var": 0.0006703964778035994, "learning_rate": 5e-05, "loss": 0.1956, "loss/crossentropy": 2.756820857524872, "loss/hidden": 0.0, "loss/logits": 0.19563548266887665, "loss/reg": 0.2700011432170868, "step": 1891 }, { "epoch": 0.01892, "grad_norm": 0.3801390826702118, "grad_norm_var": 0.0006958877897457823, "learning_rate": 5e-05, "loss": 0.1946, "loss/crossentropy": 2.917654037475586, "loss/hidden": 0.0, "loss/logits": 0.1945624016225338, "loss/reg": 0.2699950933456421, "step": 1892 }, { "epoch": 0.01893, "grad_norm": 0.4050667881965637, "grad_norm_var": 0.0006745267517155207, "learning_rate": 5e-05, "loss": 0.1757, "loss/crossentropy": 2.699453055858612, "loss/hidden": 0.0, "loss/logits": 0.17572908848524094, "loss/reg": 0.2700875401496887, "step": 1893 }, { "epoch": 0.01894, "grad_norm": 0.4661920666694641, "grad_norm_var": 0.0008561505275783513, "learning_rate": 5e-05, "loss": 0.189, "loss/crossentropy": 2.7657936215400696, "loss/hidden": 0.0, "loss/logits": 0.18904687091708183, "loss/reg": 0.27017372846603394, "step": 1894 }, { "epoch": 0.01895, "grad_norm": 0.4022267162799835, "grad_norm_var": 0.0008567455029347415, "learning_rate": 5e-05, "loss": 0.1803, "loss/crossentropy": 2.8219807147979736, "loss/hidden": 0.0, "loss/logits": 0.18025044724345207, "loss/reg": 0.27011817693710327, "step": 1895 }, { "epoch": 0.01896, "grad_norm": 0.3908122479915619, "grad_norm_var": 0.0008474448054988315, "learning_rate": 5e-05, "loss": 0.1742, "loss/crossentropy": 2.7481996417045593, "loss/hidden": 0.0, "loss/logits": 0.1741633377969265, "loss/reg": 0.2699486017227173, "step": 1896 }, { "epoch": 0.01897, "grad_norm": 0.3891170620918274, "grad_norm_var": 0.0008680477391055152, "learning_rate": 5e-05, "loss": 0.1898, "loss/crossentropy": 2.8849986791610718, "loss/hidden": 0.0, "loss/logits": 0.18975405022501945, "loss/reg": 0.26975497603416443, "step": 1897 }, { "epoch": 0.01898, "grad_norm": 0.4161876142024994, "grad_norm_var": 0.0008515622816494872, "learning_rate": 5e-05, "loss": 0.1946, "loss/crossentropy": 2.679152250289917, "loss/hidden": 0.0, "loss/logits": 0.19457165151834488, "loss/reg": 0.2697330415248871, "step": 1898 }, { "epoch": 0.01899, "grad_norm": 0.3866231441497803, "grad_norm_var": 0.0007777537431073345, "learning_rate": 5e-05, "loss": 0.1848, "loss/crossentropy": 2.674207627773285, "loss/hidden": 0.0, "loss/logits": 0.18481703847646713, "loss/reg": 0.2695638835430145, "step": 1899 }, { "epoch": 0.019, "grad_norm": 0.3786781132221222, "grad_norm_var": 0.000829073698738425, "learning_rate": 5e-05, "loss": 0.177, "loss/crossentropy": 2.6509466767311096, "loss/hidden": 0.0, "loss/logits": 0.1770479381084442, "loss/reg": 0.2697155475616455, "step": 1900 }, { "epoch": 0.01901, "grad_norm": 0.39138707518577576, "grad_norm_var": 0.0006690852725194874, "learning_rate": 5e-05, "loss": 0.1796, "loss/crossentropy": 2.86653733253479, "loss/hidden": 0.0, "loss/logits": 0.17957644164562225, "loss/reg": 0.26970818638801575, "step": 1901 }, { "epoch": 0.01902, "grad_norm": 0.43527013063430786, "grad_norm_var": 0.0006950277653280157, "learning_rate": 5e-05, "loss": 0.2, "loss/crossentropy": 2.8745383620262146, "loss/hidden": 0.0, "loss/logits": 0.19998397678136826, "loss/reg": 0.2694828510284424, "step": 1902 }, { "epoch": 0.01903, "grad_norm": 0.3599347174167633, "grad_norm_var": 0.0008190414543429392, "learning_rate": 5e-05, "loss": 0.1835, "loss/crossentropy": 2.916846752166748, "loss/hidden": 0.0, "loss/logits": 0.18350648507475853, "loss/reg": 0.26949837803840637, "step": 1903 }, { "epoch": 0.01904, "grad_norm": 0.39248159527778625, "grad_norm_var": 0.0007397659158129954, "learning_rate": 5e-05, "loss": 0.1914, "loss/crossentropy": 2.8454513549804688, "loss/hidden": 0.0, "loss/logits": 0.1914045698940754, "loss/reg": 0.26927462220191956, "step": 1904 }, { "epoch": 0.01905, "grad_norm": 0.5987407565116882, "grad_norm_var": 0.0030882753298692835, "learning_rate": 5e-05, "loss": 0.2005, "loss/crossentropy": 2.872275233268738, "loss/hidden": 0.0, "loss/logits": 0.2004827745258808, "loss/reg": 0.2695675492286682, "step": 1905 }, { "epoch": 0.01906, "grad_norm": 0.441245973110199, "grad_norm_var": 0.0031170732251211437, "learning_rate": 5e-05, "loss": 0.193, "loss/crossentropy": 2.7498526573181152, "loss/hidden": 0.0, "loss/logits": 0.19295179843902588, "loss/reg": 0.26951608061790466, "step": 1906 }, { "epoch": 0.01907, "grad_norm": 0.4036622643470764, "grad_norm_var": 0.003104273836133098, "learning_rate": 5e-05, "loss": 0.1833, "loss/crossentropy": 2.9569591879844666, "loss/hidden": 0.0, "loss/logits": 0.18328739702701569, "loss/reg": 0.26906752586364746, "step": 1907 }, { "epoch": 0.01908, "grad_norm": 0.4065428078174591, "grad_norm_var": 0.0030256100788648155, "learning_rate": 5e-05, "loss": 0.1744, "loss/crossentropy": 2.9296292662620544, "loss/hidden": 0.0, "loss/logits": 0.17443006113171577, "loss/reg": 0.2690469026565552, "step": 1908 }, { "epoch": 0.01909, "grad_norm": 0.41113191843032837, "grad_norm_var": 0.0030186547904314407, "learning_rate": 5e-05, "loss": 0.188, "loss/crossentropy": 2.855474829673767, "loss/hidden": 0.0, "loss/logits": 0.18804873153567314, "loss/reg": 0.26883524656295776, "step": 1909 }, { "epoch": 0.0191, "grad_norm": 0.413989782333374, "grad_norm_var": 0.0028458122740410226, "learning_rate": 5e-05, "loss": 0.1809, "loss/crossentropy": 2.7152153849601746, "loss/hidden": 0.0, "loss/logits": 0.18093138560652733, "loss/reg": 0.26858463883399963, "step": 1910 }, { "epoch": 0.01911, "grad_norm": 0.3984268605709076, "grad_norm_var": 0.0028524906273631246, "learning_rate": 5e-05, "loss": 0.1686, "loss/crossentropy": 2.6562029123306274, "loss/hidden": 0.0, "loss/logits": 0.1685929261147976, "loss/reg": 0.26806753873825073, "step": 1911 }, { "epoch": 0.01912, "grad_norm": 0.43265825510025024, "grad_norm_var": 0.002835964578320112, "learning_rate": 5e-05, "loss": 0.1871, "loss/crossentropy": 2.697164237499237, "loss/hidden": 0.0, "loss/logits": 0.1870552934706211, "loss/reg": 0.2676653563976288, "step": 1912 }, { "epoch": 0.01913, "grad_norm": 0.4212251603603363, "grad_norm_var": 0.0027852888110065995, "learning_rate": 5e-05, "loss": 0.174, "loss/crossentropy": 2.49627023935318, "loss/hidden": 0.0, "loss/logits": 0.1740170232951641, "loss/reg": 0.26754942536354065, "step": 1913 }, { "epoch": 0.01914, "grad_norm": 0.40805354714393616, "grad_norm_var": 0.0027914022296908164, "learning_rate": 5e-05, "loss": 0.1861, "loss/crossentropy": 2.7017217874526978, "loss/hidden": 0.0, "loss/logits": 0.18606874346733093, "loss/reg": 0.2672904133796692, "step": 1914 }, { "epoch": 0.01915, "grad_norm": 0.47244638204574585, "grad_norm_var": 0.002898389827784579, "learning_rate": 5e-05, "loss": 0.1922, "loss/crossentropy": 2.964231252670288, "loss/hidden": 0.0, "loss/logits": 0.19221315160393715, "loss/reg": 0.2669829726219177, "step": 1915 }, { "epoch": 0.01916, "grad_norm": 0.5235002040863037, "grad_norm_var": 0.00335595540915445, "learning_rate": 5e-05, "loss": 0.2, "loss/crossentropy": 2.758783519268036, "loss/hidden": 0.0, "loss/logits": 0.1999942846596241, "loss/reg": 0.26658549904823303, "step": 1916 }, { "epoch": 0.01917, "grad_norm": 0.4030700922012329, "grad_norm_var": 0.003301348831871683, "learning_rate": 5e-05, "loss": 0.1818, "loss/crossentropy": 2.796988606452942, "loss/hidden": 0.0, "loss/logits": 0.18176335841417313, "loss/reg": 0.26672980189323425, "step": 1917 }, { "epoch": 0.01918, "grad_norm": 0.42077726125717163, "grad_norm_var": 0.0033094110795455953, "learning_rate": 5e-05, "loss": 0.1907, "loss/crossentropy": 2.7011221051216125, "loss/hidden": 0.0, "loss/logits": 0.1906937062740326, "loss/reg": 0.26661765575408936, "step": 1918 }, { "epoch": 0.01919, "grad_norm": 0.42605507373809814, "grad_norm_var": 0.0029495899262365695, "learning_rate": 5e-05, "loss": 0.1898, "loss/crossentropy": 2.8211509585380554, "loss/hidden": 0.0, "loss/logits": 0.18978893756866455, "loss/reg": 0.2668221890926361, "step": 1919 }, { "epoch": 0.0192, "grad_norm": 0.38060420751571655, "grad_norm_var": 0.003027127772042822, "learning_rate": 5e-05, "loss": 0.1792, "loss/crossentropy": 2.782521665096283, "loss/hidden": 0.0, "loss/logits": 0.17924562469124794, "loss/reg": 0.26662570238113403, "step": 1920 }, { "epoch": 0.01921, "grad_norm": 1.0707707405090332, "grad_norm_var": 0.027249922425339065, "learning_rate": 5e-05, "loss": 0.1911, "loss/crossentropy": 2.7368358969688416, "loss/hidden": 0.0, "loss/logits": 0.1911153867840767, "loss/reg": 0.26646286249160767, "step": 1921 }, { "epoch": 0.01922, "grad_norm": 0.3721320331096649, "grad_norm_var": 0.027764003148303143, "learning_rate": 5e-05, "loss": 0.1705, "loss/crossentropy": 2.901717960834503, "loss/hidden": 0.0, "loss/logits": 0.1704598143696785, "loss/reg": 0.2661983370780945, "step": 1922 }, { "epoch": 0.01923, "grad_norm": 0.4181463122367859, "grad_norm_var": 0.02766770595568625, "learning_rate": 5e-05, "loss": 0.1768, "loss/crossentropy": 2.8132722973823547, "loss/hidden": 0.0, "loss/logits": 0.17675171419978142, "loss/reg": 0.2662166357040405, "step": 1923 }, { "epoch": 0.01924, "grad_norm": 0.4323657155036926, "grad_norm_var": 0.027521123531636722, "learning_rate": 5e-05, "loss": 0.1755, "loss/crossentropy": 2.7973023056983948, "loss/hidden": 0.0, "loss/logits": 0.17552904784679413, "loss/reg": 0.26579412817955017, "step": 1924 }, { "epoch": 0.01925, "grad_norm": 0.4927521347999573, "grad_norm_var": 0.027374825259902025, "learning_rate": 5e-05, "loss": 0.194, "loss/crossentropy": 2.9093286395072937, "loss/hidden": 0.0, "loss/logits": 0.1939934454858303, "loss/reg": 0.26533132791519165, "step": 1925 }, { "epoch": 0.01926, "grad_norm": 0.5082671642303467, "grad_norm_var": 0.027252219975209848, "learning_rate": 5e-05, "loss": 0.1877, "loss/crossentropy": 2.788951277732849, "loss/hidden": 0.0, "loss/logits": 0.18765148892998695, "loss/reg": 0.26488885283470154, "step": 1926 }, { "epoch": 0.01927, "grad_norm": 0.45780715346336365, "grad_norm_var": 0.026875615719080058, "learning_rate": 5e-05, "loss": 0.1898, "loss/crossentropy": 2.809519052505493, "loss/hidden": 0.0, "loss/logits": 0.18977425247430801, "loss/reg": 0.26454493403434753, "step": 1927 }, { "epoch": 0.01928, "grad_norm": 0.4502750039100647, "grad_norm_var": 0.02678959111757919, "learning_rate": 5e-05, "loss": 0.2015, "loss/crossentropy": 2.853406071662903, "loss/hidden": 0.0, "loss/logits": 0.2014762908220291, "loss/reg": 0.264074444770813, "step": 1928 }, { "epoch": 0.01929, "grad_norm": 0.4113198220729828, "grad_norm_var": 0.026871552480788572, "learning_rate": 5e-05, "loss": 0.1866, "loss/crossentropy": 2.8419198989868164, "loss/hidden": 0.0, "loss/logits": 0.18659916520118713, "loss/reg": 0.2640066146850586, "step": 1929 }, { "epoch": 0.0193, "grad_norm": 0.4102359414100647, "grad_norm_var": 0.026851490491655115, "learning_rate": 5e-05, "loss": 0.1798, "loss/crossentropy": 2.7696731090545654, "loss/hidden": 0.0, "loss/logits": 0.1797773241996765, "loss/reg": 0.2636849582195282, "step": 1930 }, { "epoch": 0.01931, "grad_norm": 0.5081679224967957, "grad_norm_var": 0.026904039385318063, "learning_rate": 5e-05, "loss": 0.1799, "loss/crossentropy": 2.756717324256897, "loss/hidden": 0.0, "loss/logits": 0.17987844347953796, "loss/reg": 0.26325714588165283, "step": 1931 }, { "epoch": 0.01932, "grad_norm": 0.37792888283729553, "grad_norm_var": 0.027391737795562542, "learning_rate": 5e-05, "loss": 0.1775, "loss/crossentropy": 2.778018593788147, "loss/hidden": 0.0, "loss/logits": 0.1774960234761238, "loss/reg": 0.2633077800273895, "step": 1932 }, { "epoch": 0.01933, "grad_norm": 0.4282725751399994, "grad_norm_var": 0.027202186694615504, "learning_rate": 5e-05, "loss": 0.2092, "loss/crossentropy": 2.6867257356643677, "loss/hidden": 0.0, "loss/logits": 0.20919723063707352, "loss/reg": 0.26265817880630493, "step": 1933 }, { "epoch": 0.01934, "grad_norm": 0.39677292108535767, "grad_norm_var": 0.027404918213135298, "learning_rate": 5e-05, "loss": 0.1899, "loss/crossentropy": 2.7861551642417908, "loss/hidden": 0.0, "loss/logits": 0.18993796035647392, "loss/reg": 0.2624299228191376, "step": 1934 }, { "epoch": 0.01935, "grad_norm": 0.4345720708370209, "grad_norm_var": 0.02735799559582528, "learning_rate": 5e-05, "loss": 0.1804, "loss/crossentropy": 2.7524542808532715, "loss/hidden": 0.0, "loss/logits": 0.18044590577483177, "loss/reg": 0.26210200786590576, "step": 1935 }, { "epoch": 0.01936, "grad_norm": 0.4304160475730896, "grad_norm_var": 0.026906727521972408, "learning_rate": 5e-05, "loss": 0.1903, "loss/crossentropy": 2.931870758533478, "loss/hidden": 0.0, "loss/logits": 0.19031662493944168, "loss/reg": 0.2622711658477783, "step": 1936 }, { "epoch": 0.01937, "grad_norm": 0.4203381836414337, "grad_norm_var": 0.0016814069403744748, "learning_rate": 5e-05, "loss": 0.1758, "loss/crossentropy": 2.7717989683151245, "loss/hidden": 0.0, "loss/logits": 0.17579283565282822, "loss/reg": 0.2618957459926605, "step": 1937 }, { "epoch": 0.01938, "grad_norm": 0.4128052592277527, "grad_norm_var": 0.0014473297587235478, "learning_rate": 5e-05, "loss": 0.1835, "loss/crossentropy": 2.633501410484314, "loss/hidden": 0.0, "loss/logits": 0.18347061052918434, "loss/reg": 0.2618264853954315, "step": 1938 }, { "epoch": 0.01939, "grad_norm": 0.37451133131980896, "grad_norm_var": 0.0016754550649258904, "learning_rate": 5e-05, "loss": 0.1833, "loss/crossentropy": 2.7058133482933044, "loss/hidden": 0.0, "loss/logits": 0.18334686756134033, "loss/reg": 0.26167500019073486, "step": 1939 }, { "epoch": 0.0194, "grad_norm": 0.4017476439476013, "grad_norm_var": 0.0017414350235883114, "learning_rate": 5e-05, "loss": 0.1869, "loss/crossentropy": 2.6937740445137024, "loss/hidden": 0.0, "loss/logits": 0.18694143369793892, "loss/reg": 0.2614714205265045, "step": 1940 }, { "epoch": 0.01941, "grad_norm": 0.4206730127334595, "grad_norm_var": 0.0014848029056289235, "learning_rate": 5e-05, "loss": 0.1922, "loss/crossentropy": 2.667712688446045, "loss/hidden": 0.0, "loss/logits": 0.1921989545226097, "loss/reg": 0.26124048233032227, "step": 1941 }, { "epoch": 0.01942, "grad_norm": 0.4543474614620209, "grad_norm_var": 0.0010876996074879574, "learning_rate": 5e-05, "loss": 0.1812, "loss/crossentropy": 2.731380879878998, "loss/hidden": 0.0, "loss/logits": 0.18122068792581558, "loss/reg": 0.26093557476997375, "step": 1942 }, { "epoch": 0.01943, "grad_norm": 0.47828689217567444, "grad_norm_var": 0.0012051716042224213, "learning_rate": 5e-05, "loss": 0.2131, "loss/crossentropy": 2.813185155391693, "loss/hidden": 0.0, "loss/logits": 0.21305937692523003, "loss/reg": 0.26078593730926514, "step": 1943 }, { "epoch": 0.01944, "grad_norm": 0.40145817399024963, "grad_norm_var": 0.0011939425673134552, "learning_rate": 5e-05, "loss": 0.1769, "loss/crossentropy": 2.9048438668251038, "loss/hidden": 0.0, "loss/logits": 0.17687463015317917, "loss/reg": 0.2608109414577484, "step": 1944 }, { "epoch": 0.01945, "grad_norm": 0.46941348910331726, "grad_norm_var": 0.0013173749145744319, "learning_rate": 5e-05, "loss": 0.1886, "loss/crossentropy": 2.928474187850952, "loss/hidden": 0.0, "loss/logits": 0.1886068768799305, "loss/reg": 0.26084601879119873, "step": 1945 }, { "epoch": 0.01946, "grad_norm": 0.42269355058670044, "grad_norm_var": 0.0013004802499192558, "learning_rate": 5e-05, "loss": 0.1994, "loss/crossentropy": 2.825789511203766, "loss/hidden": 0.0, "loss/logits": 0.19935093075037003, "loss/reg": 0.2607755661010742, "step": 1946 }, { "epoch": 0.01947, "grad_norm": 0.4244174659252167, "grad_norm_var": 0.0008327668098045748, "learning_rate": 5e-05, "loss": 0.1841, "loss/crossentropy": 2.8915027379989624, "loss/hidden": 0.0, "loss/logits": 0.18410703912377357, "loss/reg": 0.260786235332489, "step": 1947 }, { "epoch": 0.01948, "grad_norm": 0.4118313193321228, "grad_norm_var": 0.0007063320343105084, "learning_rate": 5e-05, "loss": 0.1968, "loss/crossentropy": 2.6807077527046204, "loss/hidden": 0.0, "loss/logits": 0.1967647820711136, "loss/reg": 0.2608233392238617, "step": 1948 }, { "epoch": 0.01949, "grad_norm": 0.4010051488876343, "grad_norm_var": 0.0007369401503618978, "learning_rate": 5e-05, "loss": 0.1891, "loss/crossentropy": 2.7997159361839294, "loss/hidden": 0.0, "loss/logits": 0.1891281045973301, "loss/reg": 0.26086315512657166, "step": 1949 }, { "epoch": 0.0195, "grad_norm": 0.401782363653183, "grad_norm_var": 0.000721521402768129, "learning_rate": 5e-05, "loss": 0.1824, "loss/crossentropy": 2.7234673500061035, "loss/hidden": 0.0, "loss/logits": 0.18237195909023285, "loss/reg": 0.2608618140220642, "step": 1950 }, { "epoch": 0.01951, "grad_norm": 0.4690000116825104, "grad_norm_var": 0.0008509312341088167, "learning_rate": 5e-05, "loss": 0.2104, "loss/crossentropy": 2.7913509011268616, "loss/hidden": 0.0, "loss/logits": 0.21035091951489449, "loss/reg": 0.2607622444629669, "step": 1951 }, { "epoch": 0.01952, "grad_norm": 0.4157317876815796, "grad_norm_var": 0.0008531586580746227, "learning_rate": 5e-05, "loss": 0.1872, "loss/crossentropy": 2.7069193720817566, "loss/hidden": 0.0, "loss/logits": 0.18720722571015358, "loss/reg": 0.2605006694793701, "step": 1952 }, { "epoch": 0.01953, "grad_norm": 0.3935359716415405, "grad_norm_var": 0.0009102582573822903, "learning_rate": 5e-05, "loss": 0.1859, "loss/crossentropy": 2.759518265724182, "loss/hidden": 0.0, "loss/logits": 0.18588988110423088, "loss/reg": 0.2604720890522003, "step": 1953 }, { "epoch": 0.01954, "grad_norm": 0.4643121659755707, "grad_norm_var": 0.0010123900429575116, "learning_rate": 5e-05, "loss": 0.1853, "loss/crossentropy": 2.7099204063415527, "loss/hidden": 0.0, "loss/logits": 0.18534060195088387, "loss/reg": 0.260466605424881, "step": 1954 }, { "epoch": 0.01955, "grad_norm": 0.4161752164363861, "grad_norm_var": 0.0008387601935913443, "learning_rate": 5e-05, "loss": 0.1889, "loss/crossentropy": 2.7642754912376404, "loss/hidden": 0.0, "loss/logits": 0.18887390568852425, "loss/reg": 0.2601833939552307, "step": 1955 }, { "epoch": 0.01956, "grad_norm": 0.4402320683002472, "grad_norm_var": 0.0007971276825827397, "learning_rate": 5e-05, "loss": 0.1935, "loss/crossentropy": 2.9624531865119934, "loss/hidden": 0.0, "loss/logits": 0.19350137561559677, "loss/reg": 0.26018860936164856, "step": 1956 }, { "epoch": 0.01957, "grad_norm": 0.4067208766937256, "grad_norm_var": 0.0008272141752552494, "learning_rate": 5e-05, "loss": 0.1791, "loss/crossentropy": 2.819717586040497, "loss/hidden": 0.0, "loss/logits": 0.17906580120325089, "loss/reg": 0.25983577966690063, "step": 1957 }, { "epoch": 0.01958, "grad_norm": 0.538730800151825, "grad_norm_var": 0.0015525525822944494, "learning_rate": 5e-05, "loss": 0.1974, "loss/crossentropy": 2.845821797847748, "loss/hidden": 0.0, "loss/logits": 0.19743062183260918, "loss/reg": 0.2597758173942566, "step": 1958 }, { "epoch": 0.01959, "grad_norm": 0.4568873643875122, "grad_norm_var": 0.0014568313328278274, "learning_rate": 5e-05, "loss": 0.1845, "loss/crossentropy": 2.8381652235984802, "loss/hidden": 0.0, "loss/logits": 0.18450325727462769, "loss/reg": 0.2595166563987732, "step": 1959 }, { "epoch": 0.0196, "grad_norm": 0.41597622632980347, "grad_norm_var": 0.0014082307494467335, "learning_rate": 5e-05, "loss": 0.182, "loss/crossentropy": 2.753128945827484, "loss/hidden": 0.0, "loss/logits": 0.18201114237308502, "loss/reg": 0.2594538629055023, "step": 1960 }, { "epoch": 0.01961, "grad_norm": 0.42390555143356323, "grad_norm_var": 0.0013244732133061395, "learning_rate": 5e-05, "loss": 0.2278, "loss/crossentropy": 2.7025694847106934, "loss/hidden": 0.0, "loss/logits": 0.22779600322246552, "loss/reg": 0.2591736614704132, "step": 1961 }, { "epoch": 0.01962, "grad_norm": 0.5574085116386414, "grad_norm_var": 0.0023017417122839965, "learning_rate": 5e-05, "loss": 0.1924, "loss/crossentropy": 2.8173160552978516, "loss/hidden": 0.0, "loss/logits": 0.19237131252884865, "loss/reg": 0.25885117053985596, "step": 1962 }, { "epoch": 0.01963, "grad_norm": 0.44277575612068176, "grad_norm_var": 0.002285022477945716, "learning_rate": 5e-05, "loss": 0.2129, "loss/crossentropy": 2.73001891374588, "loss/hidden": 0.0, "loss/logits": 0.21286843717098236, "loss/reg": 0.25850170850753784, "step": 1963 }, { "epoch": 0.01964, "grad_norm": 0.43977436423301697, "grad_norm_var": 0.0022251458432345477, "learning_rate": 5e-05, "loss": 0.1925, "loss/crossentropy": 2.8810256719589233, "loss/hidden": 0.0, "loss/logits": 0.19253381714224815, "loss/reg": 0.2583758533000946, "step": 1964 }, { "epoch": 0.01965, "grad_norm": 0.46944957971572876, "grad_norm_var": 0.0021370016383161863, "learning_rate": 5e-05, "loss": 0.1883, "loss/crossentropy": 2.7972511649131775, "loss/hidden": 0.0, "loss/logits": 0.18826791644096375, "loss/reg": 0.2581353485584259, "step": 1965 }, { "epoch": 0.01966, "grad_norm": 0.39771679043769836, "grad_norm_var": 0.0021625596135145587, "learning_rate": 5e-05, "loss": 0.1797, "loss/crossentropy": 2.842879295349121, "loss/hidden": 0.0, "loss/logits": 0.17973103001713753, "loss/reg": 0.2576601207256317, "step": 1966 }, { "epoch": 0.01967, "grad_norm": 0.5614247918128967, "grad_norm_var": 0.0029703930089769533, "learning_rate": 5e-05, "loss": 0.1999, "loss/crossentropy": 2.8458929657936096, "loss/hidden": 0.0, "loss/logits": 0.19989408180117607, "loss/reg": 0.2572888135910034, "step": 1967 }, { "epoch": 0.01968, "grad_norm": 0.40444087982177734, "grad_norm_var": 0.0030337849670695268, "learning_rate": 5e-05, "loss": 0.1839, "loss/crossentropy": 2.7752053141593933, "loss/hidden": 0.0, "loss/logits": 0.18385807052254677, "loss/reg": 0.25686630606651306, "step": 1968 }, { "epoch": 0.01969, "grad_norm": 0.4082469344139099, "grad_norm_var": 0.0029329463253580544, "learning_rate": 5e-05, "loss": 0.1896, "loss/crossentropy": 2.7645137906074524, "loss/hidden": 0.0, "loss/logits": 0.1896432787179947, "loss/reg": 0.25636938214302063, "step": 1969 }, { "epoch": 0.0197, "grad_norm": 0.38901177048683167, "grad_norm_var": 0.0031713575357870264, "learning_rate": 5e-05, "loss": 0.178, "loss/crossentropy": 2.7305874824523926, "loss/hidden": 0.0, "loss/logits": 0.17798662185668945, "loss/reg": 0.2562461495399475, "step": 1970 }, { "epoch": 0.01971, "grad_norm": 0.3760077655315399, "grad_norm_var": 0.0034429329855772293, "learning_rate": 5e-05, "loss": 0.176, "loss/crossentropy": 2.8130123615264893, "loss/hidden": 0.0, "loss/logits": 0.17601368203759193, "loss/reg": 0.2560918629169464, "step": 1971 }, { "epoch": 0.01972, "grad_norm": 1.1986591815948486, "grad_norm_var": 0.03885646351467405, "learning_rate": 5e-05, "loss": 0.2182, "loss/crossentropy": 2.8340243697166443, "loss/hidden": 0.0, "loss/logits": 0.21824359148740768, "loss/reg": 0.2555854916572571, "step": 1972 }, { "epoch": 0.01973, "grad_norm": 0.4706854522228241, "grad_norm_var": 0.03837679913214947, "learning_rate": 5e-05, "loss": 0.1898, "loss/crossentropy": 2.6575511693954468, "loss/hidden": 0.0, "loss/logits": 0.18976152315735817, "loss/reg": 0.25547683238983154, "step": 1973 }, { "epoch": 0.01974, "grad_norm": 0.44499078392982483, "grad_norm_var": 0.03840371738988433, "learning_rate": 5e-05, "loss": 0.1852, "loss/crossentropy": 2.7605016231536865, "loss/hidden": 0.0, "loss/logits": 0.1852148286998272, "loss/reg": 0.25506606698036194, "step": 1974 }, { "epoch": 0.01975, "grad_norm": 0.5894598364830017, "grad_norm_var": 0.03889769310051439, "learning_rate": 5e-05, "loss": 0.2028, "loss/crossentropy": 2.8285914063453674, "loss/hidden": 0.0, "loss/logits": 0.20277630537748337, "loss/reg": 0.2547808885574341, "step": 1975 }, { "epoch": 0.01976, "grad_norm": 0.493312805891037, "grad_norm_var": 0.0384115745613452, "learning_rate": 5e-05, "loss": 0.2019, "loss/crossentropy": 2.8811811804771423, "loss/hidden": 0.0, "loss/logits": 0.20190729573369026, "loss/reg": 0.254622220993042, "step": 1976 }, { "epoch": 0.01977, "grad_norm": 0.4299650192260742, "grad_norm_var": 0.038348993593695826, "learning_rate": 5e-05, "loss": 0.1829, "loss/crossentropy": 2.8572728633880615, "loss/hidden": 0.0, "loss/logits": 0.18294143676757812, "loss/reg": 0.25444284081459045, "step": 1977 }, { "epoch": 0.01978, "grad_norm": 0.470027893781662, "grad_norm_var": 0.03821074920698377, "learning_rate": 5e-05, "loss": 0.1817, "loss/crossentropy": 2.8667048811912537, "loss/hidden": 0.0, "loss/logits": 0.18173803761601448, "loss/reg": 0.25441354513168335, "step": 1978 }, { "epoch": 0.01979, "grad_norm": 0.5100470185279846, "grad_norm_var": 0.037988191743139235, "learning_rate": 5e-05, "loss": 0.1995, "loss/crossentropy": 2.555562973022461, "loss/hidden": 0.0, "loss/logits": 0.19947363436222076, "loss/reg": 0.25430208444595337, "step": 1979 }, { "epoch": 0.0198, "grad_norm": 0.4082716703414917, "grad_norm_var": 0.03831715895082874, "learning_rate": 5e-05, "loss": 0.201, "loss/crossentropy": 2.7957499027252197, "loss/hidden": 0.0, "loss/logits": 0.20104172825813293, "loss/reg": 0.2544541656970978, "step": 1980 }, { "epoch": 0.01981, "grad_norm": 0.41018468141555786, "grad_norm_var": 0.038788814513196504, "learning_rate": 5e-05, "loss": 0.1825, "loss/crossentropy": 2.743241310119629, "loss/hidden": 0.0, "loss/logits": 0.18249813094735146, "loss/reg": 0.2543153166770935, "step": 1981 }, { "epoch": 0.01982, "grad_norm": 0.6020731329917908, "grad_norm_var": 0.03867588709653149, "learning_rate": 5e-05, "loss": 0.2164, "loss/crossentropy": 2.8209879994392395, "loss/hidden": 0.0, "loss/logits": 0.21636280417442322, "loss/reg": 0.2540426552295685, "step": 1982 }, { "epoch": 0.01983, "grad_norm": 0.4378150701522827, "grad_norm_var": 0.038790314533802515, "learning_rate": 5e-05, "loss": 0.1922, "loss/crossentropy": 2.7217661142349243, "loss/hidden": 0.0, "loss/logits": 0.1921749822795391, "loss/reg": 0.2539899945259094, "step": 1983 }, { "epoch": 0.01984, "grad_norm": 0.456143856048584, "grad_norm_var": 0.038280017577062196, "learning_rate": 5e-05, "loss": 0.2041, "loss/crossentropy": 2.7847670912742615, "loss/hidden": 0.0, "loss/logits": 0.20405444502830505, "loss/reg": 0.25370466709136963, "step": 1984 }, { "epoch": 0.01985, "grad_norm": 0.43637850880622864, "grad_norm_var": 0.03796307668378175, "learning_rate": 5e-05, "loss": 0.2002, "loss/crossentropy": 2.637964367866516, "loss/hidden": 0.0, "loss/logits": 0.20022683963179588, "loss/reg": 0.2534542679786682, "step": 1985 }, { "epoch": 0.01986, "grad_norm": 0.4424703121185303, "grad_norm_var": 0.03729577729387613, "learning_rate": 5e-05, "loss": 0.2032, "loss/crossentropy": 2.7843973636627197, "loss/hidden": 0.0, "loss/logits": 0.2031734250485897, "loss/reg": 0.2534501850605011, "step": 1986 }, { "epoch": 0.01987, "grad_norm": 0.4896067678928375, "grad_norm_var": 0.03605719201363548, "learning_rate": 5e-05, "loss": 0.2055, "loss/crossentropy": 2.7897618412971497, "loss/hidden": 0.0, "loss/logits": 0.2054971568286419, "loss/reg": 0.25356993079185486, "step": 1987 }, { "epoch": 0.01988, "grad_norm": 0.43150532245635986, "grad_norm_var": 0.0032306721763952478, "learning_rate": 5e-05, "loss": 0.1865, "loss/crossentropy": 2.82052481174469, "loss/hidden": 0.0, "loss/logits": 0.18651033192873, "loss/reg": 0.2535029649734497, "step": 1988 }, { "epoch": 0.01989, "grad_norm": 0.5410552024841309, "grad_norm_var": 0.003544874419156611, "learning_rate": 5e-05, "loss": 0.2033, "loss/crossentropy": 3.023108422756195, "loss/hidden": 0.0, "loss/logits": 0.2032521776854992, "loss/reg": 0.2534503936767578, "step": 1989 }, { "epoch": 0.0199, "grad_norm": 0.4383617043495178, "grad_norm_var": 0.003573775738698363, "learning_rate": 5e-05, "loss": 0.1933, "loss/crossentropy": 2.866779148578644, "loss/hidden": 0.0, "loss/logits": 0.19326722249388695, "loss/reg": 0.253338485956192, "step": 1990 }, { "epoch": 0.01991, "grad_norm": 0.4365501403808594, "grad_norm_var": 0.0026845346764073834, "learning_rate": 5e-05, "loss": 0.2008, "loss/crossentropy": 2.7836719751358032, "loss/hidden": 0.0, "loss/logits": 0.20078910514712334, "loss/reg": 0.2533318102359772, "step": 1991 }, { "epoch": 0.01992, "grad_norm": 0.43848541378974915, "grad_norm_var": 0.0026625898543444363, "learning_rate": 5e-05, "loss": 0.203, "loss/crossentropy": 2.666118800640106, "loss/hidden": 0.0, "loss/logits": 0.2029670737683773, "loss/reg": 0.25323694944381714, "step": 1992 }, { "epoch": 0.01993, "grad_norm": 0.40215420722961426, "grad_norm_var": 0.0028266927643893157, "learning_rate": 5e-05, "loss": 0.183, "loss/crossentropy": 2.8887206315994263, "loss/hidden": 0.0, "loss/logits": 0.18301741033792496, "loss/reg": 0.253271222114563, "step": 1993 }, { "epoch": 0.01994, "grad_norm": 0.5366244316101074, "grad_norm_var": 0.0031978516033143796, "learning_rate": 5e-05, "loss": 0.2148, "loss/crossentropy": 2.804258704185486, "loss/hidden": 0.0, "loss/logits": 0.2148107923567295, "loss/reg": 0.25308483839035034, "step": 1994 }, { "epoch": 0.01995, "grad_norm": 0.40444281697273254, "grad_norm_var": 0.0032409791762972725, "learning_rate": 5e-05, "loss": 0.1844, "loss/crossentropy": 2.891745090484619, "loss/hidden": 0.0, "loss/logits": 0.1844337061047554, "loss/reg": 0.25290459394454956, "step": 1995 }, { "epoch": 0.01996, "grad_norm": 0.47780659794807434, "grad_norm_var": 0.0030913257826184334, "learning_rate": 5e-05, "loss": 0.2412, "loss/crossentropy": 2.826097071170807, "loss/hidden": 0.0, "loss/logits": 0.24118012562394142, "loss/reg": 0.2526180148124695, "step": 1996 }, { "epoch": 0.01997, "grad_norm": 0.5234355330467224, "grad_norm_var": 0.0031202784791425093, "learning_rate": 5e-05, "loss": 0.2032, "loss/crossentropy": 3.04313200712204, "loss/hidden": 0.0, "loss/logits": 0.20319189876317978, "loss/reg": 0.2526146173477173, "step": 1997 }, { "epoch": 0.01998, "grad_norm": 0.49061840772628784, "grad_norm_var": 0.0019106690113487054, "learning_rate": 5e-05, "loss": 0.2004, "loss/crossentropy": 2.8717005848884583, "loss/hidden": 0.0, "loss/logits": 0.20044127106666565, "loss/reg": 0.25241363048553467, "step": 1998 }, { "epoch": 0.01999, "grad_norm": 0.44386622309684753, "grad_norm_var": 0.0018938755731678321, "learning_rate": 5e-05, "loss": 0.2069, "loss/crossentropy": 2.6812584400177, "loss/hidden": 0.0, "loss/logits": 0.206902377307415, "loss/reg": 0.2522028088569641, "step": 1999 }, { "epoch": 0.02, "grad_norm": 0.5584570169448853, "grad_norm_var": 0.0024703633050866683, "learning_rate": 5e-05, "loss": 0.2362, "loss/crossentropy": 2.7835442423820496, "loss/hidden": 0.0, "loss/logits": 0.23615377396345139, "loss/reg": 0.25219428539276123, "step": 2000 } ], "logging_steps": 1, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.030493785030656e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }