{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0375, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 2.5e-05, "grad_norm": 0.8990891575813293, "learning_rate": 1.0000000000000002e-06, "loss": 352499.0625, "loss/crossentropy": 2.657993793487549, "loss/hidden": 0.388671875, "loss/logits": 0.10807140171527863, "loss/reg": 352498.5625, "step": 1 }, { "epoch": 5e-05, "grad_norm": 1.0061860084533691, "learning_rate": 2.0000000000000003e-06, "loss": 352499.0312, "loss/crossentropy": 2.5393872261047363, "loss/hidden": 0.375, "loss/logits": 0.09729088842868805, "loss/reg": 352498.5625, "step": 2 }, { "epoch": 7.5e-05, "grad_norm": 1.1664918661117554, "learning_rate": 3e-06, "loss": 352494.0625, "loss/crossentropy": 2.7846217155456543, "loss/hidden": 0.41015625, "loss/logits": 0.12509623169898987, "loss/reg": 352493.53125, "step": 3 }, { "epoch": 0.0001, "grad_norm": 1.007649302482605, "learning_rate": 4.000000000000001e-06, "loss": 352486.8125, "loss/crossentropy": 2.478257656097412, "loss/hidden": 0.38671875, "loss/logits": 0.09696321189403534, "loss/reg": 352486.34375, "step": 4 }, { "epoch": 0.000125, "grad_norm": 1.150974988937378, "learning_rate": 5e-06, "loss": 352475.5938, "loss/crossentropy": 2.636676073074341, "loss/hidden": 0.416015625, "loss/logits": 0.11413601040840149, "loss/reg": 352475.0625, "step": 5 }, { "epoch": 0.00015, "grad_norm": 1.0984797477722168, "learning_rate": 6e-06, "loss": 352463.0, "loss/crossentropy": 2.4723658561706543, "loss/hidden": 0.3984375, "loss/logits": 0.1231672614812851, "loss/reg": 352462.46875, "step": 6 }, { "epoch": 0.000175, "grad_norm": 0.825570285320282, "learning_rate": 7.000000000000001e-06, "loss": 352448.3438, "loss/crossentropy": 2.5369279384613037, "loss/hidden": 0.40625, "loss/logits": 0.10759128630161285, "loss/reg": 352447.84375, "step": 7 }, { "epoch": 0.0002, "grad_norm": 1.4584909677505493, "learning_rate": 8.000000000000001e-06, "loss": 352431.8125, "loss/crossentropy": 2.6420071125030518, "loss/hidden": 0.40234375, "loss/logits": 0.10714638233184814, "loss/reg": 352431.3125, "step": 8 }, { "epoch": 0.000225, "grad_norm": 0.7993799448013306, "learning_rate": 9e-06, "loss": 352412.25, "loss/crossentropy": 2.300945281982422, "loss/hidden": 0.37890625, "loss/logits": 0.0980607345700264, "loss/reg": 352411.78125, "step": 9 }, { "epoch": 0.00025, "grad_norm": 0.8132392168045044, "learning_rate": 1e-05, "loss": 352391.4688, "loss/crossentropy": 2.549692392349243, "loss/hidden": 0.388671875, "loss/logits": 0.11239537596702576, "loss/reg": 352390.96875, "step": 10 }, { "epoch": 0.000275, "grad_norm": 0.7587840557098389, "learning_rate": 1.1000000000000001e-05, "loss": 352369.1875, "loss/crossentropy": 2.391806125640869, "loss/hidden": 0.40234375, "loss/logits": 0.10249912738800049, "loss/reg": 352368.6875, "step": 11 }, { "epoch": 0.0003, "grad_norm": 0.9837542176246643, "learning_rate": 1.2e-05, "loss": 352344.4688, "loss/crossentropy": 2.783439874649048, "loss/hidden": 0.3984375, "loss/logits": 0.11502979695796967, "loss/reg": 352343.96875, "step": 12 }, { "epoch": 0.000325, "grad_norm": 0.7176467776298523, "learning_rate": 1.3000000000000001e-05, "loss": 352317.2812, "loss/crossentropy": 2.546398401260376, "loss/hidden": 0.376953125, "loss/logits": 0.10087086260318756, "loss/reg": 352316.8125, "step": 13 }, { "epoch": 0.00035, "grad_norm": 0.6804006695747375, "learning_rate": 1.4000000000000001e-05, "loss": 352288.7188, "loss/crossentropy": 2.2503833770751953, "loss/hidden": 0.365234375, "loss/logits": 0.08064158260822296, "loss/reg": 352288.28125, "step": 14 }, { "epoch": 0.000375, "grad_norm": 0.7415475845336914, "learning_rate": 1.5e-05, "loss": 352259.4062, "loss/crossentropy": 2.746664047241211, "loss/hidden": 0.388671875, "loss/logits": 0.12547868490219116, "loss/reg": 352258.90625, "step": 15 }, { "epoch": 0.0004, "grad_norm": 0.7663099765777588, "grad_norm_var": 0.044582702584291416, "learning_rate": 1.6000000000000003e-05, "loss": 352227.8125, "loss/crossentropy": 2.6695168018341064, "loss/hidden": 0.37109375, "loss/logits": 0.09486141800880432, "loss/reg": 352227.34375, "step": 16 }, { "epoch": 0.000425, "grad_norm": 0.7551850080490112, "grad_norm_var": 0.04646286949992676, "learning_rate": 1.7000000000000003e-05, "loss": 352193.7812, "loss/crossentropy": 2.4840545654296875, "loss/hidden": 0.3671875, "loss/logits": 0.09224767982959747, "loss/reg": 352193.3125, "step": 17 }, { "epoch": 0.00045, "grad_norm": 0.9829086661338806, "grad_norm_var": 0.04623120069336625, "learning_rate": 1.8e-05, "loss": 352158.9062, "loss/crossentropy": 2.8658523559570312, "loss/hidden": 0.396484375, "loss/logits": 0.10515056550502777, "loss/reg": 352158.40625, "step": 18 }, { "epoch": 0.000475, "grad_norm": 0.7825629711151123, "grad_norm_var": 0.04278355162789619, "learning_rate": 1.9e-05, "loss": 352120.7188, "loss/crossentropy": 2.770911455154419, "loss/hidden": 0.357421875, "loss/logits": 0.08866650611162186, "loss/reg": 352120.28125, "step": 19 }, { "epoch": 0.0005, "grad_norm": 0.7500621676445007, "grad_norm_var": 0.04306775406588148, "learning_rate": 2e-05, "loss": 352080.9688, "loss/crossentropy": 2.654839277267456, "loss/hidden": 0.37109375, "loss/logits": 0.09846656024456024, "loss/reg": 352080.5, "step": 20 }, { "epoch": 0.000525, "grad_norm": 1.1595454216003418, "grad_norm_var": 0.043383044629798855, "learning_rate": 2.1e-05, "loss": 352037.3125, "loss/crossentropy": 2.4238386154174805, "loss/hidden": 0.373046875, "loss/logits": 0.09536640346050262, "loss/reg": 352036.84375, "step": 21 }, { "epoch": 0.00055, "grad_norm": 0.800118625164032, "grad_norm_var": 0.04024005917018294, "learning_rate": 2.2000000000000003e-05, "loss": 351981.875, "loss/crossentropy": 2.3175930976867676, "loss/hidden": 0.34375, "loss/logits": 0.08112605661153793, "loss/reg": 351981.4375, "step": 22 }, { "epoch": 0.000575, "grad_norm": 1.008859395980835, "grad_norm_var": 0.04147464320553755, "learning_rate": 2.3000000000000003e-05, "loss": 351930.5938, "loss/crossentropy": 2.9110066890716553, "loss/hidden": 0.396484375, "loss/logits": 0.10111527144908905, "loss/reg": 351930.09375, "step": 23 }, { "epoch": 0.0006, "grad_norm": 0.6007267832756042, "grad_norm_var": 0.0204320549079418, "learning_rate": 2.4e-05, "loss": 351874.0, "loss/crossentropy": 2.521523952484131, "loss/hidden": 0.341796875, "loss/logits": 0.09783630818128586, "loss/reg": 351873.5625, "step": 24 }, { "epoch": 0.000625, "grad_norm": 0.8861953020095825, "grad_norm_var": 0.020678149542751458, "learning_rate": 2.5e-05, "loss": 351818.5, "loss/crossentropy": 2.551731586456299, "loss/hidden": 0.376953125, "loss/logits": 0.10109081864356995, "loss/reg": 351818.03125, "step": 25 }, { "epoch": 0.00065, "grad_norm": 0.680790364742279, "grad_norm_var": 0.021968847889962718, "learning_rate": 2.6000000000000002e-05, "loss": 351757.6562, "loss/crossentropy": 2.740166664123535, "loss/hidden": 0.373046875, "loss/logits": 0.10807862132787704, "loss/reg": 351757.1875, "step": 26 }, { "epoch": 0.000675, "grad_norm": 0.7792268395423889, "grad_norm_var": 0.0218391159024706, "learning_rate": 2.7000000000000002e-05, "loss": 351695.5312, "loss/crossentropy": 2.7071239948272705, "loss/hidden": 0.369140625, "loss/logits": 0.10798090696334839, "loss/reg": 351695.0625, "step": 27 }, { "epoch": 0.0007, "grad_norm": 0.6987195014953613, "grad_norm_var": 0.02058860603810307, "learning_rate": 2.8000000000000003e-05, "loss": 351627.8438, "loss/crossentropy": 2.501779317855835, "loss/hidden": 0.359375, "loss/logits": 0.09967444092035294, "loss/reg": 351627.375, "step": 28 }, { "epoch": 0.000725, "grad_norm": 0.8250280022621155, "grad_norm_var": 0.020138411660303213, "learning_rate": 2.9e-05, "loss": 351556.5312, "loss/crossentropy": 2.6017117500305176, "loss/hidden": 0.353515625, "loss/logits": 0.09585036337375641, "loss/reg": 351556.09375, "step": 29 }, { "epoch": 0.00075, "grad_norm": 0.7717598080635071, "grad_norm_var": 0.019128449305659424, "learning_rate": 3e-05, "loss": 351480.3125, "loss/crossentropy": 2.6378984451293945, "loss/hidden": 0.380859375, "loss/logits": 0.10243427008390427, "loss/reg": 351479.84375, "step": 30 }, { "epoch": 0.000775, "grad_norm": 0.6966306567192078, "grad_norm_var": 0.019675560702599747, "learning_rate": 3.1e-05, "loss": 351397.9375, "loss/crossentropy": 2.3649041652679443, "loss/hidden": 0.357421875, "loss/logits": 0.09597521275281906, "loss/reg": 351397.46875, "step": 31 }, { "epoch": 0.0008, "grad_norm": 0.664853572845459, "grad_norm_var": 0.02089692070087943, "learning_rate": 3.2000000000000005e-05, "loss": 351315.0312, "loss/crossentropy": 2.5563435554504395, "loss/hidden": 0.353515625, "loss/logits": 0.10334671288728714, "loss/reg": 351314.5625, "step": 32 }, { "epoch": 0.000825, "grad_norm": 0.7842820882797241, "grad_norm_var": 0.02076550256052436, "learning_rate": 3.3e-05, "loss": 351228.1562, "loss/crossentropy": 2.8158535957336426, "loss/hidden": 0.34375, "loss/logits": 0.09285588562488556, "loss/reg": 351227.71875, "step": 33 }, { "epoch": 0.00085, "grad_norm": 0.6544437408447266, "grad_norm_var": 0.01969585228427011, "learning_rate": 3.4000000000000007e-05, "loss": 351137.2812, "loss/crossentropy": 2.7246992588043213, "loss/hidden": 0.369140625, "loss/logits": 0.10069996118545532, "loss/reg": 351136.8125, "step": 34 }, { "epoch": 0.000875, "grad_norm": 0.6708226203918457, "grad_norm_var": 0.02049744993643774, "learning_rate": 3.5e-05, "loss": 351042.1562, "loss/crossentropy": 2.273859739303589, "loss/hidden": 0.34375, "loss/logits": 0.10322189331054688, "loss/reg": 351041.71875, "step": 35 }, { "epoch": 0.0009, "grad_norm": 0.9052221179008484, "grad_norm_var": 0.021444739470696975, "learning_rate": 3.6e-05, "loss": 350939.875, "loss/crossentropy": 2.6857171058654785, "loss/hidden": 0.37109375, "loss/logits": 0.10182373225688934, "loss/reg": 350939.40625, "step": 36 }, { "epoch": 0.000925, "grad_norm": 0.9864970445632935, "grad_norm_var": 0.014713678080535903, "learning_rate": 3.7e-05, "loss": 350830.5, "loss/crossentropy": 2.2562193870544434, "loss/hidden": 0.37109375, "loss/logits": 0.09030164778232574, "loss/reg": 350830.03125, "step": 37 }, { "epoch": 0.00095, "grad_norm": 0.8805357217788696, "grad_norm_var": 0.015377688589989431, "learning_rate": 3.8e-05, "loss": 350718.7812, "loss/crossentropy": 2.299184560775757, "loss/hidden": 0.359375, "loss/logits": 0.10313265025615692, "loss/reg": 350718.3125, "step": 38 }, { "epoch": 0.000975, "grad_norm": 0.786675751209259, "grad_norm_var": 0.01171021529588548, "learning_rate": 3.9000000000000006e-05, "loss": 350606.125, "loss/crossentropy": 2.5615806579589844, "loss/hidden": 0.376953125, "loss/logits": 0.10579188168048859, "loss/reg": 350605.65625, "step": 39 }, { "epoch": 0.001, "grad_norm": 0.6990974545478821, "grad_norm_var": 0.010133824182919622, "learning_rate": 4e-05, "loss": 350486.5, "loss/crossentropy": 2.393359899520874, "loss/hidden": 0.326171875, "loss/logits": 0.08165943622589111, "loss/reg": 350486.09375, "step": 40 }, { "epoch": 0.001025, "grad_norm": 0.7575247287750244, "grad_norm_var": 0.00922957570391792, "learning_rate": 4.1e-05, "loss": 350362.9375, "loss/crossentropy": 2.7492964267730713, "loss/hidden": 0.35546875, "loss/logits": 0.09948307275772095, "loss/reg": 350362.46875, "step": 41 }, { "epoch": 0.00105, "grad_norm": 0.7236489653587341, "grad_norm_var": 0.008862411546386539, "learning_rate": 4.2e-05, "loss": 350231.0625, "loss/crossentropy": 2.4042203426361084, "loss/hidden": 0.34765625, "loss/logits": 0.10723390430212021, "loss/reg": 350230.59375, "step": 42 }, { "epoch": 0.001075, "grad_norm": 0.6667615175247192, "grad_norm_var": 0.009481747856356634, "learning_rate": 4.3e-05, "loss": 350095.2812, "loss/crossentropy": 2.657647132873535, "loss/hidden": 0.35546875, "loss/logits": 0.0933607816696167, "loss/reg": 350094.84375, "step": 43 }, { "epoch": 0.0011, "grad_norm": 0.6898338198661804, "grad_norm_var": 0.009560210933929624, "learning_rate": 4.4000000000000006e-05, "loss": 349956.0, "loss/crossentropy": 2.505267381668091, "loss/hidden": 0.33203125, "loss/logits": 0.09544478356838226, "loss/reg": 349955.5625, "step": 44 }, { "epoch": 0.001125, "grad_norm": 0.7395535111427307, "grad_norm_var": 0.009278307698633626, "learning_rate": 4.5e-05, "loss": 349811.4375, "loss/crossentropy": 2.551318645477295, "loss/hidden": 0.3515625, "loss/logits": 0.1010345071554184, "loss/reg": 349811.0, "step": 45 }, { "epoch": 0.00115, "grad_norm": 0.766717255115509, "grad_norm_var": 0.009268550582984859, "learning_rate": 4.600000000000001e-05, "loss": 349662.0, "loss/crossentropy": 2.4719483852386475, "loss/hidden": 0.349609375, "loss/logits": 0.096840038895607, "loss/reg": 349661.5625, "step": 46 }, { "epoch": 0.001175, "grad_norm": 0.7225788831710815, "grad_norm_var": 0.00911018026195678, "learning_rate": 4.7e-05, "loss": 349505.25, "loss/crossentropy": 2.4699411392211914, "loss/hidden": 0.34375, "loss/logits": 0.09806468337774277, "loss/reg": 349504.8125, "step": 47 }, { "epoch": 0.0012, "grad_norm": 0.8643568158149719, "grad_norm_var": 0.009168173684230396, "learning_rate": 4.8e-05, "loss": 349342.75, "loss/crossentropy": 2.470043182373047, "loss/hidden": 0.328125, "loss/logits": 0.1013718917965889, "loss/reg": 349342.3125, "step": 48 }, { "epoch": 0.001225, "grad_norm": 0.7516047954559326, "grad_norm_var": 0.009166844350170263, "learning_rate": 4.9e-05, "loss": 349173.875, "loss/crossentropy": 2.684551239013672, "loss/hidden": 0.3359375, "loss/logits": 0.10089194774627686, "loss/reg": 349173.4375, "step": 49 }, { "epoch": 0.00125, "grad_norm": 0.7821515202522278, "grad_norm_var": 0.008276118193317084, "learning_rate": 5e-05, "loss": 348998.3438, "loss/crossentropy": 2.414154052734375, "loss/hidden": 0.357421875, "loss/logits": 0.09722993522882462, "loss/reg": 348997.875, "step": 50 }, { "epoch": 0.001275, "grad_norm": 0.6895391941070557, "grad_norm_var": 0.008039034372423481, "learning_rate": 5.1000000000000006e-05, "loss": 348816.4062, "loss/crossentropy": 2.731753349304199, "loss/hidden": 0.330078125, "loss/logits": 0.09635522961616516, "loss/reg": 348815.96875, "step": 51 }, { "epoch": 0.0013, "grad_norm": 0.7811358571052551, "grad_norm_var": 0.006859586362939026, "learning_rate": 5.2000000000000004e-05, "loss": 348628.8438, "loss/crossentropy": 2.4768221378326416, "loss/hidden": 0.34375, "loss/logits": 0.10576320439577103, "loss/reg": 348628.40625, "step": 52 }, { "epoch": 0.001325, "grad_norm": 0.7297627329826355, "grad_norm_var": 0.003500148277329096, "learning_rate": 5.300000000000001e-05, "loss": 348433.0312, "loss/crossentropy": 2.5304179191589355, "loss/hidden": 0.3359375, "loss/logits": 0.0967152938246727, "loss/reg": 348432.59375, "step": 53 }, { "epoch": 0.00135, "grad_norm": 0.7394194006919861, "grad_norm_var": 0.002325683451179339, "learning_rate": 5.4000000000000005e-05, "loss": 348235.5312, "loss/crossentropy": 2.220980644226074, "loss/hidden": 0.330078125, "loss/logits": 0.0968293845653534, "loss/reg": 348235.09375, "step": 54 }, { "epoch": 0.001375, "grad_norm": 0.9957661628723145, "grad_norm_var": 0.0062716166232480814, "learning_rate": 5.500000000000001e-05, "loss": 348031.875, "loss/crossentropy": 2.7074618339538574, "loss/hidden": 0.36328125, "loss/logits": 0.11125057935714722, "loss/reg": 348031.40625, "step": 55 }, { "epoch": 0.0014, "grad_norm": 0.7879313230514526, "grad_norm_var": 0.006088293627237211, "learning_rate": 5.6000000000000006e-05, "loss": 347816.1562, "loss/crossentropy": 2.515113592147827, "loss/hidden": 0.33203125, "loss/logits": 0.07407157868146896, "loss/reg": 347815.75, "step": 56 }, { "epoch": 0.001425, "grad_norm": 0.6987502574920654, "grad_norm_var": 0.006337448092067281, "learning_rate": 5.6999999999999996e-05, "loss": 347603.8125, "loss/crossentropy": 2.336862325668335, "loss/hidden": 0.33203125, "loss/logits": 0.10240699350833893, "loss/reg": 347603.375, "step": 57 }, { "epoch": 0.00145, "grad_norm": 0.7523587942123413, "grad_norm_var": 0.006257107260669119, "learning_rate": 5.8e-05, "loss": 347378.0, "loss/crossentropy": 2.4298152923583984, "loss/hidden": 0.34375, "loss/logits": 0.10168775916099548, "loss/reg": 347377.5625, "step": 58 }, { "epoch": 0.001475, "grad_norm": 0.9260439872741699, "grad_norm_var": 0.007239314549827706, "learning_rate": 5.9e-05, "loss": 347145.7812, "loss/crossentropy": 2.2897913455963135, "loss/hidden": 0.353515625, "loss/logits": 0.1300913393497467, "loss/reg": 347145.3125, "step": 59 }, { "epoch": 0.0015, "grad_norm": 0.9719749689102173, "grad_norm_var": 0.008969534389899995, "learning_rate": 6e-05, "loss": 346908.125, "loss/crossentropy": 2.8708131313323975, "loss/hidden": 0.3515625, "loss/logits": 0.09690976142883301, "loss/reg": 346907.6875, "step": 60 }, { "epoch": 0.001525, "grad_norm": 0.8373459577560425, "grad_norm_var": 0.008860865848953618, "learning_rate": 6.1e-05, "loss": 346664.3125, "loss/crossentropy": 2.327406644821167, "loss/hidden": 0.33984375, "loss/logits": 0.09570501744747162, "loss/reg": 346663.875, "step": 61 }, { "epoch": 0.00155, "grad_norm": 1.3656246662139893, "grad_norm_var": 0.028634012422057358, "learning_rate": 6.2e-05, "loss": 346416.75, "loss/crossentropy": 2.711550712585449, "loss/hidden": 0.369140625, "loss/logits": 0.09897757321596146, "loss/reg": 346416.28125, "step": 62 }, { "epoch": 0.001575, "grad_norm": 0.897807776927948, "grad_norm_var": 0.027873421941307677, "learning_rate": 6.3e-05, "loss": 346158.4688, "loss/crossentropy": 2.5601446628570557, "loss/hidden": 0.365234375, "loss/logits": 0.10523329675197601, "loss/reg": 346158.0, "step": 63 }, { "epoch": 0.0016, "grad_norm": 0.8370752334594727, "grad_norm_var": 0.027861253679391126, "learning_rate": 6.400000000000001e-05, "loss": 345896.7812, "loss/crossentropy": 2.370269536972046, "loss/hidden": 0.3515625, "loss/logits": 0.11418884992599487, "loss/reg": 345896.3125, "step": 64 }, { "epoch": 0.001625, "grad_norm": 0.9750260710716248, "grad_norm_var": 0.02815364448027721, "learning_rate": 6.500000000000001e-05, "loss": 345627.4375, "loss/crossentropy": 2.6104238033294678, "loss/hidden": 0.359375, "loss/logits": 0.10441127419471741, "loss/reg": 345626.96875, "step": 65 }, { "epoch": 0.00165, "grad_norm": 0.8502708077430725, "grad_norm_var": 0.027732216036353386, "learning_rate": 6.6e-05, "loss": 345353.1562, "loss/crossentropy": 2.0436782836914062, "loss/hidden": 0.337890625, "loss/logits": 0.10739766061306, "loss/reg": 345352.71875, "step": 66 }, { "epoch": 0.001675, "grad_norm": 0.9714474081993103, "grad_norm_var": 0.026113840930633226, "learning_rate": 6.7e-05, "loss": 345073.3438, "loss/crossentropy": 2.534754753112793, "loss/hidden": 0.36328125, "loss/logits": 0.09999954700469971, "loss/reg": 345072.875, "step": 67 }, { "epoch": 0.0017, "grad_norm": 0.9867809414863586, "grad_norm_var": 0.02598149193676691, "learning_rate": 6.800000000000001e-05, "loss": 344781.3125, "loss/crossentropy": 2.693114995956421, "loss/hidden": 0.357421875, "loss/logits": 0.10880092531442642, "loss/reg": 344780.84375, "step": 68 }, { "epoch": 0.001725, "grad_norm": 1.0025056600570679, "grad_norm_var": 0.024614116160431364, "learning_rate": 6.9e-05, "loss": 344485.0312, "loss/crossentropy": 2.3547627925872803, "loss/hidden": 0.345703125, "loss/logits": 0.12061820924282074, "loss/reg": 344484.5625, "step": 69 }, { "epoch": 0.00175, "grad_norm": 0.9661253690719604, "grad_norm_var": 0.02260186596820497, "learning_rate": 7e-05, "loss": 344183.5312, "loss/crossentropy": 2.389979124069214, "loss/hidden": 0.365234375, "loss/logits": 0.11400818079710007, "loss/reg": 344183.0625, "step": 70 }, { "epoch": 0.001775, "grad_norm": 0.8837181329727173, "grad_norm_var": 0.022350632813116045, "learning_rate": 7.1e-05, "loss": 343858.7812, "loss/crossentropy": 2.6821513175964355, "loss/hidden": 0.35546875, "loss/logits": 0.09980843961238861, "loss/reg": 343858.3125, "step": 71 }, { "epoch": 0.0018, "grad_norm": 0.943979799747467, "grad_norm_var": 0.021136676300270333, "learning_rate": 7.2e-05, "loss": 343543.0938, "loss/crossentropy": 2.401181697845459, "loss/hidden": 0.3359375, "loss/logits": 0.08882458508014679, "loss/reg": 343542.65625, "step": 72 }, { "epoch": 0.001825, "grad_norm": 0.8156776428222656, "grad_norm_var": 0.018398747204270666, "learning_rate": 7.3e-05, "loss": 343215.3125, "loss/crossentropy": 2.5160043239593506, "loss/hidden": 0.333984375, "loss/logits": 0.10164624452590942, "loss/reg": 343214.875, "step": 73 }, { "epoch": 0.00185, "grad_norm": 0.9953745603561401, "grad_norm_var": 0.016123705970816124, "learning_rate": 7.4e-05, "loss": 342870.4688, "loss/crossentropy": 2.549926280975342, "loss/hidden": 0.357421875, "loss/logits": 0.10379678010940552, "loss/reg": 342870.0, "step": 74 }, { "epoch": 0.001875, "grad_norm": 0.9699572920799255, "grad_norm_var": 0.016094164977751277, "learning_rate": 7.500000000000001e-05, "loss": 342530.2812, "loss/crossentropy": 2.412536144256592, "loss/hidden": 0.35546875, "loss/logits": 0.10453817248344421, "loss/reg": 342529.8125, "step": 75 }, { "epoch": 0.0019, "grad_norm": 1.2801817655563354, "grad_norm_var": 0.022752608592142905, "learning_rate": 7.6e-05, "loss": 342192.3438, "loss/crossentropy": 2.7425031661987305, "loss/hidden": 0.37109375, "loss/logits": 0.14211513102054596, "loss/reg": 342191.84375, "step": 76 }, { "epoch": 0.001925, "grad_norm": 0.8839605450630188, "grad_norm_var": 0.022041054647295215, "learning_rate": 7.7e-05, "loss": 341842.0, "loss/crossentropy": 2.3904454708099365, "loss/hidden": 0.357421875, "loss/logits": 0.10721279680728912, "loss/reg": 341841.53125, "step": 77 }, { "epoch": 0.00195, "grad_norm": 1.2837376594543457, "grad_norm_var": 0.018212612959343667, "learning_rate": 7.800000000000001e-05, "loss": 341472.5625, "loss/crossentropy": 2.8702011108398438, "loss/hidden": 0.36328125, "loss/logits": 0.11912151426076889, "loss/reg": 341472.09375, "step": 78 }, { "epoch": 0.001975, "grad_norm": 1.0258175134658813, "grad_norm_var": 0.017979390728313744, "learning_rate": 7.900000000000001e-05, "loss": 341086.625, "loss/crossentropy": 2.470363140106201, "loss/hidden": 0.34765625, "loss/logits": 0.09680184721946716, "loss/reg": 341086.1875, "step": 79 }, { "epoch": 0.002, "grad_norm": 0.8823785781860352, "grad_norm_var": 0.017247493455519333, "learning_rate": 8e-05, "loss": 340713.0938, "loss/crossentropy": 2.2525322437286377, "loss/hidden": 0.333984375, "loss/logits": 0.10186257213354111, "loss/reg": 340712.65625, "step": 80 }, { "epoch": 0.002025, "grad_norm": 0.8656690120697021, "grad_norm_var": 0.018101116935594005, "learning_rate": 8.1e-05, "loss": 340332.9688, "loss/crossentropy": 2.413992404937744, "loss/hidden": 0.3671875, "loss/logits": 0.12494418025016785, "loss/reg": 340332.46875, "step": 81 }, { "epoch": 0.00205, "grad_norm": 1.462515115737915, "grad_norm_var": 0.03130815801283344, "learning_rate": 8.2e-05, "loss": 339945.3125, "loss/crossentropy": 2.6302881240844727, "loss/hidden": 0.353515625, "loss/logits": 0.12398800253868103, "loss/reg": 339944.84375, "step": 82 }, { "epoch": 0.002075, "grad_norm": 0.816981315612793, "grad_norm_var": 0.03367041332098678, "learning_rate": 8.3e-05, "loss": 339556.2812, "loss/crossentropy": 2.577521562576294, "loss/hidden": 0.3359375, "loss/logits": 0.10806369036436081, "loss/reg": 339555.84375, "step": 83 }, { "epoch": 0.0021, "grad_norm": 1.0367132425308228, "grad_norm_var": 0.03371103591838046, "learning_rate": 8.4e-05, "loss": 339152.4688, "loss/crossentropy": 2.751410484313965, "loss/hidden": 0.37109375, "loss/logits": 0.13619256019592285, "loss/reg": 339151.96875, "step": 84 }, { "epoch": 0.002125, "grad_norm": 0.9986303448677063, "grad_norm_var": 0.03371440316385298, "learning_rate": 8.5e-05, "loss": 338737.25, "loss/crossentropy": 2.4450559616088867, "loss/hidden": 0.35546875, "loss/logits": 0.10456883907318115, "loss/reg": 338736.78125, "step": 85 }, { "epoch": 0.00215, "grad_norm": 1.2683286666870117, "grad_norm_var": 0.03777680309527393, "learning_rate": 8.6e-05, "loss": 338297.5625, "loss/crossentropy": 2.4777944087982178, "loss/hidden": 0.380859375, "loss/logits": 0.11096605658531189, "loss/reg": 338297.0625, "step": 86 }, { "epoch": 0.002175, "grad_norm": 0.8816508054733276, "grad_norm_var": 0.037816248324434316, "learning_rate": 8.7e-05, "loss": 337870.1875, "loss/crossentropy": 2.3346803188323975, "loss/hidden": 0.341796875, "loss/logits": 0.10044071823358536, "loss/reg": 337869.75, "step": 87 }, { "epoch": 0.0022, "grad_norm": 0.974234938621521, "grad_norm_var": 0.037543709145419144, "learning_rate": 8.800000000000001e-05, "loss": 337436.875, "loss/crossentropy": 2.5214929580688477, "loss/hidden": 0.349609375, "loss/logits": 0.10045409202575684, "loss/reg": 337436.4375, "step": 88 }, { "epoch": 0.002225, "grad_norm": 1.0935665369033813, "grad_norm_var": 0.0345174990707664, "learning_rate": 8.900000000000001e-05, "loss": 336995.4688, "loss/crossentropy": 2.659154176712036, "loss/hidden": 0.36328125, "loss/logits": 0.12203093618154526, "loss/reg": 336994.96875, "step": 89 }, { "epoch": 0.00225, "grad_norm": 0.8209539651870728, "grad_norm_var": 0.03757256211988141, "learning_rate": 9e-05, "loss": 336544.0312, "loss/crossentropy": 2.3388657569885254, "loss/hidden": 0.33984375, "loss/logits": 0.0983554944396019, "loss/reg": 336543.59375, "step": 90 }, { "epoch": 0.002275, "grad_norm": 1.0395029783248901, "grad_norm_var": 0.037280256977323664, "learning_rate": 9.1e-05, "loss": 336100.2812, "loss/crossentropy": 2.104595184326172, "loss/hidden": 0.35546875, "loss/logits": 0.10804722458124161, "loss/reg": 336099.8125, "step": 91 }, { "epoch": 0.0023, "grad_norm": 0.9130696654319763, "grad_norm_var": 0.033869955312990745, "learning_rate": 9.200000000000001e-05, "loss": 335647.1562, "loss/crossentropy": 2.1718626022338867, "loss/hidden": 0.341796875, "loss/logits": 0.11622267961502075, "loss/reg": 335646.6875, "step": 92 }, { "epoch": 0.002325, "grad_norm": 0.9693914651870728, "grad_norm_var": 0.03282797555167226, "learning_rate": 9.300000000000001e-05, "loss": 335189.0938, "loss/crossentropy": 2.4374241828918457, "loss/hidden": 0.353515625, "loss/logits": 0.1102052628993988, "loss/reg": 335188.625, "step": 93 }, { "epoch": 0.00235, "grad_norm": 1.4234834909439087, "grad_norm_var": 0.038947392506656735, "learning_rate": 9.4e-05, "loss": 334715.6875, "loss/crossentropy": 2.4830873012542725, "loss/hidden": 0.361328125, "loss/logits": 0.10641457140445709, "loss/reg": 334715.21875, "step": 94 }, { "epoch": 0.002375, "grad_norm": 1.064066767692566, "grad_norm_var": 0.03901976710384876, "learning_rate": 9.5e-05, "loss": 334240.6562, "loss/crossentropy": 2.2354929447174072, "loss/hidden": 0.337890625, "loss/logits": 0.10279420763254166, "loss/reg": 334240.21875, "step": 95 }, { "epoch": 0.0024, "grad_norm": 2.2034084796905518, "grad_norm_var": 0.12174535248636993, "learning_rate": 9.6e-05, "loss": 333756.1875, "loss/crossentropy": 3.0666353702545166, "loss/hidden": 0.43359375, "loss/logits": 0.1290225386619568, "loss/reg": 333755.625, "step": 96 }, { "epoch": 0.002425, "grad_norm": 1.1879992485046387, "grad_norm_var": 0.11754435314787527, "learning_rate": 9.7e-05, "loss": 333257.75, "loss/crossentropy": 2.5517079830169678, "loss/hidden": 0.376953125, "loss/logits": 0.12321121245622635, "loss/reg": 333257.25, "step": 97 }, { "epoch": 0.00245, "grad_norm": 1.6118693351745605, "grad_norm_var": 0.1254674718569584, "learning_rate": 9.8e-05, "loss": 332759.9688, "loss/crossentropy": 2.5265674591064453, "loss/hidden": 0.384765625, "loss/logits": 0.12099769711494446, "loss/reg": 332759.46875, "step": 98 }, { "epoch": 0.002475, "grad_norm": 2.0734384059906006, "grad_norm_var": 0.16935214632967607, "learning_rate": 9.900000000000001e-05, "loss": 332225.625, "loss/crossentropy": 3.1464080810546875, "loss/hidden": 0.416015625, "loss/logits": 0.1275343894958496, "loss/reg": 332225.09375, "step": 99 }, { "epoch": 0.0025, "grad_norm": 1.3410392999649048, "grad_norm_var": 0.16760112767482122, "learning_rate": 0.0001, "loss": 331675.25, "loss/crossentropy": 2.381397247314453, "loss/hidden": 0.359375, "loss/logits": 0.11763107776641846, "loss/reg": 331674.78125, "step": 100 }, { "epoch": 0.002525, "grad_norm": 1.091958999633789, "grad_norm_var": 0.1651227977177586, "learning_rate": 0.0001, "loss": 331125.6875, "loss/crossentropy": 2.41511869430542, "loss/hidden": 0.361328125, "loss/logits": 0.11357417702674866, "loss/reg": 331125.21875, "step": 101 }, { "epoch": 0.00255, "grad_norm": 1.239707112312317, "grad_norm_var": 0.16509402495576694, "learning_rate": 0.0001, "loss": 330589.7188, "loss/crossentropy": 2.671584367752075, "loss/hidden": 0.37109375, "loss/logits": 0.11257486045360565, "loss/reg": 330589.25, "step": 102 }, { "epoch": 0.002575, "grad_norm": 1.2163573503494263, "grad_norm_var": 0.15585436844416117, "learning_rate": 0.0001, "loss": 330047.3438, "loss/crossentropy": 2.4153730869293213, "loss/hidden": 0.3671875, "loss/logits": 0.11950643360614777, "loss/reg": 330046.84375, "step": 103 }, { "epoch": 0.0026, "grad_norm": 1.0298452377319336, "grad_norm_var": 0.15388056810262113, "learning_rate": 0.0001, "loss": 329506.1562, "loss/crossentropy": 2.2123711109161377, "loss/hidden": 0.3515625, "loss/logits": 0.10439261794090271, "loss/reg": 329505.6875, "step": 104 }, { "epoch": 0.002625, "grad_norm": 1.1107213497161865, "grad_norm_var": 0.15349545218275876, "learning_rate": 0.0001, "loss": 328957.9688, "loss/crossentropy": 2.4851179122924805, "loss/hidden": 0.3671875, "loss/logits": 0.10356856882572174, "loss/reg": 328957.5, "step": 105 }, { "epoch": 0.00265, "grad_norm": 1.1000527143478394, "grad_norm_var": 0.14161442987122588, "learning_rate": 0.0001, "loss": 328412.3125, "loss/crossentropy": 2.6760635375976562, "loss/hidden": 0.353515625, "loss/logits": 0.10187040269374847, "loss/reg": 328411.84375, "step": 106 }, { "epoch": 0.002675, "grad_norm": 0.9892934560775757, "grad_norm_var": 0.14343889146716268, "learning_rate": 0.0001, "loss": 327867.8438, "loss/crossentropy": 2.3127148151397705, "loss/hidden": 0.357421875, "loss/logits": 0.12052441388368607, "loss/reg": 327867.375, "step": 107 }, { "epoch": 0.0027, "grad_norm": 1.6295005083084106, "grad_norm_var": 0.13995610376130016, "learning_rate": 0.0001, "loss": 327327.125, "loss/crossentropy": 2.7045395374298096, "loss/hidden": 0.40234375, "loss/logits": 0.1712554693222046, "loss/reg": 327326.5625, "step": 108 }, { "epoch": 0.002725, "grad_norm": 1.1677104234695435, "grad_norm_var": 0.13287532765518548, "learning_rate": 0.0001, "loss": 326764.9688, "loss/crossentropy": 3.237940549850464, "loss/hidden": 0.376953125, "loss/logits": 0.11507049202919006, "loss/reg": 326764.46875, "step": 109 }, { "epoch": 0.00275, "grad_norm": 1.0809372663497925, "grad_norm_var": 0.13651149296486545, "learning_rate": 0.0001, "loss": 326164.9688, "loss/crossentropy": 2.2961301803588867, "loss/hidden": 0.3671875, "loss/logits": 0.12248177826404572, "loss/reg": 326164.46875, "step": 110 }, { "epoch": 0.002775, "grad_norm": 0.9283381104469299, "grad_norm_var": 0.14231480086240134, "learning_rate": 0.0001, "loss": 325608.25, "loss/crossentropy": 2.353752613067627, "loss/hidden": 0.333984375, "loss/logits": 0.09742879867553711, "loss/reg": 325607.8125, "step": 111 }, { "epoch": 0.0028, "grad_norm": 3.4329140186309814, "grad_norm_var": 0.3828229891507322, "learning_rate": 0.0001, "loss": 325060.25, "loss/crossentropy": 2.472973108291626, "loss/hidden": 0.3671875, "loss/logits": 0.10647609084844589, "loss/reg": 325059.78125, "step": 112 }, { "epoch": 0.002825, "grad_norm": 1.4601819515228271, "grad_norm_var": 0.3801412532983319, "learning_rate": 0.0001, "loss": 324513.25, "loss/crossentropy": 2.605860471725464, "loss/hidden": 0.37109375, "loss/logits": 0.09949396550655365, "loss/reg": 324512.78125, "step": 113 }, { "epoch": 0.00285, "grad_norm": 0.9892399907112122, "grad_norm_var": 0.3873205653246254, "learning_rate": 0.0001, "loss": 323942.9375, "loss/crossentropy": 2.413478374481201, "loss/hidden": 0.37109375, "loss/logits": 0.11860661953687668, "loss/reg": 323942.4375, "step": 114 }, { "epoch": 0.002875, "grad_norm": 0.990548849105835, "grad_norm_var": 0.3586952225239992, "learning_rate": 0.0001, "loss": 323375.625, "loss/crossentropy": 2.3325929641723633, "loss/hidden": 0.369140625, "loss/logits": 0.112250916659832, "loss/reg": 323375.15625, "step": 115 }, { "epoch": 0.0029, "grad_norm": 0.9300064444541931, "grad_norm_var": 0.3669996764077588, "learning_rate": 0.0001, "loss": 322812.5938, "loss/crossentropy": 2.345551013946533, "loss/hidden": 0.353515625, "loss/logits": 0.11626957356929779, "loss/reg": 322812.125, "step": 116 }, { "epoch": 0.002925, "grad_norm": 0.8991410732269287, "grad_norm_var": 0.3740087755224611, "learning_rate": 0.0001, "loss": 322255.1875, "loss/crossentropy": 2.345388889312744, "loss/hidden": 0.359375, "loss/logits": 0.12949004769325256, "loss/reg": 322254.6875, "step": 117 }, { "epoch": 0.00295, "grad_norm": 1.031494379043579, "grad_norm_var": 0.37734152840261964, "learning_rate": 0.0001, "loss": 321703.5, "loss/crossentropy": 2.427797555923462, "loss/hidden": 0.3671875, "loss/logits": 0.1204058825969696, "loss/reg": 321703.0, "step": 118 }, { "epoch": 0.002975, "grad_norm": 0.8741577863693237, "grad_norm_var": 0.38615619634775694, "learning_rate": 0.0001, "loss": 321156.4688, "loss/crossentropy": 2.613999605178833, "loss/hidden": 0.359375, "loss/logits": 0.11682362854480743, "loss/reg": 321156.0, "step": 119 }, { "epoch": 0.003, "grad_norm": 0.9654676914215088, "grad_norm_var": 0.3881140201880987, "learning_rate": 0.0001, "loss": 320605.125, "loss/crossentropy": 2.579970121383667, "loss/hidden": 0.359375, "loss/logits": 0.11028732359409332, "loss/reg": 320604.65625, "step": 120 }, { "epoch": 0.003025, "grad_norm": 0.9691014289855957, "grad_norm_var": 0.39150146698907184, "learning_rate": 0.0001, "loss": 320055.25, "loss/crossentropy": 2.3624041080474854, "loss/hidden": 0.3671875, "loss/logits": 0.11543390154838562, "loss/reg": 320054.78125, "step": 121 }, { "epoch": 0.00305, "grad_norm": 1.3653072118759155, "grad_norm_var": 0.39183782310619764, "learning_rate": 0.0001, "loss": 319477.5, "loss/crossentropy": 2.8515052795410156, "loss/hidden": 0.375, "loss/logits": 0.1050575003027916, "loss/reg": 319477.03125, "step": 122 }, { "epoch": 0.003075, "grad_norm": 1.1826074123382568, "grad_norm_var": 0.3879316075394361, "learning_rate": 0.0001, "loss": 318915.0938, "loss/crossentropy": 2.2997782230377197, "loss/hidden": 0.37109375, "loss/logits": 0.1062554344534874, "loss/reg": 318914.625, "step": 123 }, { "epoch": 0.0031, "grad_norm": 1.293823003768921, "grad_norm_var": 0.3776996752167721, "learning_rate": 0.0001, "loss": 318354.0625, "loss/crossentropy": 2.4876811504364014, "loss/hidden": 0.376953125, "loss/logits": 0.12040136754512787, "loss/reg": 318353.5625, "step": 124 }, { "epoch": 0.003125, "grad_norm": 0.9141258001327515, "grad_norm_var": 0.3835733183646012, "learning_rate": 0.0001, "loss": 317789.2812, "loss/crossentropy": 2.641594409942627, "loss/hidden": 0.3515625, "loss/logits": 0.10970079898834229, "loss/reg": 317788.8125, "step": 125 }, { "epoch": 0.00315, "grad_norm": 1.066786289215088, "grad_norm_var": 0.3838231454325568, "learning_rate": 0.0001, "loss": 317230.2812, "loss/crossentropy": 2.4842031002044678, "loss/hidden": 0.375, "loss/logits": 0.09922473877668381, "loss/reg": 317229.8125, "step": 126 }, { "epoch": 0.003175, "grad_norm": 0.8703321814537048, "grad_norm_var": 0.38617957642686856, "learning_rate": 0.0001, "loss": 316661.7188, "loss/crossentropy": 2.226027011871338, "loss/hidden": 0.359375, "loss/logits": 0.10238594561815262, "loss/reg": 316661.25, "step": 127 }, { "epoch": 0.0032, "grad_norm": 0.9229066967964172, "grad_norm_var": 0.03339106383164576, "learning_rate": 0.0001, "loss": 316095.2812, "loss/crossentropy": 2.399365186691284, "loss/hidden": 0.361328125, "loss/logits": 0.10705035924911499, "loss/reg": 316094.8125, "step": 128 }, { "epoch": 0.003225, "grad_norm": 0.8661057949066162, "grad_norm_var": 0.02258822527587553, "learning_rate": 0.0001, "loss": 315522.375, "loss/crossentropy": 2.181664228439331, "loss/hidden": 0.35546875, "loss/logits": 0.10982446372509003, "loss/reg": 315521.90625, "step": 129 }, { "epoch": 0.00325, "grad_norm": 1.0535880327224731, "grad_norm_var": 0.02268437097398235, "learning_rate": 0.0001, "loss": 314946.25, "loss/crossentropy": 2.577036142349243, "loss/hidden": 0.37109375, "loss/logits": 0.11703610420227051, "loss/reg": 314945.75, "step": 130 }, { "epoch": 0.003275, "grad_norm": 1.115898847579956, "grad_norm_var": 0.02330423350459072, "learning_rate": 0.0001, "loss": 314301.0938, "loss/crossentropy": 2.495171308517456, "loss/hidden": 0.361328125, "loss/logits": 0.0935671329498291, "loss/reg": 314300.625, "step": 131 }, { "epoch": 0.0033, "grad_norm": 1.5032321214675903, "grad_norm_var": 0.036958687026436544, "learning_rate": 0.0001, "loss": 313705.0938, "loss/crossentropy": 2.527507781982422, "loss/hidden": 0.361328125, "loss/logits": 0.10602883994579315, "loss/reg": 313704.625, "step": 132 }, { "epoch": 0.003325, "grad_norm": 1.007105827331543, "grad_norm_var": 0.035430911148712066, "learning_rate": 0.0001, "loss": 313114.1875, "loss/crossentropy": 2.235227584838867, "loss/hidden": 0.365234375, "loss/logits": 0.1109972670674324, "loss/reg": 313113.71875, "step": 133 }, { "epoch": 0.00335, "grad_norm": 1.1278802156448364, "grad_norm_var": 0.03561144449842336, "learning_rate": 0.0001, "loss": 312513.6875, "loss/crossentropy": 2.4092252254486084, "loss/hidden": 0.3671875, "loss/logits": 0.11512389779090881, "loss/reg": 312513.21875, "step": 134 }, { "epoch": 0.003375, "grad_norm": 0.9926447868347168, "grad_norm_var": 0.033416226700900546, "learning_rate": 0.0001, "loss": 311928.1562, "loss/crossentropy": 2.5909008979797363, "loss/hidden": 0.37109375, "loss/logits": 0.1105237603187561, "loss/reg": 311927.6875, "step": 135 }, { "epoch": 0.0034, "grad_norm": 1.1020681858062744, "grad_norm_var": 0.0325682488842796, "learning_rate": 0.0001, "loss": 311347.2188, "loss/crossentropy": 2.4427216053009033, "loss/hidden": 0.349609375, "loss/logits": 0.10573369264602661, "loss/reg": 311346.75, "step": 136 }, { "epoch": 0.003425, "grad_norm": 1.4713138341903687, "grad_norm_var": 0.040598215379358274, "learning_rate": 0.0001, "loss": 310768.625, "loss/crossentropy": 2.1684117317199707, "loss/hidden": 0.36328125, "loss/logits": 0.13670936226844788, "loss/reg": 310768.125, "step": 137 }, { "epoch": 0.00345, "grad_norm": 1.02042818069458, "grad_norm_var": 0.03656716460911245, "learning_rate": 0.0001, "loss": 310182.9375, "loss/crossentropy": 2.5576694011688232, "loss/hidden": 0.353515625, "loss/logits": 0.1007159948348999, "loss/reg": 310182.46875, "step": 138 }, { "epoch": 0.003475, "grad_norm": 1.0042083263397217, "grad_norm_var": 0.036458819511781294, "learning_rate": 0.0001, "loss": 309594.6562, "loss/crossentropy": 2.290005683898926, "loss/hidden": 0.376953125, "loss/logits": 0.12406025826931, "loss/reg": 309594.15625, "step": 139 }, { "epoch": 0.0035, "grad_norm": 1.167313814163208, "grad_norm_var": 0.03390765521071438, "learning_rate": 0.0001, "loss": 309012.4062, "loss/crossentropy": 3.0499072074890137, "loss/hidden": 0.3671875, "loss/logits": 0.10719737410545349, "loss/reg": 309011.9375, "step": 140 }, { "epoch": 0.003525, "grad_norm": 1.158155918121338, "grad_norm_var": 0.03238307654376082, "learning_rate": 0.0001, "loss": 308423.6562, "loss/crossentropy": 2.15006422996521, "loss/hidden": 0.3671875, "loss/logits": 0.11542148888111115, "loss/reg": 308423.1875, "step": 141 }, { "epoch": 0.00355, "grad_norm": 1.092081069946289, "grad_norm_var": 0.03234267285948652, "learning_rate": 0.0001, "loss": 307830.7188, "loss/crossentropy": 2.273993730545044, "loss/hidden": 0.390625, "loss/logits": 0.13467177748680115, "loss/reg": 307830.1875, "step": 142 }, { "epoch": 0.003575, "grad_norm": 1.0921188592910767, "grad_norm_var": 0.028855911181609867, "learning_rate": 0.0001, "loss": 307240.7812, "loss/crossentropy": 2.25028395652771, "loss/hidden": 0.36328125, "loss/logits": 0.1119830310344696, "loss/reg": 307240.3125, "step": 143 }, { "epoch": 0.0036, "grad_norm": 1.1455613374710083, "grad_norm_var": 0.026516861915888274, "learning_rate": 0.0001, "loss": 306582.3125, "loss/crossentropy": 2.4379734992980957, "loss/hidden": 0.37890625, "loss/logits": 0.1230136901140213, "loss/reg": 306581.8125, "step": 144 }, { "epoch": 0.003625, "grad_norm": 1.1363896131515503, "grad_norm_var": 0.021933560562692318, "learning_rate": 0.0001, "loss": 305979.9688, "loss/crossentropy": 2.93501353263855, "loss/hidden": 0.37890625, "loss/logits": 0.11584587395191193, "loss/reg": 305979.46875, "step": 145 }, { "epoch": 0.00365, "grad_norm": 1.3462218046188354, "grad_norm_var": 0.024036067985930682, "learning_rate": 0.0001, "loss": 305383.7188, "loss/crossentropy": 2.9245474338531494, "loss/hidden": 0.365234375, "loss/logits": 0.0957048088312149, "loss/reg": 305383.25, "step": 146 }, { "epoch": 0.003675, "grad_norm": 1.6365784406661987, "grad_norm_var": 0.03825433404631227, "learning_rate": 0.0001, "loss": 304787.3125, "loss/crossentropy": 2.446303367614746, "loss/hidden": 0.396484375, "loss/logits": 0.13205978274345398, "loss/reg": 304786.78125, "step": 147 }, { "epoch": 0.0037, "grad_norm": 1.2740546464920044, "grad_norm_var": 0.03189546019524201, "learning_rate": 0.0001, "loss": 304129.375, "loss/crossentropy": 2.645003318786621, "loss/hidden": 0.39453125, "loss/logits": 0.11766268312931061, "loss/reg": 304128.875, "step": 148 }, { "epoch": 0.003725, "grad_norm": 1.23747718334198, "grad_norm_var": 0.030105001627378593, "learning_rate": 0.0001, "loss": 303520.3125, "loss/crossentropy": 2.148782253265381, "loss/hidden": 0.380859375, "loss/logits": 0.11950172483921051, "loss/reg": 303519.8125, "step": 149 }, { "epoch": 0.00375, "grad_norm": 1.1281896829605103, "grad_norm_var": 0.03010253596778695, "learning_rate": 0.0001, "loss": 302918.0625, "loss/crossentropy": 2.431253433227539, "loss/hidden": 0.361328125, "loss/logits": 0.10273518413305283, "loss/reg": 302917.59375, "step": 150 }, { "epoch": 0.003775, "grad_norm": 1.233536720275879, "grad_norm_var": 0.02746115673277192, "learning_rate": 0.0001, "loss": 302312.5, "loss/crossentropy": 2.5889945030212402, "loss/hidden": 0.3828125, "loss/logits": 0.13727445900440216, "loss/reg": 302311.96875, "step": 151 }, { "epoch": 0.0038, "grad_norm": 0.9557918310165405, "grad_norm_var": 0.030764173494984705, "learning_rate": 0.0001, "loss": 301711.2812, "loss/crossentropy": 2.881540298461914, "loss/hidden": 0.375, "loss/logits": 0.09907993674278259, "loss/reg": 301710.8125, "step": 152 }, { "epoch": 0.003825, "grad_norm": 1.6195590496063232, "grad_norm_var": 0.03762476341575312, "learning_rate": 0.0001, "loss": 301111.375, "loss/crossentropy": 2.677419424057007, "loss/hidden": 0.41796875, "loss/logits": 0.1280079483985901, "loss/reg": 301110.84375, "step": 153 }, { "epoch": 0.00385, "grad_norm": 1.2070472240447998, "grad_norm_var": 0.035259098369091586, "learning_rate": 0.0001, "loss": 300519.5938, "loss/crossentropy": 2.0665385723114014, "loss/hidden": 0.388671875, "loss/logits": 0.12192954123020172, "loss/reg": 300519.09375, "step": 154 }, { "epoch": 0.003875, "grad_norm": 1.0260303020477295, "grad_norm_var": 0.03467658118889426, "learning_rate": 0.0001, "loss": 299925.3125, "loss/crossentropy": 2.2250208854675293, "loss/hidden": 0.373046875, "loss/logits": 0.09717711806297302, "loss/reg": 299924.84375, "step": 155 }, { "epoch": 0.0039, "grad_norm": 3.2926387786865234, "grad_norm_var": 0.30319103627804694, "learning_rate": 0.0001, "loss": 299323.6562, "loss/crossentropy": 2.505962610244751, "loss/hidden": 0.376953125, "loss/logits": 0.09269723296165466, "loss/reg": 299323.1875, "step": 156 }, { "epoch": 0.003925, "grad_norm": 1.0687026977539062, "grad_norm_var": 0.30596545543291087, "learning_rate": 0.0001, "loss": 298658.0625, "loss/crossentropy": 2.4630236625671387, "loss/hidden": 0.37109375, "loss/logits": 0.10382035374641418, "loss/reg": 298657.59375, "step": 157 }, { "epoch": 0.00395, "grad_norm": 2.530480146408081, "grad_norm_var": 0.38710687727071186, "learning_rate": 0.0001, "loss": 298049.2188, "loss/crossentropy": 2.357905149459839, "loss/hidden": 0.380859375, "loss/logits": 0.10747890919446945, "loss/reg": 298048.71875, "step": 158 }, { "epoch": 0.003975, "grad_norm": 1.1068466901779175, "grad_norm_var": 0.3864507502154694, "learning_rate": 0.0001, "loss": 297454.375, "loss/crossentropy": 2.54504132270813, "loss/hidden": 0.390625, "loss/logits": 0.1359184831380844, "loss/reg": 297453.84375, "step": 159 }, { "epoch": 0.004, "grad_norm": 1.300596833229065, "grad_norm_var": 0.3819891406164392, "learning_rate": 0.0001, "loss": 296827.9062, "loss/crossentropy": 2.3191466331481934, "loss/hidden": 0.369140625, "loss/logits": 0.10781531035900116, "loss/reg": 296827.4375, "step": 160 }, { "epoch": 0.004025, "grad_norm": 1.139121413230896, "grad_norm_var": 0.38187765086207476, "learning_rate": 0.0001, "loss": 296222.1875, "loss/crossentropy": 2.337054491043091, "loss/hidden": 0.359375, "loss/logits": 0.09755780547857285, "loss/reg": 296221.71875, "step": 161 }, { "epoch": 0.00405, "grad_norm": 0.9172402620315552, "grad_norm_var": 0.398967868486981, "learning_rate": 0.0001, "loss": 295604.0938, "loss/crossentropy": 2.536017417907715, "loss/hidden": 0.36328125, "loss/logits": 0.10009163618087769, "loss/reg": 295603.625, "step": 162 }, { "epoch": 0.004075, "grad_norm": 0.87556391954422, "grad_norm_var": 0.4128959927795121, "learning_rate": 0.0001, "loss": 294997.2188, "loss/crossentropy": 2.4224531650543213, "loss/hidden": 0.35546875, "loss/logits": 0.09989573061466217, "loss/reg": 294996.75, "step": 163 }, { "epoch": 0.0041, "grad_norm": 1.1213515996932983, "grad_norm_var": 0.416297804221395, "learning_rate": 0.0001, "loss": 294385.9688, "loss/crossentropy": 2.5139639377593994, "loss/hidden": 0.353515625, "loss/logits": 0.12164813280105591, "loss/reg": 294385.5, "step": 164 }, { "epoch": 0.004125, "grad_norm": 0.9138302803039551, "grad_norm_var": 0.4281321991307845, "learning_rate": 0.0001, "loss": 293774.9375, "loss/crossentropy": 2.6180176734924316, "loss/hidden": 0.37109375, "loss/logits": 0.11297552287578583, "loss/reg": 293774.46875, "step": 165 }, { "epoch": 0.00415, "grad_norm": 0.8265255093574524, "grad_norm_var": 0.442330460385934, "learning_rate": 0.0001, "loss": 293165.0625, "loss/crossentropy": 2.337454080581665, "loss/hidden": 0.357421875, "loss/logits": 0.11443285644054413, "loss/reg": 293164.59375, "step": 166 }, { "epoch": 0.004175, "grad_norm": 1.0448765754699707, "grad_norm_var": 0.4467533252113644, "learning_rate": 0.0001, "loss": 292557.4375, "loss/crossentropy": 2.405266523361206, "loss/hidden": 0.353515625, "loss/logits": 0.09610580652952194, "loss/reg": 292557.0, "step": 167 }, { "epoch": 0.0042, "grad_norm": 1.2320325374603271, "grad_norm_var": 0.43850816507715845, "learning_rate": 0.0001, "loss": 291948.2188, "loss/crossentropy": 2.1647391319274902, "loss/hidden": 0.373046875, "loss/logits": 0.09476009011268616, "loss/reg": 291947.75, "step": 168 }, { "epoch": 0.004225, "grad_norm": 1.198795199394226, "grad_norm_var": 0.4331266932673277, "learning_rate": 0.0001, "loss": 291335.2188, "loss/crossentropy": 2.5491573810577393, "loss/hidden": 0.36328125, "loss/logits": 0.10695995390415192, "loss/reg": 291334.75, "step": 169 }, { "epoch": 0.00425, "grad_norm": 1.2981737852096558, "grad_norm_var": 0.4325150250770269, "learning_rate": 0.0001, "loss": 290728.0, "loss/crossentropy": 2.8310611248016357, "loss/hidden": 0.376953125, "loss/logits": 0.1097162589430809, "loss/reg": 290727.5, "step": 170 }, { "epoch": 0.004275, "grad_norm": 0.8877711296081543, "grad_norm_var": 0.4388671877783653, "learning_rate": 0.0001, "loss": 290118.9375, "loss/crossentropy": 2.337736129760742, "loss/hidden": 0.361328125, "loss/logits": 0.10747654736042023, "loss/reg": 290118.46875, "step": 171 }, { "epoch": 0.0043, "grad_norm": 1.618420124053955, "grad_norm_var": 0.16860597836987498, "learning_rate": 0.0001, "loss": 289504.875, "loss/crossentropy": 2.511380195617676, "loss/hidden": 0.376953125, "loss/logits": 0.11618395149707794, "loss/reg": 289504.375, "step": 172 }, { "epoch": 0.004325, "grad_norm": 1.0306227207183838, "grad_norm_var": 0.16932527275910872, "learning_rate": 0.0001, "loss": 288894.0, "loss/crossentropy": 2.384922742843628, "loss/hidden": 0.3515625, "loss/logits": 0.11106235533952713, "loss/reg": 288893.53125, "step": 173 }, { "epoch": 0.00435, "grad_norm": 1.1711888313293457, "grad_norm_var": 0.041883230518856604, "learning_rate": 0.0001, "loss": 288282.2812, "loss/crossentropy": 2.558091878890991, "loss/hidden": 0.365234375, "loss/logits": 0.11737295985221863, "loss/reg": 288281.8125, "step": 174 }, { "epoch": 0.004375, "grad_norm": 1.187247395515442, "grad_norm_var": 0.04230506281728168, "learning_rate": 0.0001, "loss": 287670.5, "loss/crossentropy": 2.7218947410583496, "loss/hidden": 0.375, "loss/logits": 0.12176042795181274, "loss/reg": 287670.0, "step": 175 }, { "epoch": 0.0044, "grad_norm": 0.8527270555496216, "grad_norm_var": 0.04347262990668099, "learning_rate": 0.0001, "loss": 287040.0, "loss/crossentropy": 2.3621020317077637, "loss/hidden": 0.3359375, "loss/logits": 0.09604518115520477, "loss/reg": 287039.5625, "step": 176 }, { "epoch": 0.004425, "grad_norm": 1.0103626251220703, "grad_norm_var": 0.0435318982609776, "learning_rate": 0.0001, "loss": 286424.9062, "loss/crossentropy": 2.7167232036590576, "loss/hidden": 0.34375, "loss/logits": 0.09909898042678833, "loss/reg": 286424.46875, "step": 177 }, { "epoch": 0.00445, "grad_norm": 1.8277630805969238, "grad_norm_var": 0.07629581584707286, "learning_rate": 0.0001, "loss": 285803.625, "loss/crossentropy": 2.4057064056396484, "loss/hidden": 0.423828125, "loss/logits": 0.12081070989370346, "loss/reg": 285803.09375, "step": 178 }, { "epoch": 0.004475, "grad_norm": 1.0141959190368652, "grad_norm_var": 0.07277399737205728, "learning_rate": 0.0001, "loss": 285182.7188, "loss/crossentropy": 2.4887070655822754, "loss/hidden": 0.369140625, "loss/logits": 0.10165692120790482, "loss/reg": 285182.25, "step": 179 }, { "epoch": 0.0045, "grad_norm": 1.0775957107543945, "grad_norm_var": 0.07300095477065095, "learning_rate": 0.0001, "loss": 284551.3125, "loss/crossentropy": 2.5259671211242676, "loss/hidden": 0.38671875, "loss/logits": 0.12639477849006653, "loss/reg": 284550.8125, "step": 180 }, { "epoch": 0.004525, "grad_norm": 0.741888701915741, "grad_norm_var": 0.07996517019108704, "learning_rate": 0.0001, "loss": 283933.2812, "loss/crossentropy": 2.320598840713501, "loss/hidden": 0.33984375, "loss/logits": 0.09457631409168243, "loss/reg": 283932.84375, "step": 181 }, { "epoch": 0.00455, "grad_norm": 1.2044017314910889, "grad_norm_var": 0.07378781633814714, "learning_rate": 0.0001, "loss": 283313.875, "loss/crossentropy": 2.6596872806549072, "loss/hidden": 0.365234375, "loss/logits": 0.11112990975379944, "loss/reg": 283313.40625, "step": 182 }, { "epoch": 0.004575, "grad_norm": 1.0499355792999268, "grad_norm_var": 0.07371858828104802, "learning_rate": 0.0001, "loss": 282699.0312, "loss/crossentropy": 2.4527785778045654, "loss/hidden": 0.365234375, "loss/logits": 0.10764066874980927, "loss/reg": 282698.5625, "step": 183 }, { "epoch": 0.0046, "grad_norm": 0.9653274416923523, "grad_norm_var": 0.07525412007987213, "learning_rate": 0.0001, "loss": 282087.6562, "loss/crossentropy": 2.3212976455688477, "loss/hidden": 0.37109375, "loss/logits": 0.1268739551305771, "loss/reg": 282087.15625, "step": 184 }, { "epoch": 0.004625, "grad_norm": 0.8533391952514648, "grad_norm_var": 0.079706512148127, "learning_rate": 0.0001, "loss": 281480.6875, "loss/crossentropy": 2.3971142768859863, "loss/hidden": 0.345703125, "loss/logits": 0.09729812294244766, "loss/reg": 281480.25, "step": 185 }, { "epoch": 0.00465, "grad_norm": 0.9148571491241455, "grad_norm_var": 0.0793712865271918, "learning_rate": 0.0001, "loss": 280867.7188, "loss/crossentropy": 2.2173333168029785, "loss/hidden": 0.333984375, "loss/logits": 0.08855113387107849, "loss/reg": 280867.28125, "step": 186 }, { "epoch": 0.004675, "grad_norm": 0.8931951522827148, "grad_norm_var": 0.07922833522774221, "learning_rate": 0.0001, "loss": 280253.6875, "loss/crossentropy": 2.5096046924591064, "loss/hidden": 0.337890625, "loss/logits": 0.10861176252365112, "loss/reg": 280253.25, "step": 187 }, { "epoch": 0.0047, "grad_norm": 0.9277564883232117, "grad_norm_var": 0.06022543723158867, "learning_rate": 0.0001, "loss": 279635.9062, "loss/crossentropy": 2.5655405521392822, "loss/hidden": 0.359375, "loss/logits": 0.10575452446937561, "loss/reg": 279635.4375, "step": 188 }, { "epoch": 0.004725, "grad_norm": 0.8951659202575684, "grad_norm_var": 0.06163460251059214, "learning_rate": 0.0001, "loss": 279024.125, "loss/crossentropy": 2.498629093170166, "loss/hidden": 0.35546875, "loss/logits": 0.09899453818798065, "loss/reg": 279023.65625, "step": 189 }, { "epoch": 0.00475, "grad_norm": 0.9445829391479492, "grad_norm_var": 0.06078005444577621, "learning_rate": 0.0001, "loss": 278415.8125, "loss/crossentropy": 2.5467264652252197, "loss/hidden": 0.353515625, "loss/logits": 0.09790721535682678, "loss/reg": 278415.375, "step": 190 }, { "epoch": 0.004775, "grad_norm": 1.0030640363693237, "grad_norm_var": 0.058854968365466796, "learning_rate": 0.0001, "loss": 277809.5312, "loss/crossentropy": 2.3620338439941406, "loss/hidden": 0.35546875, "loss/logits": 0.08987599611282349, "loss/reg": 277809.09375, "step": 191 }, { "epoch": 0.0048, "grad_norm": 1.0268861055374146, "grad_norm_var": 0.05707516016780034, "learning_rate": 0.0001, "loss": 277195.1562, "loss/crossentropy": 2.4163973331451416, "loss/hidden": 0.3359375, "loss/logits": 0.10799846798181534, "loss/reg": 277194.71875, "step": 192 }, { "epoch": 0.004825, "grad_norm": 1.0668983459472656, "grad_norm_var": 0.05718799704836983, "learning_rate": 0.0001, "loss": 276591.3125, "loss/crossentropy": 2.436892509460449, "loss/hidden": 0.361328125, "loss/logits": 0.11163410544395447, "loss/reg": 276590.84375, "step": 193 }, { "epoch": 0.00485, "grad_norm": 0.8416240811347961, "grad_norm_var": 0.012472236631485067, "learning_rate": 0.0001, "loss": 275983.6562, "loss/crossentropy": 2.4544501304626465, "loss/hidden": 0.357421875, "loss/logits": 0.09674258530139923, "loss/reg": 275983.1875, "step": 194 }, { "epoch": 0.004875, "grad_norm": 0.9170304536819458, "grad_norm_var": 0.012409338821028607, "learning_rate": 0.0001, "loss": 275381.8438, "loss/crossentropy": 2.3440940380096436, "loss/hidden": 0.345703125, "loss/logits": 0.10958852618932724, "loss/reg": 275381.375, "step": 195 }, { "epoch": 0.0049, "grad_norm": 0.9952634572982788, "grad_norm_var": 0.011517069584062688, "learning_rate": 0.0001, "loss": 274752.875, "loss/crossentropy": 2.655498504638672, "loss/hidden": 0.357421875, "loss/logits": 0.1187039464712143, "loss/reg": 274752.40625, "step": 196 }, { "epoch": 0.004925, "grad_norm": 0.8331409692764282, "grad_norm_var": 0.009474079189952281, "learning_rate": 0.0001, "loss": 274145.5938, "loss/crossentropy": 2.363325357437134, "loss/hidden": 0.34375, "loss/logits": 0.1032324731349945, "loss/reg": 274145.15625, "step": 197 }, { "epoch": 0.00495, "grad_norm": 1.0865329504013062, "grad_norm_var": 0.0064743748957604525, "learning_rate": 0.0001, "loss": 273545.7812, "loss/crossentropy": 2.2604920864105225, "loss/hidden": 0.357421875, "loss/logits": 0.1005285307765007, "loss/reg": 273545.3125, "step": 198 }, { "epoch": 0.004975, "grad_norm": 0.9372029900550842, "grad_norm_var": 0.00578024785383775, "learning_rate": 0.0001, "loss": 272943.7188, "loss/crossentropy": 2.70699143409729, "loss/hidden": 0.357421875, "loss/logits": 0.10466171056032181, "loss/reg": 272943.25, "step": 199 }, { "epoch": 0.005, "grad_norm": 1.295333981513977, "grad_norm_var": 0.013531057663448332, "learning_rate": 0.0001, "loss": 272339.8125, "loss/crossentropy": 2.5356268882751465, "loss/hidden": 0.34375, "loss/logits": 0.09528132528066635, "loss/reg": 272339.375, "step": 200 }, { "epoch": 0.005025, "grad_norm": 0.8280879855155945, "grad_norm_var": 0.013945141975632147, "learning_rate": 0.0001, "loss": 271735.5625, "loss/crossentropy": 2.7488901615142822, "loss/hidden": 0.341796875, "loss/logits": 0.10489709675312042, "loss/reg": 271735.125, "step": 201 }, { "epoch": 0.00505, "grad_norm": 1.2747125625610352, "grad_norm_var": 0.01973283747601891, "learning_rate": 0.0001, "loss": 271125.9062, "loss/crossentropy": 2.9518277645111084, "loss/hidden": 0.375, "loss/logits": 0.124430812895298, "loss/reg": 271125.40625, "step": 202 }, { "epoch": 0.005075, "grad_norm": 1.2745908498764038, "grad_norm_var": 0.024135134657858496, "learning_rate": 0.0001, "loss": 270510.0938, "loss/crossentropy": 2.5481607913970947, "loss/hidden": 0.36328125, "loss/logits": 0.1074289083480835, "loss/reg": 270509.625, "step": 203 }, { "epoch": 0.0051, "grad_norm": 0.9992681741714478, "grad_norm_var": 0.023677797963842655, "learning_rate": 0.0001, "loss": 269877.4688, "loss/crossentropy": 2.6645333766937256, "loss/hidden": 0.357421875, "loss/logits": 0.1071658581495285, "loss/reg": 269877.0, "step": 204 }, { "epoch": 0.005125, "grad_norm": 1.423200011253357, "grad_norm_var": 0.03275789245082638, "learning_rate": 0.0001, "loss": 269255.5625, "loss/crossentropy": 2.802064895629883, "loss/hidden": 0.36328125, "loss/logits": 0.08967677503824234, "loss/reg": 269255.125, "step": 205 }, { "epoch": 0.00515, "grad_norm": 1.1624194383621216, "grad_norm_var": 0.03275731314503016, "learning_rate": 0.0001, "loss": 268633.5938, "loss/crossentropy": 2.4441721439361572, "loss/hidden": 0.38671875, "loss/logits": 0.09957188367843628, "loss/reg": 268633.09375, "step": 206 }, { "epoch": 0.005175, "grad_norm": 0.9844810962677002, "grad_norm_var": 0.032920781672822114, "learning_rate": 0.0001, "loss": 268009.5, "loss/crossentropy": 2.4509806632995605, "loss/hidden": 0.357421875, "loss/logits": 0.09912946820259094, "loss/reg": 268009.03125, "step": 207 }, { "epoch": 0.0052, "grad_norm": 1.0119577646255493, "grad_norm_var": 0.03299896367337376, "learning_rate": 0.0001, "loss": 267392.7188, "loss/crossentropy": 2.3468456268310547, "loss/hidden": 0.361328125, "loss/logits": 0.10302117466926575, "loss/reg": 267392.25, "step": 208 }, { "epoch": 0.005225, "grad_norm": 0.9958096146583557, "grad_norm_var": 0.033232689832916586, "learning_rate": 0.0001, "loss": 266773.6562, "loss/crossentropy": 2.5680363178253174, "loss/hidden": 0.345703125, "loss/logits": 0.10389181971549988, "loss/reg": 266773.21875, "step": 209 }, { "epoch": 0.00525, "grad_norm": 1.0967997312545776, "grad_norm_var": 0.030083708105304044, "learning_rate": 0.0001, "loss": 266151.75, "loss/crossentropy": 2.4047651290893555, "loss/hidden": 0.373046875, "loss/logits": 0.1094539687037468, "loss/reg": 266151.28125, "step": 210 }, { "epoch": 0.005275, "grad_norm": 0.8511112928390503, "grad_norm_var": 0.03169748523245202, "learning_rate": 0.0001, "loss": 265518.3438, "loss/crossentropy": 2.393298387527466, "loss/hidden": 0.330078125, "loss/logits": 0.1038215309381485, "loss/reg": 265517.90625, "step": 211 }, { "epoch": 0.0053, "grad_norm": 1.0173771381378174, "grad_norm_var": 0.031520604359032575, "learning_rate": 0.0001, "loss": 264902.6875, "loss/crossentropy": 2.589514970779419, "loss/hidden": 0.37109375, "loss/logits": 0.10674653202295303, "loss/reg": 264902.21875, "step": 212 }, { "epoch": 0.005325, "grad_norm": 1.0270274877548218, "grad_norm_var": 0.027824445234146098, "learning_rate": 0.0001, "loss": 264265.8125, "loss/crossentropy": 2.455270290374756, "loss/hidden": 0.3515625, "loss/logits": 0.10016101598739624, "loss/reg": 264265.375, "step": 213 }, { "epoch": 0.00535, "grad_norm": 0.952225387096405, "grad_norm_var": 0.02881909636722586, "learning_rate": 0.0001, "loss": 263649.0625, "loss/crossentropy": 2.4784069061279297, "loss/hidden": 0.341796875, "loss/logits": 0.0942714512348175, "loss/reg": 263648.625, "step": 214 }, { "epoch": 0.005375, "grad_norm": 1.1497489213943481, "grad_norm_var": 0.02785863109532749, "learning_rate": 0.0001, "loss": 263029.5312, "loss/crossentropy": 2.5415937900543213, "loss/hidden": 0.365234375, "loss/logits": 0.10113269090652466, "loss/reg": 263029.0625, "step": 215 }, { "epoch": 0.0054, "grad_norm": 1.118820309638977, "grad_norm_var": 0.024832392848142825, "learning_rate": 0.0001, "loss": 262410.8125, "loss/crossentropy": 2.605389356613159, "loss/hidden": 0.359375, "loss/logits": 0.08895421028137207, "loss/reg": 262410.375, "step": 216 }, { "epoch": 0.005425, "grad_norm": 1.3159630298614502, "grad_norm_var": 0.02377871624893686, "learning_rate": 0.0001, "loss": 261791.7344, "loss/crossentropy": 2.860142230987549, "loss/hidden": 0.373046875, "loss/logits": 0.07822292298078537, "loss/reg": 261791.28125, "step": 217 }, { "epoch": 0.00545, "grad_norm": 1.0709484815597534, "grad_norm_var": 0.02172128046532335, "learning_rate": 0.0001, "loss": 261152.7188, "loss/crossentropy": 2.735309600830078, "loss/hidden": 0.357421875, "loss/logits": 0.09882272779941559, "loss/reg": 261152.265625, "step": 218 }, { "epoch": 0.005475, "grad_norm": 0.9251193404197693, "grad_norm_var": 0.020787409451776744, "learning_rate": 0.0001, "loss": 260535.3281, "loss/crossentropy": 2.57698130607605, "loss/hidden": 0.349609375, "loss/logits": 0.10102309286594391, "loss/reg": 260534.875, "step": 219 }, { "epoch": 0.0055, "grad_norm": 1.2007627487182617, "grad_norm_var": 0.021454394906030537, "learning_rate": 0.0001, "loss": 259921.6406, "loss/crossentropy": 2.6678407192230225, "loss/hidden": 0.373046875, "loss/logits": 0.12056195735931396, "loss/reg": 259921.140625, "step": 220 }, { "epoch": 0.005525, "grad_norm": 0.8163579702377319, "grad_norm_var": 0.016821595485903732, "learning_rate": 0.0001, "loss": 259304.5312, "loss/crossentropy": 2.159184217453003, "loss/hidden": 0.3359375, "loss/logits": 0.08772530406713486, "loss/reg": 259304.109375, "step": 221 }, { "epoch": 0.00555, "grad_norm": 1.004481315612793, "grad_norm_var": 0.015877592297527447, "learning_rate": 0.0001, "loss": 258690.1094, "loss/crossentropy": 2.620434045791626, "loss/hidden": 0.35546875, "loss/logits": 0.11029927432537079, "loss/reg": 258689.640625, "step": 222 }, { "epoch": 0.005575, "grad_norm": 1.2820781469345093, "grad_norm_var": 0.01946037259012514, "learning_rate": 0.0001, "loss": 258077.8125, "loss/crossentropy": 2.3812220096588135, "loss/hidden": 0.3671875, "loss/logits": 0.11590490490198135, "loss/reg": 258077.328125, "step": 223 }, { "epoch": 0.0056, "grad_norm": 1.1068248748779297, "grad_norm_var": 0.019512738349682583, "learning_rate": 0.0001, "loss": 257462.875, "loss/crossentropy": 2.499555826187134, "loss/hidden": 0.34375, "loss/logits": 0.10359837114810944, "loss/reg": 257462.421875, "step": 224 }, { "epoch": 0.005625, "grad_norm": 1.0697591304779053, "grad_norm_var": 0.019239198728426683, "learning_rate": 0.0001, "loss": 256847.6719, "loss/crossentropy": 2.564305305480957, "loss/hidden": 0.357421875, "loss/logits": 0.10096286982297897, "loss/reg": 256847.21875, "step": 225 }, { "epoch": 0.00565, "grad_norm": 1.013181209564209, "grad_norm_var": 0.019297556456657598, "learning_rate": 0.0001, "loss": 256187.0781, "loss/crossentropy": 2.682983636856079, "loss/hidden": 0.373046875, "loss/logits": 0.10669070482254028, "loss/reg": 256186.59375, "step": 226 }, { "epoch": 0.005675, "grad_norm": 1.149176836013794, "grad_norm_var": 0.016643495102077507, "learning_rate": 0.0001, "loss": 255567.5156, "loss/crossentropy": 2.8012313842773438, "loss/hidden": 0.37109375, "loss/logits": 0.1071043312549591, "loss/reg": 255567.03125, "step": 227 }, { "epoch": 0.0057, "grad_norm": 0.8703486323356628, "grad_norm_var": 0.019148532368986424, "learning_rate": 0.0001, "loss": 254945.2188, "loss/crossentropy": 2.1328365802764893, "loss/hidden": 0.341796875, "loss/logits": 0.09531014412641525, "loss/reg": 254944.78125, "step": 228 }, { "epoch": 0.005725, "grad_norm": 0.9841340780258179, "grad_norm_var": 0.019492424773756397, "learning_rate": 0.0001, "loss": 254324.0156, "loss/crossentropy": 2.6227455139160156, "loss/hidden": 0.359375, "loss/logits": 0.10204833000898361, "loss/reg": 254323.546875, "step": 229 }, { "epoch": 0.00575, "grad_norm": 0.8693890571594238, "grad_norm_var": 0.02115991781307726, "learning_rate": 0.0001, "loss": 253708.0469, "loss/crossentropy": 2.373120069503784, "loss/hidden": 0.333984375, "loss/logits": 0.1063462570309639, "loss/reg": 253707.609375, "step": 230 }, { "epoch": 0.005775, "grad_norm": 0.8079307675361633, "grad_norm_var": 0.024335258901194465, "learning_rate": 0.0001, "loss": 253093.1562, "loss/crossentropy": 2.293689250946045, "loss/hidden": 0.345703125, "loss/logits": 0.10375718027353287, "loss/reg": 253092.703125, "step": 231 }, { "epoch": 0.0058, "grad_norm": 0.8402238488197327, "grad_norm_var": 0.026177766881752735, "learning_rate": 0.0001, "loss": 252477.5938, "loss/crossentropy": 2.4137558937072754, "loss/hidden": 0.3203125, "loss/logits": 0.09599176049232483, "loss/reg": 252477.171875, "step": 232 }, { "epoch": 0.005825, "grad_norm": 1.118818759918213, "grad_norm_var": 0.020838201426947003, "learning_rate": 0.0001, "loss": 251863.7812, "loss/crossentropy": 2.377997398376465, "loss/hidden": 0.359375, "loss/logits": 0.1122322753071785, "loss/reg": 251863.3125, "step": 233 }, { "epoch": 0.00585, "grad_norm": 1.1597261428833008, "grad_norm_var": 0.02207478026816852, "learning_rate": 0.0001, "loss": 251232.2969, "loss/crossentropy": 2.248708724975586, "loss/hidden": 0.34375, "loss/logits": 0.10326389968395233, "loss/reg": 251231.84375, "step": 234 }, { "epoch": 0.005875, "grad_norm": 0.8862115144729614, "grad_norm_var": 0.022628637096132904, "learning_rate": 0.0001, "loss": 250618.3281, "loss/crossentropy": 2.364410877227783, "loss/hidden": 0.357421875, "loss/logits": 0.09520143270492554, "loss/reg": 250617.875, "step": 235 }, { "epoch": 0.0059, "grad_norm": 2.3220930099487305, "grad_norm_var": 0.12955472788699715, "learning_rate": 0.0001, "loss": 250009.4219, "loss/crossentropy": 2.3695480823516846, "loss/hidden": 0.4140625, "loss/logits": 0.20651775598526, "loss/reg": 250008.796875, "step": 236 }, { "epoch": 0.005925, "grad_norm": 0.8119528293609619, "grad_norm_var": 0.1297115526028732, "learning_rate": 0.0001, "loss": 249394.3438, "loss/crossentropy": 2.531947374343872, "loss/hidden": 0.328125, "loss/logits": 0.10816951841115952, "loss/reg": 249393.90625, "step": 237 }, { "epoch": 0.00595, "grad_norm": 0.8456658124923706, "grad_norm_var": 0.1329087010824144, "learning_rate": 0.0001, "loss": 248792.0156, "loss/crossentropy": 2.1139674186706543, "loss/hidden": 0.341796875, "loss/logits": 0.09102342277765274, "loss/reg": 248791.578125, "step": 238 }, { "epoch": 0.005975, "grad_norm": 1.0733762979507446, "grad_norm_var": 0.12975996092384426, "learning_rate": 0.0001, "loss": 248187.4844, "loss/crossentropy": 2.523265838623047, "loss/hidden": 0.345703125, "loss/logits": 0.10305313020944595, "loss/reg": 248187.03125, "step": 239 }, { "epoch": 0.006, "grad_norm": 1.1087757349014282, "grad_norm_var": 0.129772885642123, "learning_rate": 0.0001, "loss": 247579.5469, "loss/crossentropy": 2.562262535095215, "loss/hidden": 0.35546875, "loss/logits": 0.10083188116550446, "loss/reg": 247579.09375, "step": 240 }, { "epoch": 0.006025, "grad_norm": 1.3459455966949463, "grad_norm_var": 0.13496698854201672, "learning_rate": 0.0001, "loss": 246977.7344, "loss/crossentropy": 2.211017370223999, "loss/hidden": 0.349609375, "loss/logits": 0.09050680696964264, "loss/reg": 246977.296875, "step": 241 }, { "epoch": 0.00605, "grad_norm": 1.1334515810012817, "grad_norm_var": 0.13487275370998758, "learning_rate": 0.0001, "loss": 246389.9375, "loss/crossentropy": 2.424520492553711, "loss/hidden": 0.34375, "loss/logits": 0.09681625664234161, "loss/reg": 246389.5, "step": 242 }, { "epoch": 0.006075, "grad_norm": 0.8859056234359741, "grad_norm_var": 0.13688003263884654, "learning_rate": 0.0001, "loss": 245796.7031, "loss/crossentropy": 2.399360179901123, "loss/hidden": 0.341796875, "loss/logits": 0.10117021203041077, "loss/reg": 245796.265625, "step": 243 }, { "epoch": 0.0061, "grad_norm": 1.0999157428741455, "grad_norm_var": 0.13416995876895163, "learning_rate": 0.0001, "loss": 245189.2188, "loss/crossentropy": 2.5314061641693115, "loss/hidden": 0.3671875, "loss/logits": 0.09776099026203156, "loss/reg": 245188.75, "step": 244 }, { "epoch": 0.006125, "grad_norm": 0.9259365797042847, "grad_norm_var": 0.13513208581838518, "learning_rate": 0.0001, "loss": 244560.7344, "loss/crossentropy": 2.1860015392303467, "loss/hidden": 0.345703125, "loss/logits": 0.09684273600578308, "loss/reg": 244560.296875, "step": 245 }, { "epoch": 0.00615, "grad_norm": 1.2072253227233887, "grad_norm_var": 0.13290427526062487, "learning_rate": 0.0001, "loss": 243912.7031, "loss/crossentropy": 2.7621207237243652, "loss/hidden": 0.359375, "loss/logits": 0.10011032223701477, "loss/reg": 243912.25, "step": 246 }, { "epoch": 0.006175, "grad_norm": 0.9330417513847351, "grad_norm_var": 0.12903841831431506, "learning_rate": 0.0001, "loss": 243308.4375, "loss/crossentropy": 2.5903942584991455, "loss/hidden": 0.341796875, "loss/logits": 0.08663488924503326, "loss/reg": 243308.015625, "step": 247 }, { "epoch": 0.0062, "grad_norm": 0.9449116587638855, "grad_norm_var": 0.12601160933316582, "learning_rate": 0.0001, "loss": 242708.8438, "loss/crossentropy": 2.523956537246704, "loss/hidden": 0.34375, "loss/logits": 0.1024409681558609, "loss/reg": 242708.390625, "step": 248 }, { "epoch": 0.006225, "grad_norm": 0.9360359907150269, "grad_norm_var": 0.12795021034255089, "learning_rate": 0.0001, "loss": 242110.125, "loss/crossentropy": 2.697937488555908, "loss/hidden": 0.34375, "loss/logits": 0.10301453620195389, "loss/reg": 242109.671875, "step": 249 }, { "epoch": 0.00625, "grad_norm": 1.4438672065734863, "grad_norm_var": 0.13521121045655154, "learning_rate": 0.0001, "loss": 241507.6875, "loss/crossentropy": 2.2453622817993164, "loss/hidden": 0.369140625, "loss/logits": 0.15388593077659607, "loss/reg": 241507.171875, "step": 250 }, { "epoch": 0.006275, "grad_norm": 1.023462176322937, "grad_norm_var": 0.13212816249865156, "learning_rate": 0.0001, "loss": 240912.5156, "loss/crossentropy": 2.5658745765686035, "loss/hidden": 0.33203125, "loss/logits": 0.0870145931839943, "loss/reg": 240912.09375, "step": 251 }, { "epoch": 0.0063, "grad_norm": 1.0394799709320068, "grad_norm_var": 0.03066997116386473, "learning_rate": 0.0001, "loss": 240319.1562, "loss/crossentropy": 2.660517930984497, "loss/hidden": 0.369140625, "loss/logits": 0.09973147511482239, "loss/reg": 240318.6875, "step": 252 }, { "epoch": 0.006325, "grad_norm": 0.8985840082168579, "grad_norm_var": 0.028419025472375382, "learning_rate": 0.0001, "loss": 239699.5781, "loss/crossentropy": 2.5660433769226074, "loss/hidden": 0.34375, "loss/logits": 0.10593757778406143, "loss/reg": 239699.125, "step": 253 }, { "epoch": 0.00635, "grad_norm": 1.0081883668899536, "grad_norm_var": 0.025580286139437903, "learning_rate": 0.0001, "loss": 239101.4531, "loss/crossentropy": 2.7612223625183105, "loss/hidden": 0.357421875, "loss/logits": 0.09220311045646667, "loss/reg": 239101.0, "step": 254 }, { "epoch": 0.006375, "grad_norm": 0.9036446809768677, "grad_norm_var": 0.027146159272535185, "learning_rate": 0.0001, "loss": 238506.4844, "loss/crossentropy": 2.500701665878296, "loss/hidden": 0.341796875, "loss/logits": 0.0896071344614029, "loss/reg": 238506.046875, "step": 255 }, { "epoch": 0.0064, "grad_norm": 1.1582329273223877, "grad_norm_var": 0.027670804759285694, "learning_rate": 0.0001, "loss": 237910.5781, "loss/crossentropy": 2.4937164783477783, "loss/hidden": 0.328125, "loss/logits": 0.08801822364330292, "loss/reg": 237910.15625, "step": 256 }, { "epoch": 0.006425, "grad_norm": 0.9264727830886841, "grad_norm_var": 0.02242301018755981, "learning_rate": 0.0001, "loss": 237308.7812, "loss/crossentropy": 2.568125009536743, "loss/hidden": 0.31640625, "loss/logits": 0.07117190212011337, "loss/reg": 237308.390625, "step": 257 }, { "epoch": 0.00645, "grad_norm": 0.9694778323173523, "grad_norm_var": 0.021825782526880135, "learning_rate": 0.0001, "loss": 236710.5625, "loss/crossentropy": 2.389742612838745, "loss/hidden": 0.35546875, "loss/logits": 0.10670766979455948, "loss/reg": 236710.09375, "step": 258 }, { "epoch": 0.006475, "grad_norm": 0.9396619200706482, "grad_norm_var": 0.02105226506656847, "learning_rate": 0.0001, "loss": 236112.0, "loss/crossentropy": 2.7042641639709473, "loss/hidden": 0.34375, "loss/logits": 0.09515395760536194, "loss/reg": 236111.5625, "step": 259 }, { "epoch": 0.0065, "grad_norm": 1.0471324920654297, "grad_norm_var": 0.020680741980486004, "learning_rate": 0.0001, "loss": 235514.3906, "loss/crossentropy": 2.3326146602630615, "loss/hidden": 0.353515625, "loss/logits": 0.11733515560626984, "loss/reg": 235513.921875, "step": 260 }, { "epoch": 0.006525, "grad_norm": 0.9425455331802368, "grad_norm_var": 0.020491703959659387, "learning_rate": 0.0001, "loss": 234854.2656, "loss/crossentropy": 2.465421438217163, "loss/hidden": 0.341796875, "loss/logits": 0.0917583629488945, "loss/reg": 234853.828125, "step": 261 }, { "epoch": 0.00655, "grad_norm": 1.0369884967803955, "grad_norm_var": 0.018056091150610066, "learning_rate": 0.0001, "loss": 234247.1875, "loss/crossentropy": 2.373871326446533, "loss/hidden": 0.328125, "loss/logits": 0.09279663860797882, "loss/reg": 234246.765625, "step": 262 }, { "epoch": 0.006575, "grad_norm": 0.9980863928794861, "grad_norm_var": 0.017657570783222276, "learning_rate": 0.0001, "loss": 233649.6875, "loss/crossentropy": 2.6594350337982178, "loss/hidden": 0.357421875, "loss/logits": 0.09325754642486572, "loss/reg": 233649.234375, "step": 263 }, { "epoch": 0.0066, "grad_norm": 0.9231501221656799, "grad_norm_var": 0.017886320334470678, "learning_rate": 0.0001, "loss": 233047.1406, "loss/crossentropy": 2.5805892944335938, "loss/hidden": 0.333984375, "loss/logits": 0.09794628620147705, "loss/reg": 233046.703125, "step": 264 }, { "epoch": 0.006625, "grad_norm": 1.0291575193405151, "grad_norm_var": 0.017482775408658568, "learning_rate": 0.0001, "loss": 232452.5938, "loss/crossentropy": 2.751997232437134, "loss/hidden": 0.33984375, "loss/logits": 0.09924400597810745, "loss/reg": 232452.15625, "step": 265 }, { "epoch": 0.00665, "grad_norm": 0.9338252544403076, "grad_norm_var": 0.0047809106518514, "learning_rate": 0.0001, "loss": 231850.6875, "loss/crossentropy": 2.4896867275238037, "loss/hidden": 0.36328125, "loss/logits": 0.10598400980234146, "loss/reg": 231850.21875, "step": 266 }, { "epoch": 0.006675, "grad_norm": 0.9800401926040649, "grad_norm_var": 0.004682617947830892, "learning_rate": 0.0001, "loss": 231259.6719, "loss/crossentropy": 2.419142723083496, "loss/hidden": 0.35546875, "loss/logits": 0.10172034800052643, "loss/reg": 231259.21875, "step": 267 }, { "epoch": 0.0067, "grad_norm": 1.3181782960891724, "grad_norm_var": 0.011620460850276437, "learning_rate": 0.0001, "loss": 230605.5781, "loss/crossentropy": 2.649991273880005, "loss/hidden": 0.361328125, "loss/logits": 0.102925606071949, "loss/reg": 230605.109375, "step": 268 }, { "epoch": 0.006725, "grad_norm": 1.3538788557052612, "grad_norm_var": 0.018369025736734025, "learning_rate": 0.0001, "loss": 230004.8594, "loss/crossentropy": 2.4643285274505615, "loss/hidden": 0.34375, "loss/logits": 0.10253345966339111, "loss/reg": 230004.40625, "step": 269 }, { "epoch": 0.00675, "grad_norm": 1.4021903276443481, "grad_norm_var": 0.026962756524618155, "learning_rate": 0.0001, "loss": 229408.0, "loss/crossentropy": 2.2580270767211914, "loss/hidden": 0.34375, "loss/logits": 0.08600744605064392, "loss/reg": 229407.5625, "step": 270 }, { "epoch": 0.006775, "grad_norm": 0.983797013759613, "grad_norm_var": 0.025758329972640683, "learning_rate": 0.0001, "loss": 228810.3281, "loss/crossentropy": 2.394120693206787, "loss/hidden": 0.337890625, "loss/logits": 0.10623958706855774, "loss/reg": 228809.890625, "step": 271 }, { "epoch": 0.0068, "grad_norm": 0.9135763049125671, "grad_norm_var": 0.02625990416445673, "learning_rate": 0.0001, "loss": 228215.3438, "loss/crossentropy": 2.6530239582061768, "loss/hidden": 0.37109375, "loss/logits": 0.10234525799751282, "loss/reg": 228214.875, "step": 272 }, { "epoch": 0.006825, "grad_norm": 0.9926636815071106, "grad_norm_var": 0.025499722066653265, "learning_rate": 0.0001, "loss": 227619.7812, "loss/crossentropy": 2.3669707775115967, "loss/hidden": 0.34375, "loss/logits": 0.09970035403966904, "loss/reg": 227619.34375, "step": 273 }, { "epoch": 0.00685, "grad_norm": 1.1078017950057983, "grad_norm_var": 0.025251574412599117, "learning_rate": 0.0001, "loss": 227035.0469, "loss/crossentropy": 2.6442811489105225, "loss/hidden": 0.34375, "loss/logits": 0.11636050045490265, "loss/reg": 227034.59375, "step": 274 }, { "epoch": 0.006875, "grad_norm": 1.0784547328948975, "grad_norm_var": 0.02429490171527709, "learning_rate": 0.0001, "loss": 226429.125, "loss/crossentropy": 2.456956386566162, "loss/hidden": 0.349609375, "loss/logits": 0.09312474727630615, "loss/reg": 226428.6875, "step": 275 }, { "epoch": 0.0069, "grad_norm": 0.9759635329246521, "grad_norm_var": 0.02478188381783776, "learning_rate": 0.0001, "loss": 225820.25, "loss/crossentropy": 2.6639130115509033, "loss/hidden": 0.34375, "loss/logits": 0.09759189188480377, "loss/reg": 225819.8125, "step": 276 }, { "epoch": 0.006925, "grad_norm": 1.0306111574172974, "grad_norm_var": 0.023879888291922433, "learning_rate": 0.0001, "loss": 225229.0, "loss/crossentropy": 2.3712949752807617, "loss/hidden": 0.34375, "loss/logits": 0.09938957542181015, "loss/reg": 225228.5625, "step": 277 }, { "epoch": 0.00695, "grad_norm": 0.8460413813591003, "grad_norm_var": 0.026901071203976896, "learning_rate": 0.0001, "loss": 224635.5938, "loss/crossentropy": 2.413285255432129, "loss/hidden": 0.359375, "loss/logits": 0.10232223570346832, "loss/reg": 224635.125, "step": 278 }, { "epoch": 0.006975, "grad_norm": 0.9385935664176941, "grad_norm_var": 0.027567505122787835, "learning_rate": 0.0001, "loss": 224046.9062, "loss/crossentropy": 2.5851972103118896, "loss/hidden": 0.345703125, "loss/logits": 0.10651391744613647, "loss/reg": 224046.453125, "step": 279 }, { "epoch": 0.007, "grad_norm": 1.1753954887390137, "grad_norm_var": 0.027261276031140782, "learning_rate": 0.0001, "loss": 223464.125, "loss/crossentropy": 2.621198892593384, "loss/hidden": 0.37109375, "loss/logits": 0.1016470193862915, "loss/reg": 223463.65625, "step": 280 }, { "epoch": 0.007025, "grad_norm": 0.7150670289993286, "grad_norm_var": 0.03498090695509711, "learning_rate": 0.0001, "loss": 222877.25, "loss/crossentropy": 2.3241894245147705, "loss/hidden": 0.330078125, "loss/logits": 0.08896858990192413, "loss/reg": 222876.828125, "step": 281 }, { "epoch": 0.00705, "grad_norm": 0.7701132893562317, "grad_norm_var": 0.039118337071005614, "learning_rate": 0.0001, "loss": 222293.3594, "loss/crossentropy": 2.402475595474243, "loss/hidden": 0.333984375, "loss/logits": 0.08732319623231888, "loss/reg": 222292.9375, "step": 282 }, { "epoch": 0.007075, "grad_norm": 0.8436958193778992, "grad_norm_var": 0.04130473994550788, "learning_rate": 0.0001, "loss": 221710.1719, "loss/crossentropy": 2.3136937618255615, "loss/hidden": 0.341796875, "loss/logits": 0.08763205260038376, "loss/reg": 221709.75, "step": 283 }, { "epoch": 0.0071, "grad_norm": 1.1946781873703003, "grad_norm_var": 0.03747769835346162, "learning_rate": 0.0001, "loss": 221121.3125, "loss/crossentropy": 2.662444829940796, "loss/hidden": 0.34375, "loss/logits": 0.09524821490049362, "loss/reg": 221120.875, "step": 284 }, { "epoch": 0.007125, "grad_norm": 1.2086204290390015, "grad_norm_var": 0.032333006419608526, "learning_rate": 0.0001, "loss": 220536.6875, "loss/crossentropy": 2.615415334701538, "loss/hidden": 0.35546875, "loss/logits": 0.08893512934446335, "loss/reg": 220536.25, "step": 285 }, { "epoch": 0.00715, "grad_norm": 0.9618238210678101, "grad_norm_var": 0.02148886117231328, "learning_rate": 0.0001, "loss": 219940.6406, "loss/crossentropy": 2.6531989574432373, "loss/hidden": 0.353515625, "loss/logits": 0.10557064414024353, "loss/reg": 219940.1875, "step": 286 }, { "epoch": 0.007175, "grad_norm": 1.1346673965454102, "grad_norm_var": 0.022916324895915455, "learning_rate": 0.0001, "loss": 219349.0469, "loss/crossentropy": 2.2792797088623047, "loss/hidden": 0.365234375, "loss/logits": 0.0966588705778122, "loss/reg": 219348.578125, "step": 287 }, { "epoch": 0.0072, "grad_norm": 1.011553406715393, "grad_norm_var": 0.022478923702978902, "learning_rate": 0.0001, "loss": 218766.7188, "loss/crossentropy": 2.4701690673828125, "loss/hidden": 0.35546875, "loss/logits": 0.10134139657020569, "loss/reg": 218766.265625, "step": 288 }, { "epoch": 0.007225, "grad_norm": 1.0193150043487549, "grad_norm_var": 0.02250041335545263, "learning_rate": 0.0001, "loss": 218185.0312, "loss/crossentropy": 2.465268135070801, "loss/hidden": 0.333984375, "loss/logits": 0.09241659939289093, "loss/reg": 218184.609375, "step": 289 }, { "epoch": 0.00725, "grad_norm": 1.0223941802978516, "grad_norm_var": 0.021737527223071717, "learning_rate": 0.0001, "loss": 217599.0938, "loss/crossentropy": 2.7111759185791016, "loss/hidden": 0.330078125, "loss/logits": 0.07478834688663483, "loss/reg": 217598.6875, "step": 290 }, { "epoch": 0.007275, "grad_norm": 0.9252673983573914, "grad_norm_var": 0.02150853476019776, "learning_rate": 0.0001, "loss": 217004.875, "loss/crossentropy": 2.551020860671997, "loss/hidden": 0.3515625, "loss/logits": 0.09227164834737778, "loss/reg": 217004.4375, "step": 291 }, { "epoch": 0.0073, "grad_norm": 1.1825777292251587, "grad_norm_var": 0.02390391973484823, "learning_rate": 0.0001, "loss": 216435.7344, "loss/crossentropy": 2.4061431884765625, "loss/hidden": 0.337890625, "loss/logits": 0.09683596342802048, "loss/reg": 216435.296875, "step": 292 }, { "epoch": 0.007325, "grad_norm": 1.2532509565353394, "grad_norm_var": 0.027946987478518166, "learning_rate": 0.0001, "loss": 215860.2344, "loss/crossentropy": 2.4278337955474854, "loss/hidden": 0.392578125, "loss/logits": 0.11733637750148773, "loss/reg": 215859.71875, "step": 293 }, { "epoch": 0.00735, "grad_norm": 1.0421700477600098, "grad_norm_var": 0.02599317334285066, "learning_rate": 0.0001, "loss": 215285.9531, "loss/crossentropy": 2.326347827911377, "loss/hidden": 0.333984375, "loss/logits": 0.09628565609455109, "loss/reg": 215285.515625, "step": 294 }, { "epoch": 0.007375, "grad_norm": 0.8057188391685486, "grad_norm_var": 0.028626581178707273, "learning_rate": 0.0001, "loss": 214712.9844, "loss/crossentropy": 2.355774402618408, "loss/hidden": 0.34375, "loss/logits": 0.1015491932630539, "loss/reg": 214712.546875, "step": 295 }, { "epoch": 0.0074, "grad_norm": 0.9605076909065247, "grad_norm_var": 0.026964136984593035, "learning_rate": 0.0001, "loss": 214139.4062, "loss/crossentropy": 2.6580722332000732, "loss/hidden": 0.3671875, "loss/logits": 0.10701446235179901, "loss/reg": 214138.9375, "step": 296 }, { "epoch": 0.007425, "grad_norm": 1.0147887468338013, "grad_norm_var": 0.02106352582271733, "learning_rate": 0.0001, "loss": 213561.5625, "loss/crossentropy": 2.488769054412842, "loss/hidden": 0.337890625, "loss/logits": 0.09455054253339767, "loss/reg": 213561.125, "step": 297 }, { "epoch": 0.00745, "grad_norm": 0.9428306818008423, "grad_norm_var": 0.017128520051063786, "learning_rate": 0.0001, "loss": 212931.4375, "loss/crossentropy": 2.7574522495269775, "loss/hidden": 0.333984375, "loss/logits": 0.08713274449110031, "loss/reg": 212931.015625, "step": 298 }, { "epoch": 0.007475, "grad_norm": 0.9792951345443726, "grad_norm_var": 0.014859793940526804, "learning_rate": 0.0001, "loss": 212353.3281, "loss/crossentropy": 2.5566937923431396, "loss/hidden": 0.359375, "loss/logits": 0.08882761001586914, "loss/reg": 212352.875, "step": 299 }, { "epoch": 0.0075, "grad_norm": 0.896796464920044, "grad_norm_var": 0.01431050394401887, "learning_rate": 0.0001, "loss": 211783.0781, "loss/crossentropy": 2.5626933574676514, "loss/hidden": 0.330078125, "loss/logits": 0.09205923229455948, "loss/reg": 211782.65625, "step": 300 }, { "epoch": 0.007525, "grad_norm": 0.7680262923240662, "grad_norm_var": 0.015515187773533511, "learning_rate": 0.0001, "loss": 211217.1875, "loss/crossentropy": 2.431285858154297, "loss/hidden": 0.33203125, "loss/logits": 0.0967300534248352, "loss/reg": 211216.765625, "step": 301 }, { "epoch": 0.00755, "grad_norm": 0.9614930152893066, "grad_norm_var": 0.015516660641321244, "learning_rate": 0.0001, "loss": 210649.875, "loss/crossentropy": 2.1730730533599854, "loss/hidden": 0.33984375, "loss/logits": 0.09347091615200043, "loss/reg": 210649.4375, "step": 302 }, { "epoch": 0.007575, "grad_norm": 1.0022411346435547, "grad_norm_var": 0.014147341758013413, "learning_rate": 0.0001, "loss": 210079.5938, "loss/crossentropy": 2.476670503616333, "loss/hidden": 0.376953125, "loss/logits": 0.11385416984558105, "loss/reg": 210079.109375, "step": 303 }, { "epoch": 0.0076, "grad_norm": 0.8967432975769043, "grad_norm_var": 0.014591702837682874, "learning_rate": 0.0001, "loss": 209458.6562, "loss/crossentropy": 2.4919116497039795, "loss/hidden": 0.33984375, "loss/logits": 0.10090088099241257, "loss/reg": 209458.21875, "step": 304 }, { "epoch": 0.007625, "grad_norm": 1.0569437742233276, "grad_norm_var": 0.014879512513798866, "learning_rate": 0.0001, "loss": 208890.5938, "loss/crossentropy": 2.506409168243408, "loss/hidden": 0.34375, "loss/logits": 0.0949743390083313, "loss/reg": 208890.15625, "step": 305 }, { "epoch": 0.00765, "grad_norm": 0.9816184639930725, "grad_norm_var": 0.014763490776195794, "learning_rate": 0.0001, "loss": 208320.8594, "loss/crossentropy": 2.578554153442383, "loss/hidden": 0.337890625, "loss/logits": 0.10027292370796204, "loss/reg": 208320.421875, "step": 306 }, { "epoch": 0.007675, "grad_norm": 0.8720800876617432, "grad_norm_var": 0.015324127636003245, "learning_rate": 0.0001, "loss": 207765.6406, "loss/crossentropy": 2.460149049758911, "loss/hidden": 0.3203125, "loss/logits": 0.07679739594459534, "loss/reg": 207765.25, "step": 307 }, { "epoch": 0.0077, "grad_norm": 1.0239520072937012, "grad_norm_var": 0.012529052004580344, "learning_rate": 0.0001, "loss": 207164.1094, "loss/crossentropy": 2.575766086578369, "loss/hidden": 0.349609375, "loss/logits": 0.0994313433766365, "loss/reg": 207163.65625, "step": 308 }, { "epoch": 0.007725, "grad_norm": 0.8378457427024841, "grad_norm_var": 0.007412576724656938, "learning_rate": 0.0001, "loss": 206549.0469, "loss/crossentropy": 2.478156566619873, "loss/hidden": 0.33984375, "loss/logits": 0.08868305385112762, "loss/reg": 206548.625, "step": 309 }, { "epoch": 0.00775, "grad_norm": 0.9334253668785095, "grad_norm_var": 0.006673036559606219, "learning_rate": 0.0001, "loss": 205981.6719, "loss/crossentropy": 2.2035024166107178, "loss/hidden": 0.326171875, "loss/logits": 0.09419551491737366, "loss/reg": 205981.25, "step": 310 }, { "epoch": 0.007775, "grad_norm": 0.9910398721694946, "grad_norm_var": 0.0056647394639551, "learning_rate": 0.0001, "loss": 205414.7656, "loss/crossentropy": 2.25825834274292, "loss/hidden": 0.375, "loss/logits": 0.1014915332198143, "loss/reg": 205414.296875, "step": 311 }, { "epoch": 0.0078, "grad_norm": 1.8309675455093384, "grad_norm_var": 0.05482355491819148, "learning_rate": 0.0001, "loss": 204837.3125, "loss/crossentropy": 2.665398120880127, "loss/hidden": 0.388671875, "loss/logits": 0.11154994368553162, "loss/reg": 204836.8125, "step": 312 }, { "epoch": 0.007825, "grad_norm": 1.2233872413635254, "grad_norm_var": 0.05797169042160041, "learning_rate": 0.0001, "loss": 204219.4844, "loss/crossentropy": 2.435774326324463, "loss/hidden": 0.36328125, "loss/logits": 0.09434737265110016, "loss/reg": 204219.03125, "step": 313 }, { "epoch": 0.00785, "grad_norm": 1.1289325952529907, "grad_norm_var": 0.05840960239376898, "learning_rate": 0.0001, "loss": 203649.0625, "loss/crossentropy": 2.789903163909912, "loss/hidden": 0.35546875, "loss/logits": 0.09921029210090637, "loss/reg": 203648.609375, "step": 314 }, { "epoch": 0.007875, "grad_norm": 1.1229325532913208, "grad_norm_var": 0.05884196868796773, "learning_rate": 0.0001, "loss": 203077.5625, "loss/crossentropy": 2.476925849914551, "loss/hidden": 0.359375, "loss/logits": 0.09086183458566666, "loss/reg": 203077.109375, "step": 315 }, { "epoch": 0.0079, "grad_norm": 1.3443208932876587, "grad_norm_var": 0.06323050957540814, "learning_rate": 0.0001, "loss": 202519.1094, "loss/crossentropy": 2.7508223056793213, "loss/hidden": 0.37109375, "loss/logits": 0.09828442335128784, "loss/reg": 202518.640625, "step": 316 }, { "epoch": 0.007925, "grad_norm": 0.928200900554657, "grad_norm_var": 0.05857714402731625, "learning_rate": 0.0001, "loss": 201938.7031, "loss/crossentropy": 2.7406234741210938, "loss/hidden": 0.36328125, "loss/logits": 0.09745270013809204, "loss/reg": 201938.25, "step": 317 }, { "epoch": 0.00795, "grad_norm": 0.7533476948738098, "grad_norm_var": 0.06432425513706654, "learning_rate": 0.0001, "loss": 201374.375, "loss/crossentropy": 2.1193270683288574, "loss/hidden": 0.32421875, "loss/logits": 0.09022863209247589, "loss/reg": 201373.953125, "step": 318 }, { "epoch": 0.007975, "grad_norm": 1.1448767185211182, "grad_norm_var": 0.06453541031427007, "learning_rate": 0.0001, "loss": 200807.0156, "loss/crossentropy": 2.310652494430542, "loss/hidden": 0.34375, "loss/logits": 0.09617799520492554, "loss/reg": 200806.578125, "step": 319 }, { "epoch": 0.008, "grad_norm": 0.9061183333396912, "grad_norm_var": 0.06432819005303253, "learning_rate": 0.0001, "loss": 200243.875, "loss/crossentropy": 2.229034900665283, "loss/hidden": 0.33203125, "loss/logits": 0.08925358951091766, "loss/reg": 200243.453125, "step": 320 }, { "epoch": 0.008025, "grad_norm": 0.8542099595069885, "grad_norm_var": 0.06718233247158024, "learning_rate": 0.0001, "loss": 199683.2188, "loss/crossentropy": 2.2956204414367676, "loss/hidden": 0.32421875, "loss/logits": 0.07856299728155136, "loss/reg": 199682.8125, "step": 321 }, { "epoch": 0.00805, "grad_norm": 1.0101763010025024, "grad_norm_var": 0.06695454165222821, "learning_rate": 0.0001, "loss": 199094.9219, "loss/crossentropy": 2.689578056335449, "loss/hidden": 0.3359375, "loss/logits": 0.09941205382347107, "loss/reg": 199094.484375, "step": 322 }, { "epoch": 0.008075, "grad_norm": 1.1850814819335938, "grad_norm_var": 0.06537643569359085, "learning_rate": 0.0001, "loss": 198532.7031, "loss/crossentropy": 2.731456756591797, "loss/hidden": 0.357421875, "loss/logits": 0.09926258027553558, "loss/reg": 198532.25, "step": 323 }, { "epoch": 0.0081, "grad_norm": 1.215268611907959, "grad_norm_var": 0.06633188906198194, "learning_rate": 0.0001, "loss": 197967.2344, "loss/crossentropy": 3.0295090675354004, "loss/hidden": 0.35546875, "loss/logits": 0.10664241015911102, "loss/reg": 197966.765625, "step": 324 }, { "epoch": 0.008125, "grad_norm": 1.026171326637268, "grad_norm_var": 0.062263808409530684, "learning_rate": 0.0001, "loss": 197410.1719, "loss/crossentropy": 2.5564486980438232, "loss/hidden": 0.3671875, "loss/logits": 0.10696598887443542, "loss/reg": 197409.703125, "step": 325 }, { "epoch": 0.00815, "grad_norm": 0.9427042603492737, "grad_norm_var": 0.062063225036427014, "learning_rate": 0.0001, "loss": 196858.6875, "loss/crossentropy": 2.5432636737823486, "loss/hidden": 0.3359375, "loss/logits": 0.09693843126296997, "loss/reg": 196858.25, "step": 326 }, { "epoch": 0.008175, "grad_norm": 1.0309820175170898, "grad_norm_var": 0.06158008077064516, "learning_rate": 0.0001, "loss": 196308.8594, "loss/crossentropy": 2.2705764770507812, "loss/hidden": 0.34375, "loss/logits": 0.0911949872970581, "loss/reg": 196308.421875, "step": 327 }, { "epoch": 0.0082, "grad_norm": 0.8754981756210327, "grad_norm_var": 0.025895035422288125, "learning_rate": 0.0001, "loss": 195759.7188, "loss/crossentropy": 2.734731674194336, "loss/hidden": 0.34375, "loss/logits": 0.09578876197338104, "loss/reg": 195759.28125, "step": 328 }, { "epoch": 0.008225, "grad_norm": 0.8853642344474792, "grad_norm_var": 0.024918108806193592, "learning_rate": 0.0001, "loss": 195213.2344, "loss/crossentropy": 2.5435054302215576, "loss/hidden": 0.3671875, "loss/logits": 0.10717779397964478, "loss/reg": 195212.765625, "step": 329 }, { "epoch": 0.00825, "grad_norm": 0.7243690490722656, "grad_norm_var": 0.029386820407246182, "learning_rate": 0.0001, "loss": 194668.5938, "loss/crossentropy": 2.6787264347076416, "loss/hidden": 0.330078125, "loss/logits": 0.0956207662820816, "loss/reg": 194668.171875, "step": 330 }, { "epoch": 0.008275, "grad_norm": 0.8909747004508972, "grad_norm_var": 0.028850196653393077, "learning_rate": 0.0001, "loss": 194133.2031, "loss/crossentropy": 2.426013231277466, "loss/hidden": 0.345703125, "loss/logits": 0.09088262915611267, "loss/reg": 194132.765625, "step": 331 }, { "epoch": 0.0083, "grad_norm": 0.932327926158905, "grad_norm_var": 0.019575128143828616, "learning_rate": 0.0001, "loss": 193583.8594, "loss/crossentropy": 2.385460138320923, "loss/hidden": 0.33203125, "loss/logits": 0.08535876125097275, "loss/reg": 193583.4375, "step": 332 }, { "epoch": 0.008325, "grad_norm": 1.1217122077941895, "grad_norm_var": 0.021182687079661245, "learning_rate": 0.0001, "loss": 193025.6562, "loss/crossentropy": 2.676365613937378, "loss/hidden": 0.369140625, "loss/logits": 0.10710591077804565, "loss/reg": 193025.1875, "step": 333 }, { "epoch": 0.00835, "grad_norm": 0.92121422290802, "grad_norm_var": 0.018123849895345677, "learning_rate": 0.0001, "loss": 192477.2656, "loss/crossentropy": 2.4655494689941406, "loss/hidden": 0.357421875, "loss/logits": 0.09714263677597046, "loss/reg": 192476.8125, "step": 334 }, { "epoch": 0.008375, "grad_norm": 1.0069211721420288, "grad_norm_var": 0.01626569057576172, "learning_rate": 0.0001, "loss": 191934.8438, "loss/crossentropy": 2.6041626930236816, "loss/hidden": 0.357421875, "loss/logits": 0.08919402956962585, "loss/reg": 191934.390625, "step": 335 }, { "epoch": 0.0084, "grad_norm": 0.9137717485427856, "grad_norm_var": 0.01620358310926281, "learning_rate": 0.0001, "loss": 191393.7969, "loss/crossentropy": 2.503648042678833, "loss/hidden": 0.3515625, "loss/logits": 0.09979408979415894, "loss/reg": 191393.34375, "step": 336 }, { "epoch": 0.008425, "grad_norm": 1.6518378257751465, "grad_norm_var": 0.043541088119168056, "learning_rate": 0.0001, "loss": 190852.2188, "loss/crossentropy": 2.46239972114563, "loss/hidden": 0.412109375, "loss/logits": 0.13722866773605347, "loss/reg": 190851.671875, "step": 337 }, { "epoch": 0.00845, "grad_norm": 1.1564044952392578, "grad_norm_var": 0.04466845511115949, "learning_rate": 0.0001, "loss": 190309.7969, "loss/crossentropy": 2.662630796432495, "loss/hidden": 0.37109375, "loss/logits": 0.10392837226390839, "loss/reg": 190309.328125, "step": 338 }, { "epoch": 0.008475, "grad_norm": 0.9882319569587708, "grad_norm_var": 0.04302094135640619, "learning_rate": 0.0001, "loss": 189746.9375, "loss/crossentropy": 2.6329383850097656, "loss/hidden": 0.3671875, "loss/logits": 0.0980134829878807, "loss/reg": 189746.46875, "step": 339 }, { "epoch": 0.0085, "grad_norm": 0.9910109043121338, "grad_norm_var": 0.04025769198177695, "learning_rate": 0.0001, "loss": 189200.5781, "loss/crossentropy": 2.8005034923553467, "loss/hidden": 0.36328125, "loss/logits": 0.10183151066303253, "loss/reg": 189200.109375, "step": 340 }, { "epoch": 0.008525, "grad_norm": 0.9753293395042419, "grad_norm_var": 0.040267042628729106, "learning_rate": 0.0001, "loss": 188662.3906, "loss/crossentropy": 2.3686320781707764, "loss/hidden": 0.328125, "loss/logits": 0.08583434671163559, "loss/reg": 188661.984375, "step": 341 }, { "epoch": 0.00855, "grad_norm": 0.961402952671051, "grad_norm_var": 0.04014469928526877, "learning_rate": 0.0001, "loss": 188133.3438, "loss/crossentropy": 2.5603463649749756, "loss/hidden": 0.3359375, "loss/logits": 0.08635769784450531, "loss/reg": 188132.921875, "step": 342 }, { "epoch": 0.008575, "grad_norm": 1.043996810913086, "grad_norm_var": 0.040206082506468434, "learning_rate": 0.0001, "loss": 187566.3281, "loss/crossentropy": 2.4850785732269287, "loss/hidden": 0.326171875, "loss/logits": 0.0760430246591568, "loss/reg": 187565.921875, "step": 343 }, { "epoch": 0.0086, "grad_norm": 0.9029485583305359, "grad_norm_var": 0.03978826045033466, "learning_rate": 0.0001, "loss": 187032.0938, "loss/crossentropy": 2.2597060203552246, "loss/hidden": 0.34375, "loss/logits": 0.10078242421150208, "loss/reg": 187031.65625, "step": 344 }, { "epoch": 0.008625, "grad_norm": 1.0176180601119995, "grad_norm_var": 0.03878524009466933, "learning_rate": 0.0001, "loss": 186498.3906, "loss/crossentropy": 2.4756250381469727, "loss/hidden": 0.359375, "loss/logits": 0.09760268777608871, "loss/reg": 186497.9375, "step": 345 }, { "epoch": 0.00865, "grad_norm": 0.9344642162322998, "grad_norm_var": 0.03347254015602245, "learning_rate": 0.0001, "loss": 185958.5469, "loss/crossentropy": 2.703340768814087, "loss/hidden": 0.349609375, "loss/logits": 0.08978617191314697, "loss/reg": 185958.109375, "step": 346 }, { "epoch": 0.008675, "grad_norm": 0.8480978012084961, "grad_norm_var": 0.03435728661884322, "learning_rate": 0.0001, "loss": 185421.1875, "loss/crossentropy": 2.5134730339050293, "loss/hidden": 0.333984375, "loss/logits": 0.08980560302734375, "loss/reg": 185420.765625, "step": 347 }, { "epoch": 0.0087, "grad_norm": 0.8679680228233337, "grad_norm_var": 0.03539387937272805, "learning_rate": 0.0001, "loss": 184879.4688, "loss/crossentropy": 2.4469387531280518, "loss/hidden": 0.349609375, "loss/logits": 0.0879959762096405, "loss/reg": 184879.03125, "step": 348 }, { "epoch": 0.008725, "grad_norm": 1.590740442276001, "grad_norm_var": 0.05557060158014301, "learning_rate": 0.0001, "loss": 184311.4062, "loss/crossentropy": 2.9886884689331055, "loss/hidden": 0.36328125, "loss/logits": 0.10285791009664536, "loss/reg": 184310.9375, "step": 349 }, { "epoch": 0.00875, "grad_norm": 1.0205501317977905, "grad_norm_var": 0.05450480041651103, "learning_rate": 0.0001, "loss": 183769.2344, "loss/crossentropy": 2.7456204891204834, "loss/hidden": 0.37109375, "loss/logits": 0.10173928737640381, "loss/reg": 183768.765625, "step": 350 }, { "epoch": 0.008775, "grad_norm": 0.8315656781196594, "grad_norm_var": 0.05753804406173935, "learning_rate": 0.0001, "loss": 183234.9531, "loss/crossentropy": 2.419807195663452, "loss/hidden": 0.337890625, "loss/logits": 0.09012450277805328, "loss/reg": 183234.53125, "step": 351 }, { "epoch": 0.0088, "grad_norm": 1.123649001121521, "grad_norm_var": 0.056660911801374364, "learning_rate": 0.0001, "loss": 182707.2656, "loss/crossentropy": 2.7368407249450684, "loss/hidden": 0.3515625, "loss/logits": 0.09511645883321762, "loss/reg": 182706.8125, "step": 352 }, { "epoch": 0.008825, "grad_norm": 0.9307337999343872, "grad_norm_var": 0.03193119714682216, "learning_rate": 0.0001, "loss": 182174.3906, "loss/crossentropy": 2.529085397720337, "loss/hidden": 0.3359375, "loss/logits": 0.07927797734737396, "loss/reg": 182173.96875, "step": 353 }, { "epoch": 0.00885, "grad_norm": 0.8824712634086609, "grad_norm_var": 0.031330232119417885, "learning_rate": 0.0001, "loss": 181640.3438, "loss/crossentropy": 2.314749240875244, "loss/hidden": 0.330078125, "loss/logits": 0.09226064383983612, "loss/reg": 181639.921875, "step": 354 }, { "epoch": 0.008875, "grad_norm": 0.8409653902053833, "grad_norm_var": 0.032807275137196494, "learning_rate": 0.0001, "loss": 181106.4531, "loss/crossentropy": 2.33561635017395, "loss/hidden": 0.34375, "loss/logits": 0.09517761319875717, "loss/reg": 181106.015625, "step": 355 }, { "epoch": 0.0089, "grad_norm": 0.9306114315986633, "grad_norm_var": 0.03298864164169767, "learning_rate": 0.0001, "loss": 180575.0625, "loss/crossentropy": 2.4940664768218994, "loss/hidden": 0.3359375, "loss/logits": 0.09492896497249603, "loss/reg": 180574.625, "step": 356 }, { "epoch": 0.008925, "grad_norm": 0.7635557055473328, "grad_norm_var": 0.035964318367633044, "learning_rate": 0.0001, "loss": 180047.5469, "loss/crossentropy": 2.4428975582122803, "loss/hidden": 0.3359375, "loss/logits": 0.09701186418533325, "loss/reg": 180047.109375, "step": 357 }, { "epoch": 0.00895, "grad_norm": 1.1170213222503662, "grad_norm_var": 0.037336672350648095, "learning_rate": 0.0001, "loss": 179509.2656, "loss/crossentropy": 2.3574512004852295, "loss/hidden": 0.33203125, "loss/logits": 0.11497388780117035, "loss/reg": 179508.8125, "step": 358 }, { "epoch": 0.008975, "grad_norm": 0.7869230508804321, "grad_norm_var": 0.03920273191698739, "learning_rate": 0.0001, "loss": 178985.1719, "loss/crossentropy": 2.291933059692383, "loss/hidden": 0.328125, "loss/logits": 0.08891676366329193, "loss/reg": 178984.75, "step": 359 }, { "epoch": 0.009, "grad_norm": 0.9132789969444275, "grad_norm_var": 0.039128246987934705, "learning_rate": 0.0001, "loss": 178473.2969, "loss/crossentropy": 2.655181884765625, "loss/hidden": 0.359375, "loss/logits": 0.11087474972009659, "loss/reg": 178472.828125, "step": 360 }, { "epoch": 0.009025, "grad_norm": 0.8164347410202026, "grad_norm_var": 0.04017976554790762, "learning_rate": 0.0001, "loss": 177953.1406, "loss/crossentropy": 2.362248659133911, "loss/hidden": 0.3359375, "loss/logits": 0.10053600370883942, "loss/reg": 177952.703125, "step": 361 }, { "epoch": 0.00905, "grad_norm": 1.0977365970611572, "grad_norm_var": 0.041508992287856414, "learning_rate": 0.0001, "loss": 177436.2656, "loss/crossentropy": 2.665569305419922, "loss/hidden": 0.357421875, "loss/logits": 0.09883129596710205, "loss/reg": 177435.8125, "step": 362 }, { "epoch": 0.009075, "grad_norm": 1.1380103826522827, "grad_norm_var": 0.04243092831887196, "learning_rate": 0.0001, "loss": 176886.3906, "loss/crossentropy": 2.4346649646759033, "loss/hidden": 0.361328125, "loss/logits": 0.09544631838798523, "loss/reg": 176885.9375, "step": 363 }, { "epoch": 0.0091, "grad_norm": 1.7445480823516846, "grad_norm_var": 0.07756442404939948, "learning_rate": 0.0001, "loss": 176361.0625, "loss/crossentropy": 2.4675376415252686, "loss/hidden": 0.36328125, "loss/logits": 0.10034389793872833, "loss/reg": 176360.59375, "step": 364 }, { "epoch": 0.009125, "grad_norm": 0.9684613347053528, "grad_norm_var": 0.055494477448874056, "learning_rate": 0.0001, "loss": 175839.0781, "loss/crossentropy": 2.854846239089966, "loss/hidden": 0.37109375, "loss/logits": 0.12111840397119522, "loss/reg": 175838.578125, "step": 365 }, { "epoch": 0.00915, "grad_norm": 0.9467345476150513, "grad_norm_var": 0.055575263489238046, "learning_rate": 0.0001, "loss": 175320.0, "loss/crossentropy": 2.522322654724121, "loss/hidden": 0.333984375, "loss/logits": 0.08145372569561005, "loss/reg": 175319.578125, "step": 366 }, { "epoch": 0.009175, "grad_norm": 1.475239634513855, "grad_norm_var": 0.06791183377343075, "learning_rate": 0.0001, "loss": 174809.4688, "loss/crossentropy": 2.6860809326171875, "loss/hidden": 0.375, "loss/logits": 0.10367526113986969, "loss/reg": 174808.984375, "step": 367 }, { "epoch": 0.0092, "grad_norm": 0.9959954619407654, "grad_norm_var": 0.067332491810451, "learning_rate": 0.0001, "loss": 174250.1875, "loss/crossentropy": 2.5077319145202637, "loss/hidden": 0.341796875, "loss/logits": 0.09400855004787445, "loss/reg": 174249.75, "step": 368 }, { "epoch": 0.009225, "grad_norm": 0.776086151599884, "grad_norm_var": 0.07070489089481526, "learning_rate": 0.0001, "loss": 173736.8906, "loss/crossentropy": 2.2909703254699707, "loss/hidden": 0.330078125, "loss/logits": 0.08630555123090744, "loss/reg": 173736.46875, "step": 369 }, { "epoch": 0.00925, "grad_norm": 0.9044707417488098, "grad_norm_var": 0.07035481726732086, "learning_rate": 0.0001, "loss": 173222.7188, "loss/crossentropy": 2.2393484115600586, "loss/hidden": 0.34375, "loss/logits": 0.09456537663936615, "loss/reg": 173222.28125, "step": 370 }, { "epoch": 0.009275, "grad_norm": 0.9271029233932495, "grad_norm_var": 0.0688369333629584, "learning_rate": 0.0001, "loss": 172693.1875, "loss/crossentropy": 2.1928179264068604, "loss/hidden": 0.357421875, "loss/logits": 0.09735684096813202, "loss/reg": 172692.734375, "step": 371 }, { "epoch": 0.0093, "grad_norm": 1.1868821382522583, "grad_norm_var": 0.06992523411455233, "learning_rate": 0.0001, "loss": 172176.3438, "loss/crossentropy": 2.3856608867645264, "loss/hidden": 0.3359375, "loss/logits": 0.08821407705545425, "loss/reg": 172175.921875, "step": 372 }, { "epoch": 0.009325, "grad_norm": 0.9966070652008057, "grad_norm_var": 0.06488801284089368, "learning_rate": 0.0001, "loss": 171664.9531, "loss/crossentropy": 2.5923867225646973, "loss/hidden": 0.322265625, "loss/logits": 0.08066350221633911, "loss/reg": 171664.546875, "step": 373 }, { "epoch": 0.00935, "grad_norm": 1.5168896913528442, "grad_norm_var": 0.07848294009414543, "learning_rate": 0.0001, "loss": 171161.125, "loss/crossentropy": 2.563632011413574, "loss/hidden": 0.37109375, "loss/logits": 0.09065365791320801, "loss/reg": 171160.65625, "step": 374 }, { "epoch": 0.009375, "grad_norm": 1.0048420429229736, "grad_norm_var": 0.07309627264863429, "learning_rate": 0.0001, "loss": 170612.1406, "loss/crossentropy": 2.9609603881835938, "loss/hidden": 0.3515625, "loss/logits": 0.07979125529527664, "loss/reg": 170611.703125, "step": 375 }, { "epoch": 0.0094, "grad_norm": 1.1617481708526611, "grad_norm_var": 0.07116372542663024, "learning_rate": 0.0001, "loss": 170073.0312, "loss/crossentropy": 2.7246451377868652, "loss/hidden": 0.357421875, "loss/logits": 0.08837760984897614, "loss/reg": 170072.578125, "step": 376 }, { "epoch": 0.009425, "grad_norm": 2.0842902660369873, "grad_norm_var": 0.12308328174345645, "learning_rate": 0.0001, "loss": 169564.4531, "loss/crossentropy": 3.3340728282928467, "loss/hidden": 0.392578125, "loss/logits": 0.09503665566444397, "loss/reg": 169563.96875, "step": 377 }, { "epoch": 0.00945, "grad_norm": 1.0924917459487915, "grad_norm_var": 0.123144523949518, "learning_rate": 0.0001, "loss": 169061.9688, "loss/crossentropy": 2.3122220039367676, "loss/hidden": 0.365234375, "loss/logits": 0.09052115678787231, "loss/reg": 169061.515625, "step": 378 }, { "epoch": 0.009475, "grad_norm": 1.092575192451477, "grad_norm_var": 0.12354321705638256, "learning_rate": 0.0001, "loss": 168547.3281, "loss/crossentropy": 2.7103536128997803, "loss/hidden": 0.373046875, "loss/logits": 0.08894005417823792, "loss/reg": 168546.859375, "step": 379 }, { "epoch": 0.0095, "grad_norm": 0.9879716634750366, "grad_norm_var": 0.10233713040041952, "learning_rate": 0.0001, "loss": 168007.6094, "loss/crossentropy": 2.4375011920928955, "loss/hidden": 0.333984375, "loss/logits": 0.0795358344912529, "loss/reg": 168007.203125, "step": 380 }, { "epoch": 0.009525, "grad_norm": 1.2313728332519531, "grad_norm_var": 0.10091046048275355, "learning_rate": 0.0001, "loss": 167493.8594, "loss/crossentropy": 2.495237350463867, "loss/hidden": 0.3515625, "loss/logits": 0.09075236320495605, "loss/reg": 167493.421875, "step": 381 }, { "epoch": 0.00955, "grad_norm": 1.0618592500686646, "grad_norm_var": 0.09863663992902129, "learning_rate": 0.0001, "loss": 166984.2812, "loss/crossentropy": 2.63665771484375, "loss/hidden": 0.361328125, "loss/logits": 0.0936121791601181, "loss/reg": 166983.828125, "step": 382 }, { "epoch": 0.009575, "grad_norm": 0.8288941979408264, "grad_norm_var": 0.09723720292607822, "learning_rate": 0.0001, "loss": 166477.375, "loss/crossentropy": 2.3199474811553955, "loss/hidden": 0.341796875, "loss/logits": 0.09022307395935059, "loss/reg": 166476.9375, "step": 383 }, { "epoch": 0.0096, "grad_norm": 0.9028894305229187, "grad_norm_var": 0.09926415799220058, "learning_rate": 0.0001, "loss": 165976.7656, "loss/crossentropy": 2.2981746196746826, "loss/hidden": 0.357421875, "loss/logits": 0.09859772026538849, "loss/reg": 165976.3125, "step": 384 }, { "epoch": 0.009625, "grad_norm": 0.8596852421760559, "grad_norm_var": 0.09598108104973425, "learning_rate": 0.0001, "loss": 165479.5469, "loss/crossentropy": 2.7811458110809326, "loss/hidden": 0.39453125, "loss/logits": 0.11124817281961441, "loss/reg": 165479.046875, "step": 385 }, { "epoch": 0.00965, "grad_norm": 0.9104531407356262, "grad_norm_var": 0.09581535998637468, "learning_rate": 0.0001, "loss": 164984.2969, "loss/crossentropy": 2.1067349910736084, "loss/hidden": 0.35546875, "loss/logits": 0.09003035724163055, "loss/reg": 164983.84375, "step": 386 }, { "epoch": 0.009675, "grad_norm": 0.8352228999137878, "grad_norm_var": 0.09864986513955144, "learning_rate": 0.0001, "loss": 164489.4844, "loss/crossentropy": 2.380166530609131, "loss/hidden": 0.337890625, "loss/logits": 0.09342695772647858, "loss/reg": 164489.046875, "step": 387 }, { "epoch": 0.0097, "grad_norm": 0.7623141407966614, "grad_norm_var": 0.10554492279767776, "learning_rate": 0.0001, "loss": 164000.4062, "loss/crossentropy": 2.1462862491607666, "loss/hidden": 0.330078125, "loss/logits": 0.08137300610542297, "loss/reg": 164000.0, "step": 388 }, { "epoch": 0.009725, "grad_norm": 0.8828011155128479, "grad_norm_var": 0.10766734548416322, "learning_rate": 0.0001, "loss": 163505.2031, "loss/crossentropy": 2.5420355796813965, "loss/hidden": 0.333984375, "loss/logits": 0.08073568344116211, "loss/reg": 163504.78125, "step": 389 }, { "epoch": 0.00975, "grad_norm": 1.0450934171676636, "grad_norm_var": 0.09384582994403522, "learning_rate": 0.0001, "loss": 163012.25, "loss/crossentropy": 2.5532455444335938, "loss/hidden": 0.35546875, "loss/logits": 0.0926065444946289, "loss/reg": 163011.796875, "step": 390 }, { "epoch": 0.009775, "grad_norm": 0.9712976813316345, "grad_norm_var": 0.09410261616215033, "learning_rate": 0.0001, "loss": 162519.625, "loss/crossentropy": 2.5090253353118896, "loss/hidden": 0.37109375, "loss/logits": 0.09214738011360168, "loss/reg": 162519.15625, "step": 391 }, { "epoch": 0.0098, "grad_norm": 0.9004305005073547, "grad_norm_var": 0.09428308254190747, "learning_rate": 0.0001, "loss": 162016.3281, "loss/crossentropy": 2.5954947471618652, "loss/hidden": 0.365234375, "loss/logits": 0.09887327998876572, "loss/reg": 162015.859375, "step": 392 }, { "epoch": 0.009825, "grad_norm": 0.979634165763855, "grad_norm_var": 0.014986441797913471, "learning_rate": 0.0001, "loss": 161516.9375, "loss/crossentropy": 2.4483871459960938, "loss/hidden": 0.369140625, "loss/logits": 0.09639716148376465, "loss/reg": 161516.46875, "step": 393 }, { "epoch": 0.00985, "grad_norm": 0.8064605593681335, "grad_norm_var": 0.015011118141572648, "learning_rate": 0.0001, "loss": 161025.2812, "loss/crossentropy": 2.655043363571167, "loss/hidden": 0.357421875, "loss/logits": 0.10133032500743866, "loss/reg": 161024.828125, "step": 394 }, { "epoch": 0.009875, "grad_norm": 0.9056344032287598, "grad_norm_var": 0.013421822749984299, "learning_rate": 0.0001, "loss": 160534.5938, "loss/crossentropy": 2.545508623123169, "loss/hidden": 0.353515625, "loss/logits": 0.0969725251197815, "loss/reg": 160534.140625, "step": 395 }, { "epoch": 0.0099, "grad_norm": 1.2923853397369385, "grad_norm_var": 0.021586792538579876, "learning_rate": 0.0001, "loss": 160045.3281, "loss/crossentropy": 2.465571880340576, "loss/hidden": 0.3828125, "loss/logits": 0.10686793923377991, "loss/reg": 160044.84375, "step": 396 }, { "epoch": 0.009925, "grad_norm": 1.1261005401611328, "grad_norm_var": 0.018309319694934344, "learning_rate": 0.0001, "loss": 159557.3281, "loss/crossentropy": 2.82228684425354, "loss/hidden": 0.365234375, "loss/logits": 0.10036415606737137, "loss/reg": 159556.859375, "step": 397 }, { "epoch": 0.00995, "grad_norm": 1.027556300163269, "grad_norm_var": 0.01783441826447003, "learning_rate": 0.0001, "loss": 159073.2969, "loss/crossentropy": 2.621041774749756, "loss/hidden": 0.357421875, "loss/logits": 0.09349606931209564, "loss/reg": 159072.84375, "step": 398 }, { "epoch": 0.009975, "grad_norm": 1.0642552375793457, "grad_norm_var": 0.017816103035810337, "learning_rate": 0.0001, "loss": 158590.3594, "loss/crossentropy": 2.474287748336792, "loss/hidden": 0.365234375, "loss/logits": 0.10626979172229767, "loss/reg": 158589.890625, "step": 399 }, { "epoch": 0.01, "grad_norm": 1.4468071460723877, "grad_norm_var": 0.03256261720358751, "learning_rate": 0.0001, "loss": 158087.3125, "loss/crossentropy": 2.40484619140625, "loss/hidden": 0.376953125, "loss/logits": 0.10928124189376831, "loss/reg": 158086.828125, "step": 400 }, { "epoch": 0.010025, "grad_norm": 1.073398232460022, "grad_norm_var": 0.031746375115213714, "learning_rate": 0.0001, "loss": 157606.4375, "loss/crossentropy": 2.74391770362854, "loss/hidden": 0.3359375, "loss/logits": 0.0976434051990509, "loss/reg": 157606.0, "step": 401 }, { "epoch": 0.01005, "grad_norm": 0.9355063438415527, "grad_norm_var": 0.031480248395230365, "learning_rate": 0.0001, "loss": 157125.625, "loss/crossentropy": 2.8919923305511475, "loss/hidden": 0.34375, "loss/logits": 0.10159678757190704, "loss/reg": 157125.171875, "step": 402 }, { "epoch": 0.010075, "grad_norm": 1.0578041076660156, "grad_norm_var": 0.029584649326750645, "learning_rate": 0.0001, "loss": 156649.6094, "loss/crossentropy": 2.58225679397583, "loss/hidden": 0.34765625, "loss/logits": 0.10078255087137222, "loss/reg": 156649.15625, "step": 403 }, { "epoch": 0.0101, "grad_norm": 1.1878315210342407, "grad_norm_var": 0.026432017201673298, "learning_rate": 0.0001, "loss": 156176.1406, "loss/crossentropy": 2.4818239212036133, "loss/hidden": 0.37109375, "loss/logits": 0.09668318927288055, "loss/reg": 156175.671875, "step": 404 }, { "epoch": 0.010125, "grad_norm": 0.9575701951980591, "grad_norm_var": 0.02517501771146251, "learning_rate": 0.0001, "loss": 155698.6094, "loss/crossentropy": 2.7240066528320312, "loss/hidden": 0.373046875, "loss/logits": 0.09359210729598999, "loss/reg": 155698.140625, "step": 405 }, { "epoch": 0.01015, "grad_norm": 1.1337554454803467, "grad_norm_var": 0.025624751561605173, "learning_rate": 0.0001, "loss": 155223.0469, "loss/crossentropy": 2.814995050430298, "loss/hidden": 0.361328125, "loss/logits": 0.07844653725624084, "loss/reg": 155222.609375, "step": 406 }, { "epoch": 0.010175, "grad_norm": 1.8449225425720215, "grad_norm_var": 0.06367491192329258, "learning_rate": 0.0001, "loss": 154748.3906, "loss/crossentropy": 2.988466262817383, "loss/hidden": 0.451171875, "loss/logits": 0.14243215322494507, "loss/reg": 154747.796875, "step": 407 }, { "epoch": 0.0102, "grad_norm": 1.2929456233978271, "grad_norm_var": 0.062401523680452785, "learning_rate": 0.0001, "loss": 154275.25, "loss/crossentropy": 2.7575523853302, "loss/hidden": 0.369140625, "loss/logits": 0.08892758190631866, "loss/reg": 154274.796875, "step": 408 }, { "epoch": 0.010225, "grad_norm": 1.2019063234329224, "grad_norm_var": 0.060935676088661225, "learning_rate": 0.0001, "loss": 153802.2656, "loss/crossentropy": 2.486464023590088, "loss/hidden": 0.37890625, "loss/logits": 0.09872272610664368, "loss/reg": 153801.78125, "step": 409 }, { "epoch": 0.01025, "grad_norm": 1.1344937086105347, "grad_norm_var": 0.05275884218374903, "learning_rate": 0.0001, "loss": 153332.4062, "loss/crossentropy": 2.8986072540283203, "loss/hidden": 0.376953125, "loss/logits": 0.11386501789093018, "loss/reg": 153331.921875, "step": 410 }, { "epoch": 0.010275, "grad_norm": 1.3534355163574219, "grad_norm_var": 0.049645825083297034, "learning_rate": 0.0001, "loss": 152866.0156, "loss/crossentropy": 2.475773572921753, "loss/hidden": 0.390625, "loss/logits": 0.09350752830505371, "loss/reg": 152865.53125, "step": 411 }, { "epoch": 0.0103, "grad_norm": 0.7451173663139343, "grad_norm_var": 0.06130727780234874, "learning_rate": 0.0001, "loss": 152397.2031, "loss/crossentropy": 2.3074593544006348, "loss/hidden": 0.3359375, "loss/logits": 0.07472251355648041, "loss/reg": 152396.796875, "step": 412 }, { "epoch": 0.010325, "grad_norm": 1.9421056509017944, "grad_norm_var": 0.09907635194970485, "learning_rate": 0.0001, "loss": 151888.7656, "loss/crossentropy": 2.791627883911133, "loss/hidden": 0.375, "loss/logits": 0.08407286554574966, "loss/reg": 151888.3125, "step": 413 }, { "epoch": 0.01035, "grad_norm": 1.2029465436935425, "grad_norm_var": 0.09667484496340699, "learning_rate": 0.0001, "loss": 151422.8594, "loss/crossentropy": 2.326814651489258, "loss/hidden": 0.373046875, "loss/logits": 0.08045229315757751, "loss/reg": 151422.40625, "step": 414 }, { "epoch": 0.010375, "grad_norm": 0.9819397330284119, "grad_norm_var": 0.09884528804566108, "learning_rate": 0.0001, "loss": 150928.8125, "loss/crossentropy": 2.497159242630005, "loss/hidden": 0.375, "loss/logits": 0.09636622667312622, "loss/reg": 150928.34375, "step": 415 }, { "epoch": 0.0104, "grad_norm": 0.9080580472946167, "grad_norm_var": 0.10057013774897232, "learning_rate": 0.0001, "loss": 150443.7188, "loss/crossentropy": 2.4876914024353027, "loss/hidden": 0.361328125, "loss/logits": 0.08849430084228516, "loss/reg": 150443.265625, "step": 416 }, { "epoch": 0.010425, "grad_norm": 0.7911828756332397, "grad_norm_var": 0.10973268347362262, "learning_rate": 0.0001, "loss": 149973.9688, "loss/crossentropy": 2.4696006774902344, "loss/hidden": 0.35546875, "loss/logits": 0.08970560133457184, "loss/reg": 149973.53125, "step": 417 }, { "epoch": 0.01045, "grad_norm": 1.0096138715744019, "grad_norm_var": 0.10778883463160321, "learning_rate": 0.0001, "loss": 149510.4219, "loss/crossentropy": 2.434112787246704, "loss/hidden": 0.361328125, "loss/logits": 0.09028290212154388, "loss/reg": 149509.96875, "step": 418 }, { "epoch": 0.010475, "grad_norm": 1.0627332925796509, "grad_norm_var": 0.10771556252531594, "learning_rate": 0.0001, "loss": 149044.7969, "loss/crossentropy": 2.982994794845581, "loss/hidden": 0.37890625, "loss/logits": 0.0978410542011261, "loss/reg": 149044.3125, "step": 419 }, { "epoch": 0.0105, "grad_norm": 1.0105903148651123, "grad_norm_var": 0.10930270282818669, "learning_rate": 0.0001, "loss": 148578.8438, "loss/crossentropy": 2.3700225353240967, "loss/hidden": 0.349609375, "loss/logits": 0.09978696703910828, "loss/reg": 148578.390625, "step": 420 }, { "epoch": 0.010525, "grad_norm": 0.9768433570861816, "grad_norm_var": 0.1088035849174147, "learning_rate": 0.0001, "loss": 148120.1875, "loss/crossentropy": 2.602926015853882, "loss/hidden": 0.369140625, "loss/logits": 0.10290530323982239, "loss/reg": 148119.71875, "step": 421 }, { "epoch": 0.01055, "grad_norm": 0.9353519082069397, "grad_norm_var": 0.11201198441522882, "learning_rate": 0.0001, "loss": 147660.4844, "loss/crossentropy": 2.577287435531616, "loss/hidden": 0.341796875, "loss/logits": 0.0870780348777771, "loss/reg": 147660.0625, "step": 422 }, { "epoch": 0.010575, "grad_norm": 1.143856167793274, "grad_norm_var": 0.07773815687167945, "learning_rate": 0.0001, "loss": 147202.5312, "loss/crossentropy": 2.737713575363159, "loss/hidden": 0.392578125, "loss/logits": 0.1179521381855011, "loss/reg": 147202.015625, "step": 423 }, { "epoch": 0.0106, "grad_norm": 0.9985752105712891, "grad_norm_var": 0.07580946258821748, "learning_rate": 0.0001, "loss": 146743.3125, "loss/crossentropy": 2.3523805141448975, "loss/hidden": 0.357421875, "loss/logits": 0.09739204496145248, "loss/reg": 146742.859375, "step": 424 }, { "epoch": 0.010625, "grad_norm": 0.9732621908187866, "grad_norm_var": 0.07558668643125382, "learning_rate": 0.0001, "loss": 146291.2812, "loss/crossentropy": 2.7479147911071777, "loss/hidden": 0.37109375, "loss/logits": 0.10617141425609589, "loss/reg": 146290.796875, "step": 425 }, { "epoch": 0.01065, "grad_norm": 0.9876713752746582, "grad_norm_var": 0.07573274274057457, "learning_rate": 0.0001, "loss": 145810.1719, "loss/crossentropy": 2.486886501312256, "loss/hidden": 0.35546875, "loss/logits": 0.09212806820869446, "loss/reg": 145809.71875, "step": 426 }, { "epoch": 0.010675, "grad_norm": 1.0289918184280396, "grad_norm_var": 0.06978905016892202, "learning_rate": 0.0001, "loss": 145358.7812, "loss/crossentropy": 2.5645861625671387, "loss/hidden": 0.37109375, "loss/logits": 0.09653377532958984, "loss/reg": 145358.3125, "step": 427 }, { "epoch": 0.0107, "grad_norm": 1.1155734062194824, "grad_norm_var": 0.06361928522481192, "learning_rate": 0.0001, "loss": 144907.2188, "loss/crossentropy": 2.862295150756836, "loss/hidden": 0.37109375, "loss/logits": 0.08550693094730377, "loss/reg": 144906.765625, "step": 428 }, { "epoch": 0.010725, "grad_norm": 1.2797330617904663, "grad_norm_var": 0.013739316120510084, "learning_rate": 0.0001, "loss": 144459.2812, "loss/crossentropy": 2.3312559127807617, "loss/hidden": 0.384765625, "loss/logits": 0.1013282760977745, "loss/reg": 144458.796875, "step": 429 }, { "epoch": 0.01075, "grad_norm": 1.258832573890686, "grad_norm_var": 0.015257258359294626, "learning_rate": 0.0001, "loss": 144007.125, "loss/crossentropy": 2.3650715351104736, "loss/hidden": 0.373046875, "loss/logits": 0.09613895416259766, "loss/reg": 144006.65625, "step": 430 }, { "epoch": 0.010775, "grad_norm": 1.4123529195785522, "grad_norm_var": 0.024139284326330317, "learning_rate": 0.0001, "loss": 143544.5938, "loss/crossentropy": 2.725262403488159, "loss/hidden": 0.375, "loss/logits": 0.09534821659326553, "loss/reg": 143544.125, "step": 431 }, { "epoch": 0.0108, "grad_norm": 1.5468213558197021, "grad_norm_var": 0.0370552443916324, "learning_rate": 0.0001, "loss": 143090.5781, "loss/crossentropy": 3.220313549041748, "loss/hidden": 0.388671875, "loss/logits": 0.09105117619037628, "loss/reg": 143090.09375, "step": 432 }, { "epoch": 0.010825, "grad_norm": 0.9015154242515564, "grad_norm_var": 0.03333559805386462, "learning_rate": 0.0001, "loss": 142640.375, "loss/crossentropy": 2.438876152038574, "loss/hidden": 0.357421875, "loss/logits": 0.0950528234243393, "loss/reg": 142639.921875, "step": 433 }, { "epoch": 0.01085, "grad_norm": 0.9043869376182556, "grad_norm_var": 0.035332891465965334, "learning_rate": 0.0001, "loss": 142186.2969, "loss/crossentropy": 2.4754929542541504, "loss/hidden": 0.365234375, "loss/logits": 0.08686907589435577, "loss/reg": 142185.84375, "step": 434 }, { "epoch": 0.010875, "grad_norm": 0.8575437068939209, "grad_norm_var": 0.03887631132086219, "learning_rate": 0.0001, "loss": 141733.2812, "loss/crossentropy": 2.5210773944854736, "loss/hidden": 0.365234375, "loss/logits": 0.09325699508190155, "loss/reg": 141732.828125, "step": 435 }, { "epoch": 0.0109, "grad_norm": 0.992995023727417, "grad_norm_var": 0.03906610899539905, "learning_rate": 0.0001, "loss": 141284.0312, "loss/crossentropy": 2.807164192199707, "loss/hidden": 0.365234375, "loss/logits": 0.0953291729092598, "loss/reg": 141283.578125, "step": 436 }, { "epoch": 0.010925, "grad_norm": 0.9081130623817444, "grad_norm_var": 0.04032633045922462, "learning_rate": 0.0001, "loss": 140836.0781, "loss/crossentropy": 2.515272378921509, "loss/hidden": 0.361328125, "loss/logits": 0.09115155041217804, "loss/reg": 140835.625, "step": 437 }, { "epoch": 0.01095, "grad_norm": 0.8617503643035889, "grad_norm_var": 0.04206330063874663, "learning_rate": 0.0001, "loss": 140398.0156, "loss/crossentropy": 2.615291118621826, "loss/hidden": 0.361328125, "loss/logits": 0.09375904500484467, "loss/reg": 140397.5625, "step": 438 }, { "epoch": 0.010975, "grad_norm": 0.9350782632827759, "grad_norm_var": 0.04282205208356348, "learning_rate": 0.0001, "loss": 139961.625, "loss/crossentropy": 2.4983971118927, "loss/hidden": 0.361328125, "loss/logits": 0.08958140015602112, "loss/reg": 139961.171875, "step": 439 }, { "epoch": 0.011, "grad_norm": 0.9608613848686218, "grad_norm_var": 0.04322082771104944, "learning_rate": 0.0001, "loss": 139524.0156, "loss/crossentropy": 2.2612545490264893, "loss/hidden": 0.33984375, "loss/logits": 0.08482472598552704, "loss/reg": 139523.59375, "step": 440 }, { "epoch": 0.011025, "grad_norm": 0.9255537390708923, "grad_norm_var": 0.043901110954668456, "learning_rate": 0.0001, "loss": 139087.9688, "loss/crossentropy": 2.6385385990142822, "loss/hidden": 0.365234375, "loss/logits": 0.09528161585330963, "loss/reg": 139087.515625, "step": 441 }, { "epoch": 0.01105, "grad_norm": 0.9135704636573792, "grad_norm_var": 0.04490813570542424, "learning_rate": 0.0001, "loss": 138650.1875, "loss/crossentropy": 2.540478467941284, "loss/hidden": 0.34375, "loss/logits": 0.10281309485435486, "loss/reg": 138649.734375, "step": 442 }, { "epoch": 0.011075, "grad_norm": 0.779073178768158, "grad_norm_var": 0.0495195390762743, "learning_rate": 0.0001, "loss": 138217.0625, "loss/crossentropy": 2.3806254863739014, "loss/hidden": 0.330078125, "loss/logits": 0.08512654900550842, "loss/reg": 138216.640625, "step": 443 }, { "epoch": 0.0111, "grad_norm": 1.2005538940429688, "grad_norm_var": 0.050888272553039814, "learning_rate": 0.0001, "loss": 137793.9844, "loss/crossentropy": 2.5137827396392822, "loss/hidden": 0.369140625, "loss/logits": 0.09710798412561417, "loss/reg": 137793.515625, "step": 444 }, { "epoch": 0.011125, "grad_norm": 1.471663236618042, "grad_norm_var": 0.059327554212960584, "learning_rate": 0.0001, "loss": 137320.3906, "loss/crossentropy": 2.5025644302368164, "loss/hidden": 0.384765625, "loss/logits": 0.10144961625337601, "loss/reg": 137319.90625, "step": 445 }, { "epoch": 0.01115, "grad_norm": 1.9600776433944702, "grad_norm_var": 0.10940810263046583, "learning_rate": 0.0001, "loss": 136887.5156, "loss/crossentropy": 2.6639795303344727, "loss/hidden": 0.38671875, "loss/logits": 0.10284031182527542, "loss/reg": 136887.03125, "step": 446 }, { "epoch": 0.011175, "grad_norm": 1.396752119064331, "grad_norm_var": 0.10876473470327969, "learning_rate": 0.0001, "loss": 136455.7344, "loss/crossentropy": 2.8369083404541016, "loss/hidden": 0.392578125, "loss/logits": 0.1023794487118721, "loss/reg": 136455.234375, "step": 447 }, { "epoch": 0.0112, "grad_norm": 1.3984694480895996, "grad_norm_var": 0.10119854931983961, "learning_rate": 0.0001, "loss": 136018.125, "loss/crossentropy": 2.540158271789551, "loss/hidden": 0.34375, "loss/logits": 0.08261817693710327, "loss/reg": 136017.703125, "step": 448 }, { "epoch": 0.011225, "grad_norm": 1.2194510698318481, "grad_norm_var": 0.09971698574367403, "learning_rate": 0.0001, "loss": 135585.1719, "loss/crossentropy": 2.0608363151550293, "loss/hidden": 0.380859375, "loss/logits": 0.0951135903596878, "loss/reg": 135584.703125, "step": 449 }, { "epoch": 0.01125, "grad_norm": 0.8559896349906921, "grad_norm_var": 0.10116030716406522, "learning_rate": 0.0001, "loss": 135155.8906, "loss/crossentropy": 2.7330076694488525, "loss/hidden": 0.337890625, "loss/logits": 0.09428665041923523, "loss/reg": 135155.453125, "step": 450 }, { "epoch": 0.011275, "grad_norm": 0.9683396816253662, "grad_norm_var": 0.09831116364115952, "learning_rate": 0.0001, "loss": 134730.7344, "loss/crossentropy": 2.4699547290802, "loss/hidden": 0.357421875, "loss/logits": 0.09042903780937195, "loss/reg": 134730.28125, "step": 451 }, { "epoch": 0.0113, "grad_norm": 0.8751084804534912, "grad_norm_var": 0.10100734765911383, "learning_rate": 0.0001, "loss": 134306.2344, "loss/crossentropy": 2.533498764038086, "loss/hidden": 0.365234375, "loss/logits": 0.09291449189186096, "loss/reg": 134305.78125, "step": 452 }, { "epoch": 0.011325, "grad_norm": 1.0929409265518188, "grad_norm_var": 0.09836679254087041, "learning_rate": 0.0001, "loss": 133874.5156, "loss/crossentropy": 2.3817315101623535, "loss/hidden": 0.373046875, "loss/logits": 0.10951289534568787, "loss/reg": 133874.03125, "step": 453 }, { "epoch": 0.01135, "grad_norm": 0.7926502227783203, "grad_norm_var": 0.10098423608836003, "learning_rate": 0.0001, "loss": 133446.7031, "loss/crossentropy": 2.558635711669922, "loss/hidden": 0.3359375, "loss/logits": 0.09265337884426117, "loss/reg": 133446.28125, "step": 454 }, { "epoch": 0.011375, "grad_norm": 0.9337485432624817, "grad_norm_var": 0.10101520586683463, "learning_rate": 0.0001, "loss": 133023.4844, "loss/crossentropy": 2.320953845977783, "loss/hidden": 0.357421875, "loss/logits": 0.08878225088119507, "loss/reg": 133023.03125, "step": 455 }, { "epoch": 0.0114, "grad_norm": 0.7837616205215454, "grad_norm_var": 0.10647470419620443, "learning_rate": 0.0001, "loss": 132599.6875, "loss/crossentropy": 2.449989080429077, "loss/hidden": 0.3359375, "loss/logits": 0.08764292299747467, "loss/reg": 132599.265625, "step": 456 }, { "epoch": 0.011425, "grad_norm": 0.9680254459381104, "grad_norm_var": 0.10561100442267085, "learning_rate": 0.0001, "loss": 132178.375, "loss/crossentropy": 2.3862457275390625, "loss/hidden": 0.369140625, "loss/logits": 0.10386032611131668, "loss/reg": 132177.90625, "step": 457 }, { "epoch": 0.01145, "grad_norm": 1.173683762550354, "grad_norm_var": 0.10335192378294071, "learning_rate": 0.0001, "loss": 131756.8281, "loss/crossentropy": 2.3640592098236084, "loss/hidden": 0.369140625, "loss/logits": 0.09086473286151886, "loss/reg": 131756.375, "step": 458 }, { "epoch": 0.011475, "grad_norm": 1.2718472480773926, "grad_norm_var": 0.09633271636354876, "learning_rate": 0.0001, "loss": 131337.7188, "loss/crossentropy": 2.2730746269226074, "loss/hidden": 0.373046875, "loss/logits": 0.07853664457798004, "loss/reg": 131337.265625, "step": 459 }, { "epoch": 0.0115, "grad_norm": 1.3684788942337036, "grad_norm_var": 0.09927873215353529, "learning_rate": 0.0001, "loss": 130913.3594, "loss/crossentropy": 2.5492196083068848, "loss/hidden": 0.392578125, "loss/logits": 0.10558829456567764, "loss/reg": 130912.859375, "step": 460 }, { "epoch": 0.011525, "grad_norm": 1.0493955612182617, "grad_norm_var": 0.09277364145863345, "learning_rate": 0.0001, "loss": 130492.3594, "loss/crossentropy": 2.433809280395508, "loss/hidden": 0.35546875, "loss/logits": 0.08270187675952911, "loss/reg": 130491.921875, "step": 461 }, { "epoch": 0.01155, "grad_norm": 1.0531160831451416, "grad_norm_var": 0.04402211242043747, "learning_rate": 0.0001, "loss": 130074.4922, "loss/crossentropy": 2.5947558879852295, "loss/hidden": 0.369140625, "loss/logits": 0.08141292631626129, "loss/reg": 130074.0390625, "step": 462 }, { "epoch": 0.011575, "grad_norm": 0.7231173515319824, "grad_norm_var": 0.04349435098153685, "learning_rate": 0.0001, "loss": 129660.0547, "loss/crossentropy": 2.4421298503875732, "loss/hidden": 0.330078125, "loss/logits": 0.07788297533988953, "loss/reg": 129659.6484375, "step": 463 }, { "epoch": 0.0116, "grad_norm": 1.0745733976364136, "grad_norm_var": 0.03426826155179915, "learning_rate": 0.0001, "loss": 129248.9766, "loss/crossentropy": 2.3232181072235107, "loss/hidden": 0.373046875, "loss/logits": 0.09334367513656616, "loss/reg": 129248.5078125, "step": 464 }, { "epoch": 0.011625, "grad_norm": 1.112154245376587, "grad_norm_var": 0.03203088759449593, "learning_rate": 0.0001, "loss": 128838.0469, "loss/crossentropy": 2.710418462753296, "loss/hidden": 0.396484375, "loss/logits": 0.14225494861602783, "loss/reg": 128837.5078125, "step": 465 }, { "epoch": 0.01165, "grad_norm": 1.0500752925872803, "grad_norm_var": 0.030501727928448578, "learning_rate": 0.0001, "loss": 128427.5312, "loss/crossentropy": 2.709763288497925, "loss/hidden": 0.361328125, "loss/logits": 0.09049970656633377, "loss/reg": 128427.078125, "step": 466 }, { "epoch": 0.011675, "grad_norm": 1.289119005203247, "grad_norm_var": 0.03480087410717029, "learning_rate": 0.0001, "loss": 128019.4141, "loss/crossentropy": 2.2913756370544434, "loss/hidden": 0.359375, "loss/logits": 0.08161611109972, "loss/reg": 128018.9765625, "step": 467 }, { "epoch": 0.0117, "grad_norm": 1.0929843187332153, "grad_norm_var": 0.033028832528283404, "learning_rate": 0.0001, "loss": 127613.3672, "loss/crossentropy": 2.8107008934020996, "loss/hidden": 0.365234375, "loss/logits": 0.08901776373386383, "loss/reg": 127612.9140625, "step": 468 }, { "epoch": 0.011725, "grad_norm": 0.9957696795463562, "grad_norm_var": 0.033086650592021684, "learning_rate": 0.0001, "loss": 127206.2734, "loss/crossentropy": 2.3889429569244385, "loss/hidden": 0.361328125, "loss/logits": 0.10045656561851501, "loss/reg": 127205.8125, "step": 469 }, { "epoch": 0.01175, "grad_norm": 1.5600621700286865, "grad_norm_var": 0.04399344490662707, "learning_rate": 0.0001, "loss": 126789.2266, "loss/crossentropy": 2.939469814300537, "loss/hidden": 0.373046875, "loss/logits": 0.09594859182834625, "loss/reg": 126788.7578125, "step": 470 }, { "epoch": 0.011775, "grad_norm": 0.8998208045959473, "grad_norm_var": 0.04478916176521275, "learning_rate": 0.0001, "loss": 126383.0859, "loss/crossentropy": 2.659053325653076, "loss/hidden": 0.333984375, "loss/logits": 0.08160565793514252, "loss/reg": 126382.671875, "step": 471 }, { "epoch": 0.0118, "grad_norm": 1.1243245601654053, "grad_norm_var": 0.03805856841151596, "learning_rate": 0.0001, "loss": 125978.3047, "loss/crossentropy": 2.76472806930542, "loss/hidden": 0.353515625, "loss/logits": 0.0836871862411499, "loss/reg": 125977.8671875, "step": 472 }, { "epoch": 0.011825, "grad_norm": 0.9645205140113831, "grad_norm_var": 0.03812704390658297, "learning_rate": 0.0001, "loss": 125572.5547, "loss/crossentropy": 2.460484743118286, "loss/hidden": 0.337890625, "loss/logits": 0.08670766651630402, "loss/reg": 125572.1328125, "step": 473 }, { "epoch": 0.01185, "grad_norm": 0.9478604793548584, "grad_norm_var": 0.03947780081620251, "learning_rate": 0.0001, "loss": 125171.1719, "loss/crossentropy": 2.975003957748413, "loss/hidden": 0.373046875, "loss/logits": 0.08480578660964966, "loss/reg": 125170.7109375, "step": 474 }, { "epoch": 0.011875, "grad_norm": 1.0086603164672852, "grad_norm_var": 0.03772665503536855, "learning_rate": 0.0001, "loss": 124768.9609, "loss/crossentropy": 2.6586267948150635, "loss/hidden": 0.3671875, "loss/logits": 0.0969114601612091, "loss/reg": 124768.5, "step": 475 }, { "epoch": 0.0119, "grad_norm": 0.8238663673400879, "grad_norm_var": 0.035470889545035314, "learning_rate": 0.0001, "loss": 124371.3203, "loss/crossentropy": 2.324495553970337, "loss/hidden": 0.337890625, "loss/logits": 0.08633776009082794, "loss/reg": 124370.8984375, "step": 476 }, { "epoch": 0.011925, "grad_norm": 1.0551565885543823, "grad_norm_var": 0.03547396768774901, "learning_rate": 0.0001, "loss": 123974.6953, "loss/crossentropy": 2.6109156608581543, "loss/hidden": 0.37109375, "loss/logits": 0.09771596640348434, "loss/reg": 123974.2265625, "step": 477 }, { "epoch": 0.01195, "grad_norm": 0.9296664595603943, "grad_norm_var": 0.036349632924163426, "learning_rate": 0.0001, "loss": 123575.5859, "loss/crossentropy": 2.5287327766418457, "loss/hidden": 0.361328125, "loss/logits": 0.08497745543718338, "loss/reg": 123575.140625, "step": 478 }, { "epoch": 0.011975, "grad_norm": 1.175586223602295, "grad_norm_var": 0.029983631986153354, "learning_rate": 0.0001, "loss": 123167.7812, "loss/crossentropy": 2.451228618621826, "loss/hidden": 0.357421875, "loss/logits": 0.083621084690094, "loss/reg": 123167.34375, "step": 479 }, { "epoch": 0.012, "grad_norm": 0.8889312148094177, "grad_norm_var": 0.03199992643621474, "learning_rate": 0.0001, "loss": 122769.4609, "loss/crossentropy": 2.566922903060913, "loss/hidden": 0.357421875, "loss/logits": 0.08629916608333588, "loss/reg": 122769.015625, "step": 480 }, { "epoch": 0.012025, "grad_norm": 1.517897367477417, "grad_norm_var": 0.04525076342957224, "learning_rate": 0.0001, "loss": 122374.2969, "loss/crossentropy": 2.5881693363189697, "loss/hidden": 0.384765625, "loss/logits": 0.09450060874223709, "loss/reg": 122373.8203125, "step": 481 }, { "epoch": 0.01205, "grad_norm": 1.0205724239349365, "grad_norm_var": 0.04543377173875403, "learning_rate": 0.0001, "loss": 121973.6172, "loss/crossentropy": 2.7237818241119385, "loss/hidden": 0.361328125, "loss/logits": 0.09611287713050842, "loss/reg": 121973.15625, "step": 482 }, { "epoch": 0.012075, "grad_norm": 0.9960222840309143, "grad_norm_var": 0.04266674285080444, "learning_rate": 0.0001, "loss": 121575.7031, "loss/crossentropy": 2.5460588932037354, "loss/hidden": 0.373046875, "loss/logits": 0.08726921677589417, "loss/reg": 121575.2421875, "step": 483 }, { "epoch": 0.0121, "grad_norm": 0.9125394225120544, "grad_norm_var": 0.04397089239013236, "learning_rate": 0.0001, "loss": 121182.1719, "loss/crossentropy": 2.662997007369995, "loss/hidden": 0.3515625, "loss/logits": 0.0756755918264389, "loss/reg": 121181.7421875, "step": 484 }, { "epoch": 0.012125, "grad_norm": 0.8799184560775757, "grad_norm_var": 0.045667945002744154, "learning_rate": 0.0001, "loss": 120790.0859, "loss/crossentropy": 2.5711655616760254, "loss/hidden": 0.365234375, "loss/logits": 0.0949600487947464, "loss/reg": 120789.625, "step": 485 }, { "epoch": 0.01215, "grad_norm": 0.7621967196464539, "grad_norm_var": 0.030564364061018937, "learning_rate": 0.0001, "loss": 120401.1562, "loss/crossentropy": 2.4500555992126465, "loss/hidden": 0.361328125, "loss/logits": 0.09178435802459717, "loss/reg": 120400.703125, "step": 486 }, { "epoch": 0.012175, "grad_norm": 0.8588399291038513, "grad_norm_var": 0.031185143693074582, "learning_rate": 0.0001, "loss": 120011.7656, "loss/crossentropy": 2.455263137817383, "loss/hidden": 0.357421875, "loss/logits": 0.09996655583381653, "loss/reg": 120011.3046875, "step": 487 }, { "epoch": 0.0122, "grad_norm": 2.0252444744110107, "grad_norm_var": 0.09784971121349349, "learning_rate": 0.0001, "loss": 119621.1328, "loss/crossentropy": 2.5841636657714844, "loss/hidden": 0.36328125, "loss/logits": 0.0824744701385498, "loss/reg": 119620.6875, "step": 488 }, { "epoch": 0.012225, "grad_norm": 0.9055572748184204, "grad_norm_var": 0.09872304292664413, "learning_rate": 0.0001, "loss": 119235.2109, "loss/crossentropy": 2.594587564468384, "loss/hidden": 0.36328125, "loss/logits": 0.08088156580924988, "loss/reg": 119234.765625, "step": 489 }, { "epoch": 0.01225, "grad_norm": 1.1274175643920898, "grad_norm_var": 0.09842966132213408, "learning_rate": 0.0001, "loss": 118846.4062, "loss/crossentropy": 2.8951048851013184, "loss/hidden": 0.388671875, "loss/logits": 0.11064436286687851, "loss/reg": 118845.90625, "step": 490 }, { "epoch": 0.012275, "grad_norm": 0.8238899111747742, "grad_norm_var": 0.10171747524936578, "learning_rate": 0.0001, "loss": 118458.2656, "loss/crossentropy": 2.617831230163574, "loss/hidden": 0.337890625, "loss/logits": 0.08420297503471375, "loss/reg": 118457.84375, "step": 491 }, { "epoch": 0.0123, "grad_norm": 0.7952893972396851, "grad_norm_var": 0.10260711635534307, "learning_rate": 0.0001, "loss": 118073.4609, "loss/crossentropy": 2.5448977947235107, "loss/hidden": 0.337890625, "loss/logits": 0.09165728092193604, "loss/reg": 118073.03125, "step": 492 }, { "epoch": 0.012325, "grad_norm": 0.9254248142242432, "grad_norm_var": 0.1034343819358753, "learning_rate": 0.0001, "loss": 117686.6094, "loss/crossentropy": 2.6640286445617676, "loss/hidden": 0.365234375, "loss/logits": 0.08283071964979172, "loss/reg": 117686.1640625, "step": 493 }, { "epoch": 0.01235, "grad_norm": 0.8176378011703491, "grad_norm_var": 0.10577815732465817, "learning_rate": 0.0001, "loss": 117297.7266, "loss/crossentropy": 2.5091538429260254, "loss/hidden": 0.349609375, "loss/logits": 0.08984896540641785, "loss/reg": 117297.2890625, "step": 494 }, { "epoch": 0.012375, "grad_norm": 1.3219102621078491, "grad_norm_var": 0.11001404873489135, "learning_rate": 0.0001, "loss": 116913.25, "loss/crossentropy": 2.4136619567871094, "loss/hidden": 0.39453125, "loss/logits": 0.08754974603652954, "loss/reg": 116912.765625, "step": 495 }, { "epoch": 0.0124, "grad_norm": 0.9934073090553284, "grad_norm_var": 0.10864469870001443, "learning_rate": 0.0001, "loss": 116528.2734, "loss/crossentropy": 2.8950345516204834, "loss/hidden": 0.36328125, "loss/logits": 0.09001494199037552, "loss/reg": 116527.8203125, "step": 496 }, { "epoch": 0.012425, "grad_norm": 1.9397823810577393, "grad_norm_var": 0.14649738254872915, "learning_rate": 0.0001, "loss": 116142.4297, "loss/crossentropy": 2.3993234634399414, "loss/hidden": 0.330078125, "loss/logits": 0.0801636129617691, "loss/reg": 116142.015625, "step": 497 }, { "epoch": 0.01245, "grad_norm": 1.7955193519592285, "grad_norm_var": 0.1790168054387628, "learning_rate": 0.0001, "loss": 115763.0078, "loss/crossentropy": 2.5822932720184326, "loss/hidden": 0.373046875, "loss/logits": 0.07560418546199799, "loss/reg": 115762.5625, "step": 498 }, { "epoch": 0.012475, "grad_norm": 1.4114738702774048, "grad_norm_var": 0.18307315732150767, "learning_rate": 0.0001, "loss": 115380.0938, "loss/crossentropy": 2.757403612136841, "loss/hidden": 0.390625, "loss/logits": 0.10880538821220398, "loss/reg": 115379.59375, "step": 499 }, { "epoch": 0.0125, "grad_norm": 1.706606149673462, "grad_norm_var": 0.19802862511289915, "learning_rate": 0.0001, "loss": 114999.9688, "loss/crossentropy": 3.1950490474700928, "loss/hidden": 0.41796875, "loss/logits": 0.10724850744009018, "loss/reg": 114999.4453125, "step": 500 }, { "epoch": 0.012525, "grad_norm": 0.978772759437561, "grad_norm_var": 0.1945110486983968, "learning_rate": 0.0001, "loss": 114607.5547, "loss/crossentropy": 2.5394797325134277, "loss/hidden": 0.373046875, "loss/logits": 0.11042068153619766, "loss/reg": 114607.0703125, "step": 501 }, { "epoch": 0.01255, "grad_norm": 0.786778450012207, "grad_norm_var": 0.1931161462249265, "learning_rate": 0.0001, "loss": 114225.5312, "loss/crossentropy": 2.4225008487701416, "loss/hidden": 0.373046875, "loss/logits": 0.09658807516098022, "loss/reg": 114225.0625, "step": 502 }, { "epoch": 0.012575, "grad_norm": 1.7830466032028198, "grad_norm_var": 0.20435632639445528, "learning_rate": 0.0001, "loss": 113844.4453, "loss/crossentropy": 2.6307485103607178, "loss/hidden": 0.369140625, "loss/logits": 0.07897418737411499, "loss/reg": 113844.0, "step": 503 }, { "epoch": 0.0126, "grad_norm": 1.090800166130066, "grad_norm_var": 0.16341348624494217, "learning_rate": 0.0001, "loss": 113465.0469, "loss/crossentropy": 2.7216010093688965, "loss/hidden": 0.3828125, "loss/logits": 0.11136461049318314, "loss/reg": 113464.5546875, "step": 504 }, { "epoch": 0.012625, "grad_norm": 0.7676119804382324, "grad_norm_var": 0.1700222008788211, "learning_rate": 0.0001, "loss": 113085.375, "loss/crossentropy": 2.2064309120178223, "loss/hidden": 0.337890625, "loss/logits": 0.07960689067840576, "loss/reg": 113084.9609375, "step": 505 }, { "epoch": 0.01265, "grad_norm": 0.9918925166130066, "grad_norm_var": 0.17232965666638395, "learning_rate": 0.0001, "loss": 112710.0703, "loss/crossentropy": 2.38382625579834, "loss/hidden": 0.369140625, "loss/logits": 0.08772751688957214, "loss/reg": 112709.6171875, "step": 506 }, { "epoch": 0.012675, "grad_norm": 0.8411089181900024, "grad_norm_var": 0.17152345377644476, "learning_rate": 0.0001, "loss": 112330.3438, "loss/crossentropy": 2.376312017440796, "loss/hidden": 0.328125, "loss/logits": 0.07509603351354599, "loss/reg": 112329.9375, "step": 507 }, { "epoch": 0.0127, "grad_norm": 1.2928287982940674, "grad_norm_var": 0.1611958341104075, "learning_rate": 0.0001, "loss": 111957.0781, "loss/crossentropy": 2.7829861640930176, "loss/hidden": 0.3671875, "loss/logits": 0.09470954537391663, "loss/reg": 111956.6171875, "step": 508 }, { "epoch": 0.012725, "grad_norm": 1.4744898080825806, "grad_norm_var": 0.15881738681891258, "learning_rate": 0.0001, "loss": 111584.1328, "loss/crossentropy": 2.372309684753418, "loss/hidden": 0.396484375, "loss/logits": 0.11247822642326355, "loss/reg": 111583.625, "step": 509 }, { "epoch": 0.01275, "grad_norm": 0.9410738348960876, "grad_norm_var": 0.1526603027667998, "learning_rate": 0.0001, "loss": 111213.9766, "loss/crossentropy": 2.5752036571502686, "loss/hidden": 0.373046875, "loss/logits": 0.09293421357870102, "loss/reg": 111213.5078125, "step": 510 }, { "epoch": 0.012775, "grad_norm": 1.239321231842041, "grad_norm_var": 0.15237534080648046, "learning_rate": 0.0001, "loss": 110848.4688, "loss/crossentropy": 2.4036667346954346, "loss/hidden": 0.4140625, "loss/logits": 0.14227986335754395, "loss/reg": 110847.9140625, "step": 511 }, { "epoch": 0.0128, "grad_norm": 0.8401315212249756, "grad_norm_var": 0.15913169358604756, "learning_rate": 0.0001, "loss": 110481.7969, "loss/crossentropy": 2.3782424926757812, "loss/hidden": 0.357421875, "loss/logits": 0.08400236815214157, "loss/reg": 110481.3515625, "step": 512 }, { "epoch": 0.012825, "grad_norm": 0.8280027508735657, "grad_norm_var": 0.13303329187288068, "learning_rate": 0.0001, "loss": 110117.1172, "loss/crossentropy": 2.454601287841797, "loss/hidden": 0.333984375, "loss/logits": 0.07852231711149216, "loss/reg": 110116.703125, "step": 513 }, { "epoch": 0.01285, "grad_norm": 1.0113853216171265, "grad_norm_var": 0.1063868120555579, "learning_rate": 0.0001, "loss": 109748.8125, "loss/crossentropy": 2.382014036178589, "loss/hidden": 0.376953125, "loss/logits": 0.09340295195579529, "loss/reg": 109748.34375, "step": 514 }, { "epoch": 0.012875, "grad_norm": 0.9986621141433716, "grad_norm_var": 0.10121920328154285, "learning_rate": 0.0001, "loss": 109380.7422, "loss/crossentropy": 2.654278039932251, "loss/hidden": 0.373046875, "loss/logits": 0.10119034349918365, "loss/reg": 109380.265625, "step": 515 }, { "epoch": 0.0129, "grad_norm": 0.9399179816246033, "grad_norm_var": 0.07577139049272527, "learning_rate": 0.0001, "loss": 109014.1719, "loss/crossentropy": 2.8084640502929688, "loss/hidden": 0.33984375, "loss/logits": 0.09238427132368088, "loss/reg": 109013.7421875, "step": 516 }, { "epoch": 0.012925, "grad_norm": 0.9189648628234863, "grad_norm_var": 0.07656584860278356, "learning_rate": 0.0001, "loss": 108654.2734, "loss/crossentropy": 2.445586681365967, "loss/hidden": 0.359375, "loss/logits": 0.10389550030231476, "loss/reg": 108653.8125, "step": 517 }, { "epoch": 0.01295, "grad_norm": 0.8263663649559021, "grad_norm_var": 0.07529222214620088, "learning_rate": 0.0001, "loss": 108293.4453, "loss/crossentropy": 2.752142906188965, "loss/hidden": 0.35546875, "loss/logits": 0.09072582423686981, "loss/reg": 108293.0, "step": 518 }, { "epoch": 0.012975, "grad_norm": 1.0929358005523682, "grad_norm_var": 0.03752412651066224, "learning_rate": 0.0001, "loss": 107936.4297, "loss/crossentropy": 2.3689184188842773, "loss/hidden": 0.423828125, "loss/logits": 0.09004764258861542, "loss/reg": 107935.9140625, "step": 519 }, { "epoch": 0.013, "grad_norm": 1.1709879636764526, "grad_norm_var": 0.038833003307898045, "learning_rate": 0.0001, "loss": 107557.6484, "loss/crossentropy": 2.616063117980957, "loss/hidden": 0.373046875, "loss/logits": 0.10525017976760864, "loss/reg": 107557.171875, "step": 520 }, { "epoch": 0.013025, "grad_norm": 0.8341318964958191, "grad_norm_var": 0.036951049164519564, "learning_rate": 0.0001, "loss": 107197.8438, "loss/crossentropy": 2.571011781692505, "loss/hidden": 0.359375, "loss/logits": 0.10364649444818497, "loss/reg": 107197.3828125, "step": 521 }, { "epoch": 0.01305, "grad_norm": 1.233487606048584, "grad_norm_var": 0.03985027429405255, "learning_rate": 0.0001, "loss": 106844.7031, "loss/crossentropy": 2.4951682090759277, "loss/hidden": 0.373046875, "loss/logits": 0.09696684777736664, "loss/reg": 106844.234375, "step": 522 }, { "epoch": 0.013075, "grad_norm": 1.2658134698867798, "grad_norm_var": 0.04041382203118962, "learning_rate": 0.0001, "loss": 106480.8828, "loss/crossentropy": 2.8979501724243164, "loss/hidden": 0.369140625, "loss/logits": 0.08361580967903137, "loss/reg": 106480.4296875, "step": 523 }, { "epoch": 0.0131, "grad_norm": 0.8786676526069641, "grad_norm_var": 0.03809952111847513, "learning_rate": 0.0001, "loss": 106128.5547, "loss/crossentropy": 2.651653528213501, "loss/hidden": 0.365234375, "loss/logits": 0.09834752231836319, "loss/reg": 106128.09375, "step": 524 }, { "epoch": 0.013125, "grad_norm": 0.8635250926017761, "grad_norm_var": 0.025293388164052393, "learning_rate": 0.0001, "loss": 105776.7734, "loss/crossentropy": 2.737975597381592, "loss/hidden": 0.376953125, "loss/logits": 0.10877284407615662, "loss/reg": 105776.2890625, "step": 525 }, { "epoch": 0.01315, "grad_norm": 4.461543083190918, "grad_norm_var": 0.7756616132701457, "learning_rate": 0.0001, "loss": 105422.5156, "loss/crossentropy": 3.27087664604187, "loss/hidden": 0.458984375, "loss/logits": 0.15441305935382843, "loss/reg": 105421.8984375, "step": 526 }, { "epoch": 0.013175, "grad_norm": 0.9257360696792603, "grad_norm_var": 0.780696206640098, "learning_rate": 0.0001, "loss": 105058.2188, "loss/crossentropy": 2.386378765106201, "loss/hidden": 0.34765625, "loss/logits": 0.10172270983457565, "loss/reg": 105057.765625, "step": 527 }, { "epoch": 0.0132, "grad_norm": 0.9832150936126709, "grad_norm_var": 0.7752411115564602, "learning_rate": 0.0001, "loss": 104708.0547, "loss/crossentropy": 2.753894805908203, "loss/hidden": 0.380859375, "loss/logits": 0.10964615643024445, "loss/reg": 104707.5625, "step": 528 }, { "epoch": 0.013225, "grad_norm": 0.8955684900283813, "grad_norm_var": 0.7721564224718672, "learning_rate": 0.0001, "loss": 104329.8594, "loss/crossentropy": 2.3564887046813965, "loss/hidden": 0.35546875, "loss/logits": 0.08882109820842743, "loss/reg": 104329.4140625, "step": 529 }, { "epoch": 0.01325, "grad_norm": 0.8905146718025208, "grad_norm_var": 0.7762109010940305, "learning_rate": 0.0001, "loss": 103978.125, "loss/crossentropy": 2.3647301197052, "loss/hidden": 0.373046875, "loss/logits": 0.09351895749568939, "loss/reg": 103977.65625, "step": 530 }, { "epoch": 0.013275, "grad_norm": 1.150484561920166, "grad_norm_var": 0.7736011035444074, "learning_rate": 0.0001, "loss": 103626.8047, "loss/crossentropy": 2.584282636642456, "loss/hidden": 0.375, "loss/logits": 0.11200837045907974, "loss/reg": 103626.3203125, "step": 531 }, { "epoch": 0.0133, "grad_norm": 1.9985939264297485, "grad_norm_var": 0.8057751180104772, "learning_rate": 0.0001, "loss": 103278.8828, "loss/crossentropy": 2.514350652694702, "loss/hidden": 0.40234375, "loss/logits": 0.1426522135734558, "loss/reg": 103278.3359375, "step": 532 }, { "epoch": 0.013325, "grad_norm": 1.1248385906219482, "grad_norm_var": 0.7986672489099291, "learning_rate": 0.0001, "loss": 102931.1797, "loss/crossentropy": 2.7196877002716064, "loss/hidden": 0.380859375, "loss/logits": 0.102044478058815, "loss/reg": 102930.6953125, "step": 533 }, { "epoch": 0.01335, "grad_norm": 1.074903130531311, "grad_norm_var": 0.7872541863960926, "learning_rate": 0.0001, "loss": 102585.3672, "loss/crossentropy": 2.4676969051361084, "loss/hidden": 0.375, "loss/logits": 0.09518338739871979, "loss/reg": 102584.8984375, "step": 534 }, { "epoch": 0.013375, "grad_norm": 1.7951133251190186, "grad_norm_var": 0.7984208988539524, "learning_rate": 0.0001, "loss": 102241.6562, "loss/crossentropy": 3.1216862201690674, "loss/hidden": 0.42578125, "loss/logits": 0.10711027681827545, "loss/reg": 102241.125, "step": 535 }, { "epoch": 0.0134, "grad_norm": 1.094664454460144, "grad_norm_var": 0.8005730586879667, "learning_rate": 0.0001, "loss": 101879.2578, "loss/crossentropy": 2.677700996398926, "loss/hidden": 0.380859375, "loss/logits": 0.10375779867172241, "loss/reg": 101878.7734375, "step": 536 }, { "epoch": 0.013425, "grad_norm": 2.5311005115509033, "grad_norm_var": 0.8656599200048455, "learning_rate": 0.0001, "loss": 101537.0859, "loss/crossentropy": 2.8987925052642822, "loss/hidden": 0.42578125, "loss/logits": 0.24518340826034546, "loss/reg": 101536.4140625, "step": 537 }, { "epoch": 0.01345, "grad_norm": 1.2476500272750854, "grad_norm_var": 0.8652674144164189, "learning_rate": 0.0001, "loss": 101195.7656, "loss/crossentropy": 2.3096818923950195, "loss/hidden": 0.388671875, "loss/logits": 0.08652019500732422, "loss/reg": 101195.2890625, "step": 538 }, { "epoch": 0.013475, "grad_norm": 1.0766700506210327, "grad_norm_var": 0.8721199107049263, "learning_rate": 0.0001, "loss": 100856.5469, "loss/crossentropy": 2.541144609451294, "loss/hidden": 0.396484375, "loss/logits": 0.10943476855754852, "loss/reg": 100856.0390625, "step": 539 }, { "epoch": 0.0135, "grad_norm": 1.6914061307907104, "grad_norm_var": 0.8528948007387608, "learning_rate": 0.0001, "loss": 100516.1016, "loss/crossentropy": 2.6160571575164795, "loss/hidden": 0.4296875, "loss/logits": 0.12198935449123383, "loss/reg": 100515.546875, "step": 540 }, { "epoch": 0.013525, "grad_norm": 1.2223988771438599, "grad_norm_var": 0.8310705723616335, "learning_rate": 0.0001, "loss": 100175.4688, "loss/crossentropy": 2.515537977218628, "loss/hidden": 0.40234375, "loss/logits": 0.08977995067834854, "loss/reg": 100174.9765625, "step": 541 }, { "epoch": 0.01355, "grad_norm": 0.8259720206260681, "grad_norm_var": 0.2265506831187555, "learning_rate": 0.0001, "loss": 99839.1875, "loss/crossentropy": 2.3948302268981934, "loss/hidden": 0.361328125, "loss/logits": 0.08301985263824463, "loss/reg": 99838.7421875, "step": 542 }, { "epoch": 0.013575, "grad_norm": 0.8785867691040039, "grad_norm_var": 0.2289359161668815, "learning_rate": 0.0001, "loss": 99498.9609, "loss/crossentropy": 2.3601667881011963, "loss/hidden": 0.375, "loss/logits": 0.08771521598100662, "loss/reg": 99498.5, "step": 543 }, { "epoch": 0.0136, "grad_norm": 0.8317751288414001, "grad_norm_var": 0.23636409549236057, "learning_rate": 0.0001, "loss": 99163.625, "loss/crossentropy": 2.643152952194214, "loss/hidden": 0.365234375, "loss/logits": 0.08720413595438004, "loss/reg": 99163.171875, "step": 544 }, { "epoch": 0.013625, "grad_norm": 0.9161708354949951, "grad_norm_var": 0.23536031020803988, "learning_rate": 0.0001, "loss": 98827.1016, "loss/crossentropy": 2.6195507049560547, "loss/hidden": 0.369140625, "loss/logits": 0.0870186984539032, "loss/reg": 98826.6484375, "step": 545 }, { "epoch": 0.01365, "grad_norm": 1.1316457986831665, "grad_norm_var": 0.22673158391756795, "learning_rate": 0.0001, "loss": 98492.0391, "loss/crossentropy": 2.3038763999938965, "loss/hidden": 0.37890625, "loss/logits": 0.09377077966928482, "loss/reg": 98491.5625, "step": 546 }, { "epoch": 0.013675, "grad_norm": 1.3292113542556763, "grad_norm_var": 0.2254748810643837, "learning_rate": 0.0001, "loss": 98157.9375, "loss/crossentropy": 2.382305860519409, "loss/hidden": 0.373046875, "loss/logits": 0.09558240324258804, "loss/reg": 98157.46875, "step": 547 }, { "epoch": 0.0137, "grad_norm": 1.2655940055847168, "grad_norm_var": 0.1906005621512387, "learning_rate": 0.0001, "loss": 97828.7734, "loss/crossentropy": 2.9607603549957275, "loss/hidden": 0.390625, "loss/logits": 0.09649337083101273, "loss/reg": 97828.2890625, "step": 548 }, { "epoch": 0.013725, "grad_norm": 1.2224743366241455, "grad_norm_var": 0.18953631993640882, "learning_rate": 0.0001, "loss": 97497.375, "loss/crossentropy": 2.421534776687622, "loss/hidden": 0.40625, "loss/logits": 0.10133738815784454, "loss/reg": 97496.8671875, "step": 549 }, { "epoch": 0.01375, "grad_norm": 0.9172767996788025, "grad_norm_var": 0.19494695427785905, "learning_rate": 0.0001, "loss": 97166.6797, "loss/crossentropy": 2.6120481491088867, "loss/hidden": 0.384765625, "loss/logits": 0.09978368878364563, "loss/reg": 97166.1953125, "step": 550 }, { "epoch": 0.013775, "grad_norm": 0.9448373317718506, "grad_norm_var": 0.17817502771646507, "learning_rate": 0.0001, "loss": 96838.8984, "loss/crossentropy": 2.3174571990966797, "loss/hidden": 0.376953125, "loss/logits": 0.09136012196540833, "loss/reg": 96838.4296875, "step": 551 }, { "epoch": 0.0138, "grad_norm": 0.877375602722168, "grad_norm_var": 0.1840462984259357, "learning_rate": 0.0001, "loss": 96509.8906, "loss/crossentropy": 2.413999319076538, "loss/hidden": 0.375, "loss/logits": 0.10230405628681183, "loss/reg": 96509.4140625, "step": 552 }, { "epoch": 0.013825, "grad_norm": 0.716687023639679, "grad_norm_var": 0.06339745650757174, "learning_rate": 0.0001, "loss": 96183.0312, "loss/crossentropy": 2.5708112716674805, "loss/hidden": 0.359375, "loss/logits": 0.0769326239824295, "loss/reg": 96182.59375, "step": 553 }, { "epoch": 0.01385, "grad_norm": 1.237869143486023, "grad_norm_var": 0.06316978103030278, "learning_rate": 0.0001, "loss": 95858.75, "loss/crossentropy": 2.6847381591796875, "loss/hidden": 0.43359375, "loss/logits": 0.11960765719413757, "loss/reg": 95858.1953125, "step": 554 }, { "epoch": 0.013875, "grad_norm": 1.0143837928771973, "grad_norm_var": 0.06333918788560121, "learning_rate": 0.0001, "loss": 95535.5703, "loss/crossentropy": 2.547424077987671, "loss/hidden": 0.373046875, "loss/logits": 0.10015293955802917, "loss/reg": 95535.09375, "step": 555 }, { "epoch": 0.0139, "grad_norm": 1.0703991651535034, "grad_norm_var": 0.03549074686594148, "learning_rate": 0.0001, "loss": 95211.0859, "loss/crossentropy": 2.3821773529052734, "loss/hidden": 0.376953125, "loss/logits": 0.10645414888858795, "loss/reg": 95210.6015625, "step": 556 }, { "epoch": 0.013925, "grad_norm": 0.9475630521774292, "grad_norm_var": 0.03298411734819121, "learning_rate": 0.0001, "loss": 94887.0703, "loss/crossentropy": 2.0802979469299316, "loss/hidden": 0.380859375, "loss/logits": 0.08815540373325348, "loss/reg": 94886.6015625, "step": 557 }, { "epoch": 0.01395, "grad_norm": 0.9503076076507568, "grad_norm_var": 0.030932836160949256, "learning_rate": 0.0001, "loss": 94565.9141, "loss/crossentropy": 2.40145206451416, "loss/hidden": 0.3515625, "loss/logits": 0.10239367187023163, "loss/reg": 94565.4609375, "step": 558 }, { "epoch": 0.013975, "grad_norm": 1.001888632774353, "grad_norm_var": 0.029627885822190794, "learning_rate": 0.0001, "loss": 94249.375, "loss/crossentropy": 2.5415737628936768, "loss/hidden": 0.376953125, "loss/logits": 0.0858616754412651, "loss/reg": 94248.9140625, "step": 559 }, { "epoch": 0.014, "grad_norm": 0.8086162209510803, "grad_norm_var": 0.030253320954721833, "learning_rate": 0.0001, "loss": 93932.4844, "loss/crossentropy": 2.635380506515503, "loss/hidden": 0.359375, "loss/logits": 0.07743102312088013, "loss/reg": 93932.046875, "step": 560 }, { "epoch": 0.014025, "grad_norm": 0.9812893271446228, "grad_norm_var": 0.029599325823544357, "learning_rate": 0.0001, "loss": 93618.0938, "loss/crossentropy": 2.4184508323669434, "loss/hidden": 0.369140625, "loss/logits": 0.08096189796924591, "loss/reg": 93617.640625, "step": 561 }, { "epoch": 0.01405, "grad_norm": 1.1916530132293701, "grad_norm_var": 0.0306689382673141, "learning_rate": 0.0001, "loss": 93306.0234, "loss/crossentropy": 2.4902424812316895, "loss/hidden": 0.3984375, "loss/logits": 0.10383652150630951, "loss/reg": 93305.5234375, "step": 562 }, { "epoch": 0.014075, "grad_norm": 0.9122821688652039, "grad_norm_var": 0.024891042003138592, "learning_rate": 0.0001, "loss": 92991.9062, "loss/crossentropy": 2.4827306270599365, "loss/hidden": 0.373046875, "loss/logits": 0.09174686670303345, "loss/reg": 92991.4453125, "step": 563 }, { "epoch": 0.0141, "grad_norm": 0.9577797055244446, "grad_norm_var": 0.020067592909861532, "learning_rate": 0.0001, "loss": 92681.2266, "loss/crossentropy": 2.670959949493408, "loss/hidden": 0.361328125, "loss/logits": 0.09282038360834122, "loss/reg": 92680.7734375, "step": 564 }, { "epoch": 0.014125, "grad_norm": 1.2030467987060547, "grad_norm_var": 0.01947485873974557, "learning_rate": 0.0001, "loss": 92370.6641, "loss/crossentropy": 2.9116733074188232, "loss/hidden": 0.390625, "loss/logits": 0.10250720381736755, "loss/reg": 92370.171875, "step": 565 }, { "epoch": 0.01415, "grad_norm": 0.925117552280426, "grad_norm_var": 0.0194096484452747, "learning_rate": 0.0001, "loss": 92061.8906, "loss/crossentropy": 2.763044834136963, "loss/hidden": 0.3828125, "loss/logits": 0.10554227977991104, "loss/reg": 92061.3984375, "step": 566 }, { "epoch": 0.014175, "grad_norm": 0.9973451495170593, "grad_norm_var": 0.019309056507003423, "learning_rate": 0.0001, "loss": 91752.0859, "loss/crossentropy": 2.4450998306274414, "loss/hidden": 0.390625, "loss/logits": 0.13355585932731628, "loss/reg": 91751.5625, "step": 567 }, { "epoch": 0.0142, "grad_norm": 1.3838785886764526, "grad_norm_var": 0.027933021499690118, "learning_rate": 0.0001, "loss": 91444.9141, "loss/crossentropy": 2.6224660873413086, "loss/hidden": 0.37890625, "loss/logits": 0.08861524611711502, "loss/reg": 91444.4453125, "step": 568 }, { "epoch": 0.014225, "grad_norm": 1.522208571434021, "grad_norm_var": 0.03604393237560212, "learning_rate": 0.0001, "loss": 91137.6016, "loss/crossentropy": 2.6689343452453613, "loss/hidden": 0.392578125, "loss/logits": 0.10325664281845093, "loss/reg": 91137.109375, "step": 569 }, { "epoch": 0.01425, "grad_norm": 0.8374614119529724, "grad_norm_var": 0.037054228421395864, "learning_rate": 0.0001, "loss": 90830.6172, "loss/crossentropy": 2.6271815299987793, "loss/hidden": 0.373046875, "loss/logits": 0.0959094762802124, "loss/reg": 90830.1484375, "step": 570 }, { "epoch": 0.014275, "grad_norm": 0.8968411684036255, "grad_norm_var": 0.03838309652858253, "learning_rate": 0.0001, "loss": 90525.5781, "loss/crossentropy": 2.5133578777313232, "loss/hidden": 0.375, "loss/logits": 0.10282105207443237, "loss/reg": 90525.1015625, "step": 571 }, { "epoch": 0.0143, "grad_norm": 1.0100209712982178, "grad_norm_var": 0.038339889865839166, "learning_rate": 0.0001, "loss": 90223.0859, "loss/crossentropy": 2.4223990440368652, "loss/hidden": 0.376953125, "loss/logits": 0.10173159837722778, "loss/reg": 90222.609375, "step": 572 }, { "epoch": 0.014325, "grad_norm": 0.8718961477279663, "grad_norm_var": 0.03955925754722214, "learning_rate": 0.0001, "loss": 89917.3984, "loss/crossentropy": 2.210176944732666, "loss/hidden": 0.369140625, "loss/logits": 0.09007933735847473, "loss/reg": 89916.9375, "step": 573 }, { "epoch": 0.01435, "grad_norm": 1.0079243183135986, "grad_norm_var": 0.039168142839422854, "learning_rate": 0.0001, "loss": 89616.25, "loss/crossentropy": 2.6199238300323486, "loss/hidden": 0.376953125, "loss/logits": 0.09161718934774399, "loss/reg": 89615.78125, "step": 574 }, { "epoch": 0.014375, "grad_norm": 1.3659257888793945, "grad_norm_var": 0.04599762261593446, "learning_rate": 0.0001, "loss": 89312.8125, "loss/crossentropy": 2.835010051727295, "loss/hidden": 0.4140625, "loss/logits": 0.11617124080657959, "loss/reg": 89312.28125, "step": 575 }, { "epoch": 0.0144, "grad_norm": 0.9582456350326538, "grad_norm_var": 0.04248980195343724, "learning_rate": 0.0001, "loss": 89009.8359, "loss/crossentropy": 2.6877806186676025, "loss/hidden": 0.375, "loss/logits": 0.10095453262329102, "loss/reg": 89009.359375, "step": 576 }, { "epoch": 0.014425, "grad_norm": 1.1128160953521729, "grad_norm_var": 0.04212170640374529, "learning_rate": 0.0001, "loss": 88710.6953, "loss/crossentropy": 2.6519720554351807, "loss/hidden": 0.3984375, "loss/logits": 0.118956059217453, "loss/reg": 88710.1796875, "step": 577 }, { "epoch": 0.01445, "grad_norm": 0.9801219701766968, "grad_norm_var": 0.04154788913363323, "learning_rate": 0.0001, "loss": 88412.6406, "loss/crossentropy": 2.6760315895080566, "loss/hidden": 0.3671875, "loss/logits": 0.08030475676059723, "loss/reg": 88412.1953125, "step": 578 }, { "epoch": 0.014475, "grad_norm": 0.9177776575088501, "grad_norm_var": 0.041442321658872024, "learning_rate": 0.0001, "loss": 88107.7578, "loss/crossentropy": 2.2624671459198, "loss/hidden": 0.3828125, "loss/logits": 0.0892476812005043, "loss/reg": 88107.2890625, "step": 579 }, { "epoch": 0.0145, "grad_norm": 1.1240119934082031, "grad_norm_var": 0.04091981190463655, "learning_rate": 0.0001, "loss": 87812.875, "loss/crossentropy": 2.497429370880127, "loss/hidden": 0.3828125, "loss/logits": 0.10727068781852722, "loss/reg": 87812.3828125, "step": 580 }, { "epoch": 0.014525, "grad_norm": 0.884501576423645, "grad_norm_var": 0.041596667959251814, "learning_rate": 0.0001, "loss": 87488.25, "loss/crossentropy": 2.6077654361724854, "loss/hidden": 0.369140625, "loss/logits": 0.1036926656961441, "loss/reg": 87487.7734375, "step": 581 }, { "epoch": 0.01455, "grad_norm": 0.8187013864517212, "grad_norm_var": 0.044072914499201234, "learning_rate": 0.0001, "loss": 87187.7188, "loss/crossentropy": 2.4284141063690186, "loss/hidden": 0.34765625, "loss/logits": 0.08528505265712738, "loss/reg": 87187.2890625, "step": 582 }, { "epoch": 0.014575, "grad_norm": 1.1420753002166748, "grad_norm_var": 0.0444990484171594, "learning_rate": 0.0001, "loss": 86893.1641, "loss/crossentropy": 2.579559564590454, "loss/hidden": 0.373046875, "loss/logits": 0.10599015653133392, "loss/reg": 86892.6875, "step": 583 }, { "epoch": 0.0146, "grad_norm": 0.8006011247634888, "grad_norm_var": 0.03996373630048636, "learning_rate": 0.0001, "loss": 86596.6172, "loss/crossentropy": 2.7767887115478516, "loss/hidden": 0.3359375, "loss/logits": 0.09008356928825378, "loss/reg": 86596.1875, "step": 584 }, { "epoch": 0.014625, "grad_norm": 0.9800305366516113, "grad_norm_var": 0.02172002933442022, "learning_rate": 0.0001, "loss": 86303.8984, "loss/crossentropy": 2.4792940616607666, "loss/hidden": 0.376953125, "loss/logits": 0.09733758866786957, "loss/reg": 86303.421875, "step": 585 }, { "epoch": 0.01465, "grad_norm": 1.0853314399719238, "grad_norm_var": 0.02078939040552138, "learning_rate": 0.0001, "loss": 86007.5703, "loss/crossentropy": 2.667954683303833, "loss/hidden": 0.369140625, "loss/logits": 0.08128130435943604, "loss/reg": 86007.1171875, "step": 586 }, { "epoch": 0.014675, "grad_norm": 1.3247121572494507, "grad_norm_var": 0.02650028433099957, "learning_rate": 0.0001, "loss": 85712.7266, "loss/crossentropy": 2.5810751914978027, "loss/hidden": 0.375, "loss/logits": 0.10023487359285355, "loss/reg": 85712.25, "step": 587 }, { "epoch": 0.0147, "grad_norm": 1.313215732574463, "grad_norm_var": 0.03167885628815436, "learning_rate": 0.0001, "loss": 85422.0781, "loss/crossentropy": 2.421856164932251, "loss/hidden": 0.396484375, "loss/logits": 0.10593342781066895, "loss/reg": 85421.578125, "step": 588 }, { "epoch": 0.014725, "grad_norm": 0.9700228571891785, "grad_norm_var": 0.030042102630343695, "learning_rate": 0.0001, "loss": 85133.0859, "loss/crossentropy": 2.4789178371429443, "loss/hidden": 0.375, "loss/logits": 0.08909668773412704, "loss/reg": 85132.625, "step": 589 }, { "epoch": 0.01475, "grad_norm": 1.1963694095611572, "grad_norm_var": 0.03122634137597822, "learning_rate": 0.0001, "loss": 84843.0078, "loss/crossentropy": 2.433040142059326, "loss/hidden": 0.404296875, "loss/logits": 0.10144652426242828, "loss/reg": 84842.5, "step": 590 }, { "epoch": 0.014775, "grad_norm": 0.992373526096344, "grad_norm_var": 0.024755451206358654, "learning_rate": 0.0001, "loss": 84551.25, "loss/crossentropy": 2.5630834102630615, "loss/hidden": 0.39453125, "loss/logits": 0.09752138704061508, "loss/reg": 84550.7578125, "step": 591 }, { "epoch": 0.0148, "grad_norm": 0.8379811644554138, "grad_norm_var": 0.026931197603058725, "learning_rate": 0.0001, "loss": 84259.4141, "loss/crossentropy": 2.4513022899627686, "loss/hidden": 0.37109375, "loss/logits": 0.10591454803943634, "loss/reg": 84258.9375, "step": 592 }, { "epoch": 0.014825, "grad_norm": 0.9364086985588074, "grad_norm_var": 0.026929201080542895, "learning_rate": 0.0001, "loss": 83951.6641, "loss/crossentropy": 2.608539342880249, "loss/hidden": 0.369140625, "loss/logits": 0.0893184095621109, "loss/reg": 83951.203125, "step": 593 }, { "epoch": 0.01485, "grad_norm": 0.8221688270568848, "grad_norm_var": 0.029307624736031577, "learning_rate": 0.0001, "loss": 83660.5469, "loss/crossentropy": 2.3135383129119873, "loss/hidden": 0.3828125, "loss/logits": 0.09200223535299301, "loss/reg": 83660.0703125, "step": 594 }, { "epoch": 0.014875, "grad_norm": 0.9461011290550232, "grad_norm_var": 0.029012726690810557, "learning_rate": 0.0001, "loss": 83372.8125, "loss/crossentropy": 2.4558818340301514, "loss/hidden": 0.37109375, "loss/logits": 0.09155316650867462, "loss/reg": 83372.3515625, "step": 595 }, { "epoch": 0.0149, "grad_norm": 1.0248985290527344, "grad_norm_var": 0.028132075434765116, "learning_rate": 0.0001, "loss": 83085.7266, "loss/crossentropy": 2.8080692291259766, "loss/hidden": 0.375, "loss/logits": 0.09577822685241699, "loss/reg": 83085.2578125, "step": 596 }, { "epoch": 0.014925, "grad_norm": 1.0959073305130005, "grad_norm_var": 0.0275367479929386, "learning_rate": 0.0001, "loss": 82797.8594, "loss/crossentropy": 2.641843795776367, "loss/hidden": 0.380859375, "loss/logits": 0.09072021394968033, "loss/reg": 82797.390625, "step": 597 }, { "epoch": 0.01495, "grad_norm": 0.8215295672416687, "grad_norm_var": 0.027462120182847074, "learning_rate": 0.0001, "loss": 82510.1328, "loss/crossentropy": 2.4811935424804688, "loss/hidden": 0.365234375, "loss/logits": 0.09065172076225281, "loss/reg": 82509.6796875, "step": 598 }, { "epoch": 0.014975, "grad_norm": 0.9130824208259583, "grad_norm_var": 0.02695446037684032, "learning_rate": 0.0001, "loss": 82227.9922, "loss/crossentropy": 2.62929105758667, "loss/hidden": 0.380859375, "loss/logits": 0.0900956392288208, "loss/reg": 82227.5234375, "step": 599 }, { "epoch": 0.015, "grad_norm": 0.9190696477890015, "grad_norm_var": 0.024622010106409423, "learning_rate": 0.0001, "loss": 81942.7578, "loss/crossentropy": 2.5592992305755615, "loss/hidden": 0.376953125, "loss/logits": 0.10351692140102386, "loss/reg": 81942.2734375, "step": 600 }, { "epoch": 0.015025, "grad_norm": 0.9625332951545715, "grad_norm_var": 0.024713862414948787, "learning_rate": 0.0001, "loss": 81661.8047, "loss/crossentropy": 2.3075287342071533, "loss/hidden": 0.392578125, "loss/logits": 0.11193102598190308, "loss/reg": 81661.296875, "step": 601 }, { "epoch": 0.01505, "grad_norm": 1.1114728450775146, "grad_norm_var": 0.025018770937851778, "learning_rate": 0.0001, "loss": 81382.1797, "loss/crossentropy": 2.69278621673584, "loss/hidden": 0.380859375, "loss/logits": 0.09133666008710861, "loss/reg": 81381.7109375, "step": 602 }, { "epoch": 0.015075, "grad_norm": 0.9171868562698364, "grad_norm_var": 0.018392723266432872, "learning_rate": 0.0001, "loss": 81098.6562, "loss/crossentropy": 2.861598253250122, "loss/hidden": 0.3671875, "loss/logits": 0.09767521917819977, "loss/reg": 81098.1875, "step": 603 }, { "epoch": 0.0151, "grad_norm": 0.7906949520111084, "grad_norm_var": 0.012678853894032234, "learning_rate": 0.0001, "loss": 80821.0938, "loss/crossentropy": 2.511326789855957, "loss/hidden": 0.380859375, "loss/logits": 0.09441511332988739, "loss/reg": 80820.6171875, "step": 604 }, { "epoch": 0.015125, "grad_norm": 0.981177568435669, "grad_norm_var": 0.012711037559269285, "learning_rate": 0.0001, "loss": 80546.1406, "loss/crossentropy": 2.412504196166992, "loss/hidden": 0.384765625, "loss/logits": 0.09406470507383347, "loss/reg": 80545.6640625, "step": 605 }, { "epoch": 0.01515, "grad_norm": 1.2353384494781494, "grad_norm_var": 0.014063660269151497, "learning_rate": 0.0001, "loss": 80270.0938, "loss/crossentropy": 2.577329635620117, "loss/hidden": 0.376953125, "loss/logits": 0.09719591587781906, "loss/reg": 80269.6171875, "step": 606 }, { "epoch": 0.015175, "grad_norm": 1.114750623703003, "grad_norm_var": 0.015581013741776673, "learning_rate": 0.0001, "loss": 79995.7422, "loss/crossentropy": 2.8609046936035156, "loss/hidden": 0.375, "loss/logits": 0.09995967149734497, "loss/reg": 79995.265625, "step": 607 }, { "epoch": 0.0152, "grad_norm": 0.9995872378349304, "grad_norm_var": 0.014489421651203335, "learning_rate": 0.0001, "loss": 79713.2734, "loss/crossentropy": 2.586801767349243, "loss/hidden": 0.39453125, "loss/logits": 0.10346552729606628, "loss/reg": 79712.7734375, "step": 608 }, { "epoch": 0.015225, "grad_norm": 0.9819250106811523, "grad_norm_var": 0.014387769896785689, "learning_rate": 0.0001, "loss": 79437.2422, "loss/crossentropy": 2.422132730484009, "loss/hidden": 0.380859375, "loss/logits": 0.09481838345527649, "loss/reg": 79436.765625, "step": 609 }, { "epoch": 0.01525, "grad_norm": 1.50832998752594, "grad_norm_var": 0.0296176065794699, "learning_rate": 0.0001, "loss": 79164.0312, "loss/crossentropy": 2.769239664077759, "loss/hidden": 0.423828125, "loss/logits": 0.0994962751865387, "loss/reg": 79163.5078125, "step": 610 }, { "epoch": 0.015275, "grad_norm": 0.6924430727958679, "grad_norm_var": 0.03614592535571727, "learning_rate": 0.0001, "loss": 78892.0547, "loss/crossentropy": 2.398139476776123, "loss/hidden": 0.365234375, "loss/logits": 0.09804777801036835, "loss/reg": 78891.59375, "step": 611 }, { "epoch": 0.0153, "grad_norm": 0.72183758020401, "grad_norm_var": 0.04105679580103398, "learning_rate": 0.0001, "loss": 78621.7109, "loss/crossentropy": 2.3019931316375732, "loss/hidden": 0.365234375, "loss/logits": 0.0825042724609375, "loss/reg": 78621.265625, "step": 612 }, { "epoch": 0.015325, "grad_norm": 1.4297187328338623, "grad_norm_var": 0.05293835745936608, "learning_rate": 0.0001, "loss": 78352.9922, "loss/crossentropy": 2.805884599685669, "loss/hidden": 0.388671875, "loss/logits": 0.09254398196935654, "loss/reg": 78352.5078125, "step": 613 }, { "epoch": 0.01535, "grad_norm": 1.1380010843276978, "grad_norm_var": 0.051401707871366976, "learning_rate": 0.0001, "loss": 78081.8672, "loss/crossentropy": 2.540323257446289, "loss/hidden": 0.380859375, "loss/logits": 0.11731082946062088, "loss/reg": 78081.3671875, "step": 614 }, { "epoch": 0.015375, "grad_norm": 0.9623900055885315, "grad_norm_var": 0.05081082889014064, "learning_rate": 0.0001, "loss": 77810.625, "loss/crossentropy": 2.7089173793792725, "loss/hidden": 0.404296875, "loss/logits": 0.08911550790071487, "loss/reg": 77810.1328125, "step": 615 }, { "epoch": 0.0154, "grad_norm": 0.7471880912780762, "grad_norm_var": 0.05518013577421012, "learning_rate": 0.0001, "loss": 77542.5547, "loss/crossentropy": 2.477466583251953, "loss/hidden": 0.3671875, "loss/logits": 0.09427085518836975, "loss/reg": 77542.09375, "step": 616 }, { "epoch": 0.015425, "grad_norm": 0.8823354840278625, "grad_norm_var": 0.056179618541439526, "learning_rate": 0.0001, "loss": 77277.7422, "loss/crossentropy": 2.588768243789673, "loss/hidden": 0.392578125, "loss/logits": 0.11282309889793396, "loss/reg": 77277.234375, "step": 617 }, { "epoch": 0.01545, "grad_norm": 0.8651421070098877, "grad_norm_var": 0.05675088538023313, "learning_rate": 0.0001, "loss": 77011.3125, "loss/crossentropy": 2.5131797790527344, "loss/hidden": 0.37109375, "loss/logits": 0.09642226248979568, "loss/reg": 77010.84375, "step": 618 }, { "epoch": 0.015475, "grad_norm": 0.8223963379859924, "grad_norm_var": 0.05833387600102255, "learning_rate": 0.0001, "loss": 76746.0625, "loss/crossentropy": 2.4448771476745605, "loss/hidden": 0.369140625, "loss/logits": 0.09524812549352646, "loss/reg": 76745.6015625, "step": 619 }, { "epoch": 0.0155, "grad_norm": 1.0320149660110474, "grad_norm_var": 0.055493867252299364, "learning_rate": 0.0001, "loss": 76476.75, "loss/crossentropy": 2.574714183807373, "loss/hidden": 0.390625, "loss/logits": 0.09856754541397095, "loss/reg": 76476.2578125, "step": 620 }, { "epoch": 0.015525, "grad_norm": 1.0086482763290405, "grad_norm_var": 0.05544586106039677, "learning_rate": 0.0001, "loss": 76212.7031, "loss/crossentropy": 2.639218807220459, "loss/hidden": 0.392578125, "loss/logits": 0.11160705238580704, "loss/reg": 76212.1953125, "step": 621 }, { "epoch": 0.01555, "grad_norm": 0.8823007345199585, "grad_norm_var": 0.05257570773830693, "learning_rate": 0.0001, "loss": 75951.4297, "loss/crossentropy": 2.27701735496521, "loss/hidden": 0.34375, "loss/logits": 0.09013539552688599, "loss/reg": 75950.9921875, "step": 622 }, { "epoch": 0.015575, "grad_norm": 1.14113187789917, "grad_norm_var": 0.05306922626909578, "learning_rate": 0.0001, "loss": 75692.5703, "loss/crossentropy": 2.671344518661499, "loss/hidden": 0.396484375, "loss/logits": 0.10680802166461945, "loss/reg": 75692.0703125, "step": 623 }, { "epoch": 0.0156, "grad_norm": 0.8108043670654297, "grad_norm_var": 0.05501662590291508, "learning_rate": 0.0001, "loss": 75429.5703, "loss/crossentropy": 2.757770538330078, "loss/hidden": 0.37890625, "loss/logits": 0.08836045861244202, "loss/reg": 75429.1015625, "step": 624 }, { "epoch": 0.015625, "grad_norm": 0.8286943435668945, "grad_norm_var": 0.05637659602012198, "learning_rate": 0.0001, "loss": 75170.6719, "loss/crossentropy": 2.3864805698394775, "loss/hidden": 0.373046875, "loss/logits": 0.09211406856775284, "loss/reg": 75170.203125, "step": 625 }, { "epoch": 0.01565, "grad_norm": 1.0009613037109375, "grad_norm_var": 0.03585083684432448, "learning_rate": 0.0001, "loss": 74912.9531, "loss/crossentropy": 2.4257941246032715, "loss/hidden": 0.392578125, "loss/logits": 0.10842062532901764, "loss/reg": 74912.453125, "step": 626 }, { "epoch": 0.015675, "grad_norm": 1.0110238790512085, "grad_norm_var": 0.031875046203807676, "learning_rate": 0.0001, "loss": 74653.6172, "loss/crossentropy": 2.8187096118927, "loss/hidden": 0.40625, "loss/logits": 0.0988621786236763, "loss/reg": 74653.109375, "step": 627 }, { "epoch": 0.0157, "grad_norm": 1.074616551399231, "grad_norm_var": 0.028672561024481746, "learning_rate": 0.0001, "loss": 74396.4531, "loss/crossentropy": 2.6383824348449707, "loss/hidden": 0.380859375, "loss/logits": 0.09884043782949448, "loss/reg": 74395.9765625, "step": 628 }, { "epoch": 0.015725, "grad_norm": 1.1295496225357056, "grad_norm_var": 0.016198375136296392, "learning_rate": 0.0001, "loss": 74140.1016, "loss/crossentropy": 2.5384268760681152, "loss/hidden": 0.384765625, "loss/logits": 0.08849247545003891, "loss/reg": 74139.625, "step": 629 }, { "epoch": 0.01575, "grad_norm": 1.1694039106369019, "grad_norm_var": 0.017011273804826255, "learning_rate": 0.0001, "loss": 73885.1016, "loss/crossentropy": 2.6295008659362793, "loss/hidden": 0.373046875, "loss/logits": 0.0844004675745964, "loss/reg": 73884.640625, "step": 630 }, { "epoch": 0.015775, "grad_norm": 1.0288974046707153, "grad_norm_var": 0.017304152285151454, "learning_rate": 0.0001, "loss": 73633.0469, "loss/crossentropy": 2.3022804260253906, "loss/hidden": 0.3671875, "loss/logits": 0.08712589740753174, "loss/reg": 73632.59375, "step": 631 }, { "epoch": 0.0158, "grad_norm": 1.0885484218597412, "grad_norm_var": 0.014687348514027516, "learning_rate": 0.0001, "loss": 73382.4141, "loss/crossentropy": 2.400463104248047, "loss/hidden": 0.380859375, "loss/logits": 0.09456802904605865, "loss/reg": 73381.9375, "step": 632 }, { "epoch": 0.015825, "grad_norm": 0.6965525150299072, "grad_norm_var": 0.019413162707278872, "learning_rate": 0.0001, "loss": 73129.8203, "loss/crossentropy": 2.318265914916992, "loss/hidden": 0.34375, "loss/logits": 0.08227289468050003, "loss/reg": 73129.390625, "step": 633 }, { "epoch": 0.01585, "grad_norm": 0.971870481967926, "grad_norm_var": 0.0185700529308282, "learning_rate": 0.0001, "loss": 72881.6953, "loss/crossentropy": 2.730379104614258, "loss/hidden": 0.380859375, "loss/logits": 0.10445239394903183, "loss/reg": 72881.2109375, "step": 634 }, { "epoch": 0.015875, "grad_norm": 0.8180504441261292, "grad_norm_var": 0.018663187891327895, "learning_rate": 0.0001, "loss": 72633.0391, "loss/crossentropy": 2.577846050262451, "loss/hidden": 0.376953125, "loss/logits": 0.08856044709682465, "loss/reg": 72632.5703125, "step": 635 }, { "epoch": 0.0159, "grad_norm": 1.065203070640564, "grad_norm_var": 0.01895858446336399, "learning_rate": 0.0001, "loss": 72386.2109, "loss/crossentropy": 2.308788537979126, "loss/hidden": 0.396484375, "loss/logits": 0.09516717493534088, "loss/reg": 72385.71875, "step": 636 }, { "epoch": 0.015925, "grad_norm": 1.204892873764038, "grad_norm_var": 0.022039542379022472, "learning_rate": 0.0001, "loss": 72130.0, "loss/crossentropy": 3.0494790077209473, "loss/hidden": 0.38671875, "loss/logits": 0.08954030275344849, "loss/reg": 72129.5234375, "step": 637 }, { "epoch": 0.01595, "grad_norm": 0.8242658972740173, "grad_norm_var": 0.02312331942637121, "learning_rate": 0.0001, "loss": 71880.1094, "loss/crossentropy": 2.5868396759033203, "loss/hidden": 0.37109375, "loss/logits": 0.08857108652591705, "loss/reg": 71879.6484375, "step": 638 }, { "epoch": 0.015975, "grad_norm": 1.5980547666549683, "grad_norm_var": 0.04528622991240388, "learning_rate": 0.0001, "loss": 71631.4375, "loss/crossentropy": 2.7629361152648926, "loss/hidden": 0.380859375, "loss/logits": 0.09402740001678467, "loss/reg": 71630.9609375, "step": 639 }, { "epoch": 0.016, "grad_norm": 1.148252248764038, "grad_norm_var": 0.04298691384647809, "learning_rate": 0.0001, "loss": 71380.9609, "loss/crossentropy": 2.600595474243164, "loss/hidden": 0.408203125, "loss/logits": 0.10225421190261841, "loss/reg": 71380.453125, "step": 640 }, { "epoch": 0.016025, "grad_norm": 0.9008048176765442, "grad_norm_var": 0.04126894208351336, "learning_rate": 0.0001, "loss": 71136.0469, "loss/crossentropy": 2.4793899059295654, "loss/hidden": 0.3828125, "loss/logits": 0.09964653849601746, "loss/reg": 71135.5625, "step": 641 }, { "epoch": 0.01605, "grad_norm": 1.5230423212051392, "grad_norm_var": 0.05519127779878321, "learning_rate": 0.0001, "loss": 70889.0625, "loss/crossentropy": 2.731199264526367, "loss/hidden": 0.42578125, "loss/logits": 0.11591259390115738, "loss/reg": 70888.5234375, "step": 642 }, { "epoch": 0.016075, "grad_norm": 0.9102566838264465, "grad_norm_var": 0.05672999381592079, "learning_rate": 0.0001, "loss": 70621.5625, "loss/crossentropy": 2.760890245437622, "loss/hidden": 0.396484375, "loss/logits": 0.10278172791004181, "loss/reg": 70621.0625, "step": 643 }, { "epoch": 0.0161, "grad_norm": 1.081149935722351, "grad_norm_var": 0.056734926693826286, "learning_rate": 0.0001, "loss": 70373.5, "loss/crossentropy": 2.400461196899414, "loss/hidden": 0.40625, "loss/logits": 0.1075059324502945, "loss/reg": 70372.984375, "step": 644 }, { "epoch": 0.016125, "grad_norm": 1.0392025709152222, "grad_norm_var": 0.05655694627189723, "learning_rate": 0.0001, "loss": 70126.2656, "loss/crossentropy": 2.6415491104125977, "loss/hidden": 0.380859375, "loss/logits": 0.09059594571590424, "loss/reg": 70125.796875, "step": 645 }, { "epoch": 0.01615, "grad_norm": 0.9613660573959351, "grad_norm_var": 0.05641525462659834, "learning_rate": 0.0001, "loss": 69883.6328, "loss/crossentropy": 2.573458194732666, "loss/hidden": 0.388671875, "loss/logits": 0.08760470896959305, "loss/reg": 69883.15625, "step": 646 }, { "epoch": 0.016175, "grad_norm": 0.9740030169487, "grad_norm_var": 0.056785681929638974, "learning_rate": 0.0001, "loss": 69640.0156, "loss/crossentropy": 2.484391450881958, "loss/hidden": 0.373046875, "loss/logits": 0.09017197042703629, "loss/reg": 69639.5546875, "step": 647 }, { "epoch": 0.0162, "grad_norm": 0.9880450963973999, "grad_norm_var": 0.0569050432928762, "learning_rate": 0.0001, "loss": 69395.2109, "loss/crossentropy": 2.5021278858184814, "loss/hidden": 0.375, "loss/logits": 0.08895129710435867, "loss/reg": 69394.75, "step": 648 }, { "epoch": 0.016225, "grad_norm": 1.1739388704299927, "grad_norm_var": 0.0490290639565254, "learning_rate": 0.0001, "loss": 69152.9688, "loss/crossentropy": 2.2598655223846436, "loss/hidden": 0.396484375, "loss/logits": 0.09144014120101929, "loss/reg": 69152.484375, "step": 649 }, { "epoch": 0.01625, "grad_norm": 0.9586436748504639, "grad_norm_var": 0.04921993477394175, "learning_rate": 0.0001, "loss": 68904.6719, "loss/crossentropy": 2.626762866973877, "loss/hidden": 0.38671875, "loss/logits": 0.0878828912973404, "loss/reg": 68904.1953125, "step": 650 }, { "epoch": 0.016275, "grad_norm": 1.749466061592102, "grad_norm_var": 0.07176990963569659, "learning_rate": 0.0001, "loss": 68660.1797, "loss/crossentropy": 2.770278215408325, "loss/hidden": 0.408203125, "loss/logits": 0.09789373725652695, "loss/reg": 68659.671875, "step": 651 }, { "epoch": 0.0163, "grad_norm": 1.0988394021987915, "grad_norm_var": 0.07154424731421181, "learning_rate": 0.0001, "loss": 68417.2422, "loss/crossentropy": 2.7384262084960938, "loss/hidden": 0.404296875, "loss/logits": 0.10276539623737335, "loss/reg": 68416.734375, "step": 652 }, { "epoch": 0.016325, "grad_norm": 1.6708260774612427, "grad_norm_var": 0.08955474275790679, "learning_rate": 0.0001, "loss": 68176.5625, "loss/crossentropy": 2.5017573833465576, "loss/hidden": 0.4140625, "loss/logits": 0.10067269206047058, "loss/reg": 68176.046875, "step": 653 }, { "epoch": 0.01635, "grad_norm": 1.069149136543274, "grad_norm_var": 0.08225869387716167, "learning_rate": 0.0001, "loss": 67935.7891, "loss/crossentropy": 2.6416540145874023, "loss/hidden": 0.390625, "loss/logits": 0.10551495850086212, "loss/reg": 67935.2890625, "step": 654 }, { "epoch": 0.016375, "grad_norm": 0.8851730823516846, "grad_norm_var": 0.07407705239805969, "learning_rate": 0.0001, "loss": 67690.7422, "loss/crossentropy": 2.3436880111694336, "loss/hidden": 0.373046875, "loss/logits": 0.09293360263109207, "loss/reg": 67690.2734375, "step": 655 }, { "epoch": 0.0164, "grad_norm": 1.2111107110977173, "grad_norm_var": 0.07444965399694643, "learning_rate": 0.0001, "loss": 67449.0312, "loss/crossentropy": 2.6042330265045166, "loss/hidden": 0.40234375, "loss/logits": 0.09541107714176178, "loss/reg": 67448.53125, "step": 656 }, { "epoch": 0.016425, "grad_norm": 0.9232460260391235, "grad_norm_var": 0.07377383114040156, "learning_rate": 0.0001, "loss": 67209.6172, "loss/crossentropy": 2.577650785446167, "loss/hidden": 0.37890625, "loss/logits": 0.10014642030000687, "loss/reg": 67209.140625, "step": 657 }, { "epoch": 0.01645, "grad_norm": 1.239464282989502, "grad_norm_var": 0.06426360887058274, "learning_rate": 0.0001, "loss": 66972.7734, "loss/crossentropy": 2.5427756309509277, "loss/hidden": 0.3984375, "loss/logits": 0.0848100334405899, "loss/reg": 66972.2890625, "step": 658 }, { "epoch": 0.016475, "grad_norm": 1.168949842453003, "grad_norm_var": 0.0611817648148463, "learning_rate": 0.0001, "loss": 66727.8984, "loss/crossentropy": 2.9303178787231445, "loss/hidden": 0.40625, "loss/logits": 0.11067648977041245, "loss/reg": 66727.3828125, "step": 659 }, { "epoch": 0.0165, "grad_norm": 1.0272209644317627, "grad_norm_var": 0.061765385112204155, "learning_rate": 0.0001, "loss": 66490.5938, "loss/crossentropy": 2.40498423576355, "loss/hidden": 0.408203125, "loss/logits": 0.09394747018814087, "loss/reg": 66490.09375, "step": 660 }, { "epoch": 0.016525, "grad_norm": 1.2891979217529297, "grad_norm_var": 0.06252279059305163, "learning_rate": 0.0001, "loss": 66257.8594, "loss/crossentropy": 2.955780029296875, "loss/hidden": 0.416015625, "loss/logits": 0.10309955477714539, "loss/reg": 66257.34375, "step": 661 }, { "epoch": 0.01655, "grad_norm": 1.0556660890579224, "grad_norm_var": 0.06071574024949246, "learning_rate": 0.0001, "loss": 66024.3906, "loss/crossentropy": 2.543252944946289, "loss/hidden": 0.3828125, "loss/logits": 0.10051153600215912, "loss/reg": 66023.90625, "step": 662 }, { "epoch": 0.016575, "grad_norm": 1.3408629894256592, "grad_norm_var": 0.06026498492915847, "learning_rate": 0.0001, "loss": 65792.9219, "loss/crossentropy": 2.4806578159332275, "loss/hidden": 0.390625, "loss/logits": 0.09797219932079315, "loss/reg": 65792.4296875, "step": 663 }, { "epoch": 0.0166, "grad_norm": 1.8128626346588135, "grad_norm_var": 0.08188244117617766, "learning_rate": 0.0001, "loss": 65555.9375, "loss/crossentropy": 2.7088303565979004, "loss/hidden": 0.4140625, "loss/logits": 0.10116955637931824, "loss/reg": 65555.421875, "step": 664 }, { "epoch": 0.016625, "grad_norm": 1.0112638473510742, "grad_norm_var": 0.08474505865472673, "learning_rate": 0.0001, "loss": 65322.957, "loss/crossentropy": 2.596529722213745, "loss/hidden": 0.384765625, "loss/logits": 0.09807929396629333, "loss/reg": 65322.47265625, "step": 665 }, { "epoch": 0.01665, "grad_norm": 1.0874592065811157, "grad_norm_var": 0.08130189736198545, "learning_rate": 0.0001, "loss": 65090.7266, "loss/crossentropy": 2.3783011436462402, "loss/hidden": 0.408203125, "loss/logits": 0.094398632645607, "loss/reg": 65090.22265625, "step": 666 }, { "epoch": 0.016675, "grad_norm": 0.8276606798171997, "grad_norm_var": 0.07026206154558637, "learning_rate": 0.0001, "loss": 64858.5859, "loss/crossentropy": 2.398869037628174, "loss/hidden": 0.376953125, "loss/logits": 0.09084640443325043, "loss/reg": 64858.1171875, "step": 667 }, { "epoch": 0.0167, "grad_norm": 0.8844419121742249, "grad_norm_var": 0.07516730394582714, "learning_rate": 0.0001, "loss": 64631.7969, "loss/crossentropy": 2.5455760955810547, "loss/hidden": 0.3828125, "loss/logits": 0.09470289200544357, "loss/reg": 64631.3203125, "step": 668 }, { "epoch": 0.016725, "grad_norm": 0.8544840812683105, "grad_norm_var": 0.060839842963105406, "learning_rate": 0.0001, "loss": 64403.6758, "loss/crossentropy": 2.4772214889526367, "loss/hidden": 0.390625, "loss/logits": 0.09808434545993805, "loss/reg": 64403.1875, "step": 669 }, { "epoch": 0.01675, "grad_norm": 1.055088758468628, "grad_norm_var": 0.060920371442028304, "learning_rate": 0.0001, "loss": 64176.7617, "loss/crossentropy": 2.807988405227661, "loss/hidden": 0.38671875, "loss/logits": 0.08935073018074036, "loss/reg": 64176.28515625, "step": 670 }, { "epoch": 0.016775, "grad_norm": 0.8417630791664124, "grad_norm_var": 0.06230839136668986, "learning_rate": 0.0001, "loss": 63948.7422, "loss/crossentropy": 2.2669050693511963, "loss/hidden": 0.3828125, "loss/logits": 0.09268943965435028, "loss/reg": 63948.265625, "step": 671 }, { "epoch": 0.0168, "grad_norm": 0.9983002543449402, "grad_norm_var": 0.062040693678979374, "learning_rate": 0.0001, "loss": 63724.1562, "loss/crossentropy": 2.5512285232543945, "loss/hidden": 0.375, "loss/logits": 0.09171494841575623, "loss/reg": 63723.69140625, "step": 672 }, { "epoch": 0.016825, "grad_norm": 1.0449855327606201, "grad_norm_var": 0.06028262317672586, "learning_rate": 0.0001, "loss": 63500.082, "loss/crossentropy": 2.4477298259735107, "loss/hidden": 0.396484375, "loss/logits": 0.1075860857963562, "loss/reg": 63499.578125, "step": 673 }, { "epoch": 0.01685, "grad_norm": 0.907045841217041, "grad_norm_var": 0.06084048188357287, "learning_rate": 0.0001, "loss": 63268.3047, "loss/crossentropy": 2.5063469409942627, "loss/hidden": 0.375, "loss/logits": 0.09125159680843353, "loss/reg": 63267.83984375, "step": 674 }, { "epoch": 0.016875, "grad_norm": 1.0878510475158691, "grad_norm_var": 0.06024055160010679, "learning_rate": 0.0001, "loss": 63049.1797, "loss/crossentropy": 2.737177610397339, "loss/hidden": 0.388671875, "loss/logits": 0.09512165188789368, "loss/reg": 63048.6953125, "step": 675 }, { "epoch": 0.0169, "grad_norm": 1.0298672914505005, "grad_norm_var": 0.06022575925050888, "learning_rate": 0.0001, "loss": 62829.1172, "loss/crossentropy": 2.5746004581451416, "loss/hidden": 0.404296875, "loss/logits": 0.10296541452407837, "loss/reg": 62828.609375, "step": 676 }, { "epoch": 0.016925, "grad_norm": 1.0156315565109253, "grad_norm_var": 0.05692787604339444, "learning_rate": 0.0001, "loss": 62606.5117, "loss/crossentropy": 2.4797537326812744, "loss/hidden": 0.384765625, "loss/logits": 0.10525722801685333, "loss/reg": 62606.0234375, "step": 677 }, { "epoch": 0.01695, "grad_norm": 1.086360216140747, "grad_norm_var": 0.05699581968737051, "learning_rate": 0.0001, "loss": 62389.1406, "loss/crossentropy": 2.767706871032715, "loss/hidden": 0.392578125, "loss/logits": 0.09870638698339462, "loss/reg": 62388.6484375, "step": 678 }, { "epoch": 0.016975, "grad_norm": 0.9417318105697632, "grad_norm_var": 0.05175923504988777, "learning_rate": 0.0001, "loss": 62164.2461, "loss/crossentropy": 2.4770090579986572, "loss/hidden": 0.39453125, "loss/logits": 0.0980028361082077, "loss/reg": 62163.75390625, "step": 679 }, { "epoch": 0.017, "grad_norm": 1.042153000831604, "grad_norm_var": 0.008479507941950818, "learning_rate": 0.0001, "loss": 61943.375, "loss/crossentropy": 2.5883591175079346, "loss/hidden": 0.40234375, "loss/logits": 0.11304843425750732, "loss/reg": 61942.859375, "step": 680 }, { "epoch": 0.017025, "grad_norm": 1.0394786596298218, "grad_norm_var": 0.008638391318302381, "learning_rate": 0.0001, "loss": 61725.0859, "loss/crossentropy": 2.566668748855591, "loss/hidden": 0.392578125, "loss/logits": 0.10709897428750992, "loss/reg": 61724.5859375, "step": 681 }, { "epoch": 0.01705, "grad_norm": 0.9527825713157654, "grad_norm_var": 0.007914538507960363, "learning_rate": 0.0001, "loss": 61510.9727, "loss/crossentropy": 2.6995317935943604, "loss/hidden": 0.392578125, "loss/logits": 0.09625931084156036, "loss/reg": 61510.484375, "step": 682 }, { "epoch": 0.017075, "grad_norm": 0.9171804785728455, "grad_norm_var": 0.006649581285260438, "learning_rate": 0.0001, "loss": 61296.6094, "loss/crossentropy": 2.4217958450317383, "loss/hidden": 0.396484375, "loss/logits": 0.08172821253538132, "loss/reg": 61296.1328125, "step": 683 }, { "epoch": 0.0171, "grad_norm": 0.927759051322937, "grad_norm_var": 0.006208036346572439, "learning_rate": 0.0001, "loss": 61084.2578, "loss/crossentropy": 2.4237263202667236, "loss/hidden": 0.373046875, "loss/logits": 0.08073138445615768, "loss/reg": 61083.8046875, "step": 684 }, { "epoch": 0.017125, "grad_norm": 0.9073169827461243, "grad_norm_var": 0.005470809927099637, "learning_rate": 0.0001, "loss": 60875.1406, "loss/crossentropy": 2.6566085815429688, "loss/hidden": 0.3828125, "loss/logits": 0.09275166690349579, "loss/reg": 60874.6640625, "step": 685 }, { "epoch": 0.01715, "grad_norm": 1.0408992767333984, "grad_norm_var": 0.0053549642915791265, "learning_rate": 0.0001, "loss": 60647.8398, "loss/crossentropy": 2.3780102729797363, "loss/hidden": 0.396484375, "loss/logits": 0.09495106339454651, "loss/reg": 60647.34765625, "step": 686 }, { "epoch": 0.017175, "grad_norm": 0.8319820165634155, "grad_norm_var": 0.005549465233255546, "learning_rate": 0.0001, "loss": 60435.8242, "loss/crossentropy": 2.619631767272949, "loss/hidden": 0.3828125, "loss/logits": 0.08472824096679688, "loss/reg": 60435.35546875, "step": 687 }, { "epoch": 0.0172, "grad_norm": 0.7369972467422485, "grad_norm_var": 0.009378194651902054, "learning_rate": 0.0001, "loss": 60226.2422, "loss/crossentropy": 2.189018487930298, "loss/hidden": 0.365234375, "loss/logits": 0.08153541386127472, "loss/reg": 60225.796875, "step": 688 }, { "epoch": 0.017225, "grad_norm": 1.2337210178375244, "grad_norm_var": 0.013507195588811511, "learning_rate": 0.0001, "loss": 60016.4414, "loss/crossentropy": 2.679335117340088, "loss/hidden": 0.416015625, "loss/logits": 0.1080087423324585, "loss/reg": 60015.91796875, "step": 689 }, { "epoch": 0.01725, "grad_norm": 1.1025989055633545, "grad_norm_var": 0.013964501931947012, "learning_rate": 0.0001, "loss": 59804.4219, "loss/crossentropy": 2.2785794734954834, "loss/hidden": 0.400390625, "loss/logits": 0.09753485023975372, "loss/reg": 59803.92578125, "step": 690 }, { "epoch": 0.017275, "grad_norm": 1.3840587139129639, "grad_norm_var": 0.023178690900109666, "learning_rate": 0.0001, "loss": 59593.8125, "loss/crossentropy": 2.383326530456543, "loss/hidden": 0.419921875, "loss/logits": 0.12039785087108612, "loss/reg": 59593.2734375, "step": 691 }, { "epoch": 0.0173, "grad_norm": 1.042061686515808, "grad_norm_var": 0.023217186149151425, "learning_rate": 0.0001, "loss": 59383.8281, "loss/crossentropy": 2.518843173980713, "loss/hidden": 0.392578125, "loss/logits": 0.09896533191204071, "loss/reg": 59383.3359375, "step": 692 }, { "epoch": 0.017325, "grad_norm": 0.9572771191596985, "grad_norm_var": 0.023406967698831653, "learning_rate": 0.0001, "loss": 59175.8281, "loss/crossentropy": 2.5718486309051514, "loss/hidden": 0.384765625, "loss/logits": 0.09524203091859818, "loss/reg": 59175.34765625, "step": 693 }, { "epoch": 0.01735, "grad_norm": 0.9643810987472534, "grad_norm_var": 0.023079086485804843, "learning_rate": 0.0001, "loss": 58969.3438, "loss/crossentropy": 2.4464197158813477, "loss/hidden": 0.375, "loss/logits": 0.08171060681343079, "loss/reg": 58968.88671875, "step": 694 }, { "epoch": 0.017375, "grad_norm": 0.7643433809280396, "grad_norm_var": 0.02645698184280339, "learning_rate": 0.0001, "loss": 58760.3047, "loss/crossentropy": 2.5445268154144287, "loss/hidden": 0.37109375, "loss/logits": 0.08663898706436157, "loss/reg": 58759.84765625, "step": 695 }, { "epoch": 0.0174, "grad_norm": 1.0930417776107788, "grad_norm_var": 0.026970586420081825, "learning_rate": 0.0001, "loss": 58553.8359, "loss/crossentropy": 2.5427513122558594, "loss/hidden": 0.400390625, "loss/logits": 0.08456844091415405, "loss/reg": 58553.3515625, "step": 696 }, { "epoch": 0.017425, "grad_norm": 1.2481635808944702, "grad_norm_var": 0.030971972800994032, "learning_rate": 0.0001, "loss": 58347.457, "loss/crossentropy": 3.4390416145324707, "loss/hidden": 0.423828125, "loss/logits": 0.10264578461647034, "loss/reg": 58346.9296875, "step": 697 }, { "epoch": 0.01745, "grad_norm": 1.5968362092971802, "grad_norm_var": 0.052281338212470024, "learning_rate": 0.0001, "loss": 58131.2969, "loss/crossentropy": 2.9014079570770264, "loss/hidden": 0.470703125, "loss/logits": 0.1580468714237213, "loss/reg": 58130.66796875, "step": 698 }, { "epoch": 0.017475, "grad_norm": 1.3415122032165527, "grad_norm_var": 0.05620201146869827, "learning_rate": 0.0001, "loss": 57911.957, "loss/crossentropy": 2.282113552093506, "loss/hidden": 0.435546875, "loss/logits": 0.11629819869995117, "loss/reg": 57911.40625, "step": 699 }, { "epoch": 0.0175, "grad_norm": 0.9442893266677856, "grad_norm_var": 0.055898291300073545, "learning_rate": 0.0001, "loss": 57706.0586, "loss/crossentropy": 2.5043060779571533, "loss/hidden": 0.37890625, "loss/logits": 0.08389831334352493, "loss/reg": 57705.59765625, "step": 700 }, { "epoch": 0.017525, "grad_norm": 0.8646631240844727, "grad_norm_var": 0.0569619053459128, "learning_rate": 0.0001, "loss": 57504.4492, "loss/crossentropy": 2.701007127761841, "loss/hidden": 0.396484375, "loss/logits": 0.09037187695503235, "loss/reg": 57503.9609375, "step": 701 }, { "epoch": 0.01755, "grad_norm": 0.956434965133667, "grad_norm_var": 0.057754406244796865, "learning_rate": 0.0001, "loss": 57301.9492, "loss/crossentropy": 2.4940638542175293, "loss/hidden": 0.39453125, "loss/logits": 0.09759010374546051, "loss/reg": 57301.45703125, "step": 702 }, { "epoch": 0.017575, "grad_norm": 0.9836230278015137, "grad_norm_var": 0.054451990386576776, "learning_rate": 0.0001, "loss": 57096.0312, "loss/crossentropy": 2.60841703414917, "loss/hidden": 0.41015625, "loss/logits": 0.109431691467762, "loss/reg": 57095.51171875, "step": 703 }, { "epoch": 0.0176, "grad_norm": 0.8200333714485168, "grad_norm_var": 0.05113104603631058, "learning_rate": 0.0001, "loss": 56892.293, "loss/crossentropy": 2.5531857013702393, "loss/hidden": 0.404296875, "loss/logits": 0.08870550990104675, "loss/reg": 56891.80078125, "step": 704 }, { "epoch": 0.017625, "grad_norm": 0.8858484625816345, "grad_norm_var": 0.05161385658113162, "learning_rate": 0.0001, "loss": 56688.5117, "loss/crossentropy": 2.4622130393981934, "loss/hidden": 0.3984375, "loss/logits": 0.09578855335712433, "loss/reg": 56688.015625, "step": 705 }, { "epoch": 0.01765, "grad_norm": 0.9303566217422485, "grad_norm_var": 0.05247420907563337, "learning_rate": 0.0001, "loss": 56485.6719, "loss/crossentropy": 2.9453697204589844, "loss/hidden": 0.3984375, "loss/logits": 0.10039514303207397, "loss/reg": 56485.171875, "step": 706 }, { "epoch": 0.017675, "grad_norm": 1.0896490812301636, "grad_norm_var": 0.04472156313393646, "learning_rate": 0.0001, "loss": 56285.3008, "loss/crossentropy": 2.870180130004883, "loss/hidden": 0.3984375, "loss/logits": 0.08954556286334991, "loss/reg": 56284.8125, "step": 707 }, { "epoch": 0.0177, "grad_norm": 0.925428032875061, "grad_norm_var": 0.04538664754722578, "learning_rate": 0.0001, "loss": 56084.5703, "loss/crossentropy": 2.3981821537017822, "loss/hidden": 0.384765625, "loss/logits": 0.09831464290618896, "loss/reg": 56084.0859375, "step": 708 }, { "epoch": 0.017725, "grad_norm": 0.9551652669906616, "grad_norm_var": 0.04540539527495232, "learning_rate": 0.0001, "loss": 55886.6172, "loss/crossentropy": 2.4851620197296143, "loss/hidden": 0.408203125, "loss/logits": 0.09828799217939377, "loss/reg": 55886.109375, "step": 709 }, { "epoch": 0.01775, "grad_norm": 1.3149915933609009, "grad_norm_var": 0.050360417836195145, "learning_rate": 0.0001, "loss": 55688.0508, "loss/crossentropy": 2.6232731342315674, "loss/hidden": 0.423828125, "loss/logits": 0.10853007435798645, "loss/reg": 55687.51953125, "step": 710 }, { "epoch": 0.017775, "grad_norm": 1.0394874811172485, "grad_norm_var": 0.04480868587850016, "learning_rate": 0.0001, "loss": 55489.6914, "loss/crossentropy": 2.5964436531066895, "loss/hidden": 0.39453125, "loss/logits": 0.10230209678411484, "loss/reg": 55489.1953125, "step": 711 }, { "epoch": 0.0178, "grad_norm": 1.346766710281372, "grad_norm_var": 0.04988758520215928, "learning_rate": 0.0001, "loss": 55292.3594, "loss/crossentropy": 2.8174843788146973, "loss/hidden": 0.40625, "loss/logits": 0.10428360104560852, "loss/reg": 55291.84765625, "step": 712 }, { "epoch": 0.017825, "grad_norm": 1.1590180397033691, "grad_norm_var": 0.04835816219355919, "learning_rate": 0.0001, "loss": 55098.832, "loss/crossentropy": 2.6847729682922363, "loss/hidden": 0.388671875, "loss/logits": 0.09854494780302048, "loss/reg": 55098.34375, "step": 713 }, { "epoch": 0.01785, "grad_norm": 0.8711786866188049, "grad_norm_var": 0.03050188085823664, "learning_rate": 0.0001, "loss": 54903.668, "loss/crossentropy": 2.5830821990966797, "loss/hidden": 0.388671875, "loss/logits": 0.09112709015607834, "loss/reg": 54903.1875, "step": 714 }, { "epoch": 0.017875, "grad_norm": 1.0323970317840576, "grad_norm_var": 0.02350200522760937, "learning_rate": 0.0001, "loss": 54711.5156, "loss/crossentropy": 2.497689962387085, "loss/hidden": 0.390625, "loss/logits": 0.10076910257339478, "loss/reg": 54711.0234375, "step": 715 }, { "epoch": 0.0179, "grad_norm": 0.903690755367279, "grad_norm_var": 0.02394696248407584, "learning_rate": 0.0001, "loss": 54519.4961, "loss/crossentropy": 2.614609479904175, "loss/hidden": 0.37890625, "loss/logits": 0.08881643414497375, "loss/reg": 54519.02734375, "step": 716 }, { "epoch": 0.017925, "grad_norm": 1.0168510675430298, "grad_norm_var": 0.02254846601204997, "learning_rate": 0.0001, "loss": 54330.6914, "loss/crossentropy": 2.3349671363830566, "loss/hidden": 0.384765625, "loss/logits": 0.08678834140300751, "loss/reg": 54330.21875, "step": 717 }, { "epoch": 0.01795, "grad_norm": 0.8937983512878418, "grad_norm_var": 0.023278044427324042, "learning_rate": 0.0001, "loss": 54141.4766, "loss/crossentropy": 2.5494956970214844, "loss/hidden": 0.39453125, "loss/logits": 0.09560904651880264, "loss/reg": 54140.98828125, "step": 718 }, { "epoch": 0.017975, "grad_norm": 0.8298357725143433, "grad_norm_var": 0.025307676785486895, "learning_rate": 0.0001, "loss": 53953.5859, "loss/crossentropy": 2.6559760570526123, "loss/hidden": 0.3828125, "loss/logits": 0.09700687229633331, "loss/reg": 53953.10546875, "step": 719 }, { "epoch": 0.018, "grad_norm": 0.8975862264633179, "grad_norm_var": 0.023813287575312345, "learning_rate": 0.0001, "loss": 53766.7773, "loss/crossentropy": 2.5280327796936035, "loss/hidden": 0.39453125, "loss/logits": 0.10120564699172974, "loss/reg": 53766.28125, "step": 720 }, { "epoch": 0.018025, "grad_norm": 0.9913856983184814, "grad_norm_var": 0.02282216583665496, "learning_rate": 0.0001, "loss": 53579.6211, "loss/crossentropy": 2.3738551139831543, "loss/hidden": 0.40625, "loss/logits": 0.10090280324220657, "loss/reg": 53579.11328125, "step": 721 }, { "epoch": 0.01805, "grad_norm": 0.872941255569458, "grad_norm_var": 0.023655882749711074, "learning_rate": 0.0001, "loss": 53392.3008, "loss/crossentropy": 2.528557777404785, "loss/hidden": 0.3828125, "loss/logits": 0.10077151656150818, "loss/reg": 53391.81640625, "step": 722 }, { "epoch": 0.018075, "grad_norm": 1.0608396530151367, "grad_norm_var": 0.02339704359274458, "learning_rate": 0.0001, "loss": 53203.9492, "loss/crossentropy": 2.8958404064178467, "loss/hidden": 0.388671875, "loss/logits": 0.09159500896930695, "loss/reg": 53203.46875, "step": 723 }, { "epoch": 0.0181, "grad_norm": 0.9737719893455505, "grad_norm_var": 0.02301757035596806, "learning_rate": 0.0001, "loss": 53018.8477, "loss/crossentropy": 2.9025018215179443, "loss/hidden": 0.396484375, "loss/logits": 0.09637045860290527, "loss/reg": 53018.35546875, "step": 724 }, { "epoch": 0.018125, "grad_norm": 1.0595719814300537, "grad_norm_var": 0.022935775569963925, "learning_rate": 0.0001, "loss": 52824.4766, "loss/crossentropy": 2.671370029449463, "loss/hidden": 0.40234375, "loss/logits": 0.09349768608808517, "loss/reg": 52823.98046875, "step": 725 }, { "epoch": 0.01815, "grad_norm": 1.1013160943984985, "grad_norm_var": 0.017285506348992032, "learning_rate": 0.0001, "loss": 52636.5117, "loss/crossentropy": 2.4408681392669678, "loss/hidden": 0.3984375, "loss/logits": 0.09394092112779617, "loss/reg": 52636.01953125, "step": 726 }, { "epoch": 0.018175, "grad_norm": 1.0487664937973022, "grad_norm_var": 0.017335841551676372, "learning_rate": 0.0001, "loss": 52449.5781, "loss/crossentropy": 2.4244089126586914, "loss/hidden": 0.404296875, "loss/logits": 0.09841768443584442, "loss/reg": 52449.07421875, "step": 727 }, { "epoch": 0.0182, "grad_norm": 1.3364503383636475, "grad_norm_var": 0.016870643779337263, "learning_rate": 0.0001, "loss": 52264.5, "loss/crossentropy": 2.7356204986572266, "loss/hidden": 0.400390625, "loss/logits": 0.094476617872715, "loss/reg": 52264.00390625, "step": 728 }, { "epoch": 0.018225, "grad_norm": 1.2551052570343018, "grad_norm_var": 0.01944541553566581, "learning_rate": 0.0001, "loss": 52080.7422, "loss/crossentropy": 2.345259666442871, "loss/hidden": 0.404296875, "loss/logits": 0.10854319483041763, "loss/reg": 52080.23046875, "step": 729 }, { "epoch": 0.01825, "grad_norm": 0.994271457195282, "grad_norm_var": 0.01812890564488113, "learning_rate": 0.0001, "loss": 51897.2383, "loss/crossentropy": 2.5534796714782715, "loss/hidden": 0.400390625, "loss/logits": 0.09998567402362823, "loss/reg": 51896.73828125, "step": 730 }, { "epoch": 0.018275, "grad_norm": 1.036136507987976, "grad_norm_var": 0.018137563129258405, "learning_rate": 0.0001, "loss": 51714.25, "loss/crossentropy": 2.641772985458374, "loss/hidden": 0.380859375, "loss/logits": 0.09617019444704056, "loss/reg": 51713.7734375, "step": 731 }, { "epoch": 0.0183, "grad_norm": 0.9567418098449707, "grad_norm_var": 0.01751183305709052, "learning_rate": 0.0001, "loss": 51531.0078, "loss/crossentropy": 2.4739558696746826, "loss/hidden": 0.380859375, "loss/logits": 0.08270092308521271, "loss/reg": 51530.54296875, "step": 732 }, { "epoch": 0.018325, "grad_norm": 0.8940505981445312, "grad_norm_var": 0.018511384254257735, "learning_rate": 0.0001, "loss": 51347.918, "loss/crossentropy": 2.1752779483795166, "loss/hidden": 0.388671875, "loss/logits": 0.09863855689764023, "loss/reg": 51347.4296875, "step": 733 }, { "epoch": 0.01835, "grad_norm": 1.2298671007156372, "grad_norm_var": 0.020244154044657077, "learning_rate": 0.0001, "loss": 51164.9102, "loss/crossentropy": 2.7591755390167236, "loss/hidden": 0.4296875, "loss/logits": 0.11854663491249084, "loss/reg": 51164.36328125, "step": 734 }, { "epoch": 0.018375, "grad_norm": 1.0282326936721802, "grad_norm_var": 0.017312361786124993, "learning_rate": 0.0001, "loss": 50983.2852, "loss/crossentropy": 2.5418176651000977, "loss/hidden": 0.4140625, "loss/logits": 0.094975546002388, "loss/reg": 50982.77734375, "step": 735 }, { "epoch": 0.0184, "grad_norm": 1.6109849214553833, "grad_norm_var": 0.034997721610513996, "learning_rate": 0.0001, "loss": 50794.875, "loss/crossentropy": 2.864858627319336, "loss/hidden": 0.421875, "loss/logits": 0.09372644126415253, "loss/reg": 50794.359375, "step": 736 }, { "epoch": 0.018425, "grad_norm": 1.2849129438400269, "grad_norm_var": 0.03649762544530522, "learning_rate": 0.0001, "loss": 50608.5859, "loss/crossentropy": 2.546605348587036, "loss/hidden": 0.40625, "loss/logits": 0.10290227085351944, "loss/reg": 50608.078125, "step": 737 }, { "epoch": 0.01845, "grad_norm": 1.8890656232833862, "grad_norm_var": 0.06904774488376778, "learning_rate": 0.0001, "loss": 50429.8125, "loss/crossentropy": 2.8703765869140625, "loss/hidden": 0.443359375, "loss/logits": 0.10859087854623795, "loss/reg": 50429.26171875, "step": 738 }, { "epoch": 0.018475, "grad_norm": 1.1573561429977417, "grad_norm_var": 0.0681929487293542, "learning_rate": 0.0001, "loss": 50247.793, "loss/crossentropy": 2.488643169403076, "loss/hidden": 0.4140625, "loss/logits": 0.09659995883703232, "loss/reg": 50247.28125, "step": 739 }, { "epoch": 0.0185, "grad_norm": 1.135470986366272, "grad_norm_var": 0.06541238942306339, "learning_rate": 0.0001, "loss": 50067.6562, "loss/crossentropy": 2.4839093685150146, "loss/hidden": 0.41796875, "loss/logits": 0.1023077517747879, "loss/reg": 50067.13671875, "step": 740 }, { "epoch": 0.018525, "grad_norm": 0.9855368733406067, "grad_norm_var": 0.06702907714801304, "learning_rate": 0.0001, "loss": 49887.3594, "loss/crossentropy": 2.554145097732544, "loss/hidden": 0.408203125, "loss/logits": 0.08684562146663666, "loss/reg": 49886.86328125, "step": 741 }, { "epoch": 0.01855, "grad_norm": 0.9068771004676819, "grad_norm_var": 0.07153601226928949, "learning_rate": 0.0001, "loss": 49709.043, "loss/crossentropy": 2.65455961227417, "loss/hidden": 0.392578125, "loss/logits": 0.10292993485927582, "loss/reg": 49708.546875, "step": 742 }, { "epoch": 0.018575, "grad_norm": 0.9256609678268433, "grad_norm_var": 0.07450373194885765, "learning_rate": 0.0001, "loss": 49530.3398, "loss/crossentropy": 2.7470786571502686, "loss/hidden": 0.412109375, "loss/logits": 0.09205284714698792, "loss/reg": 49529.8359375, "step": 743 }, { "epoch": 0.0186, "grad_norm": 1.1988563537597656, "grad_norm_var": 0.07252635823950025, "learning_rate": 0.0001, "loss": 49353.957, "loss/crossentropy": 2.5547876358032227, "loss/hidden": 0.435546875, "loss/logits": 0.10985640436410904, "loss/reg": 49353.41015625, "step": 744 }, { "epoch": 0.018625, "grad_norm": 0.8618044853210449, "grad_norm_var": 0.07697458673344261, "learning_rate": 0.0001, "loss": 49178.1562, "loss/crossentropy": 2.3190128803253174, "loss/hidden": 0.3984375, "loss/logits": 0.09546959400177002, "loss/reg": 49177.6640625, "step": 745 }, { "epoch": 0.01865, "grad_norm": 1.092405080795288, "grad_norm_var": 0.07578759425521137, "learning_rate": 0.0001, "loss": 49006.7148, "loss/crossentropy": 2.772601842880249, "loss/hidden": 0.42578125, "loss/logits": 0.10889579355716705, "loss/reg": 49006.1796875, "step": 746 }, { "epoch": 0.018675, "grad_norm": 0.8835573196411133, "grad_norm_var": 0.0792970681340984, "learning_rate": 0.0001, "loss": 48833.25, "loss/crossentropy": 2.623377799987793, "loss/hidden": 0.3984375, "loss/logits": 0.09446098655462265, "loss/reg": 48832.7578125, "step": 747 }, { "epoch": 0.0187, "grad_norm": 2.9753003120422363, "grad_norm_var": 0.28797693666548735, "learning_rate": 0.0001, "loss": 48662.4414, "loss/crossentropy": 2.5330989360809326, "loss/hidden": 0.466796875, "loss/logits": 0.19345475733280182, "loss/reg": 48661.78125, "step": 748 }, { "epoch": 0.018725, "grad_norm": 1.1538830995559692, "grad_norm_var": 0.2797350796604114, "learning_rate": 0.0001, "loss": 48490.5547, "loss/crossentropy": 2.716426372528076, "loss/hidden": 0.419921875, "loss/logits": 0.09423987567424774, "loss/reg": 48490.0390625, "step": 749 }, { "epoch": 0.01875, "grad_norm": 0.8942911624908447, "grad_norm_var": 0.28856832743585786, "learning_rate": 0.0001, "loss": 48314.0898, "loss/crossentropy": 2.4474997520446777, "loss/hidden": 0.40234375, "loss/logits": 0.11440010368824005, "loss/reg": 48313.57421875, "step": 750 }, { "epoch": 0.018775, "grad_norm": 0.9876633882522583, "grad_norm_var": 0.28986544400916986, "learning_rate": 0.0001, "loss": 48141.3477, "loss/crossentropy": 2.454270839691162, "loss/hidden": 0.384765625, "loss/logits": 0.0848151445388794, "loss/reg": 48140.87890625, "step": 751 }, { "epoch": 0.0188, "grad_norm": 0.9700044393539429, "grad_norm_var": 0.2843915862496468, "learning_rate": 0.0001, "loss": 47969.8867, "loss/crossentropy": 2.5182764530181885, "loss/hidden": 0.404296875, "loss/logits": 0.10214034467935562, "loss/reg": 47969.37890625, "step": 752 }, { "epoch": 0.018825, "grad_norm": 1.2348213195800781, "grad_norm_var": 0.28402413344607497, "learning_rate": 0.0001, "loss": 47798.5234, "loss/crossentropy": 2.8100967407226562, "loss/hidden": 0.439453125, "loss/logits": 0.1163613498210907, "loss/reg": 47797.96875, "step": 753 }, { "epoch": 0.01885, "grad_norm": 0.9870022535324097, "grad_norm_var": 0.2523991444644473, "learning_rate": 0.0001, "loss": 47626.3047, "loss/crossentropy": 2.3341073989868164, "loss/hidden": 0.41796875, "loss/logits": 0.10574543476104736, "loss/reg": 47625.78125, "step": 754 }, { "epoch": 0.018875, "grad_norm": 1.1098355054855347, "grad_norm_var": 0.25247406778994563, "learning_rate": 0.0001, "loss": 47454.8125, "loss/crossentropy": 2.5026051998138428, "loss/hidden": 0.423828125, "loss/logits": 0.09431763738393784, "loss/reg": 47454.29296875, "step": 755 }, { "epoch": 0.0189, "grad_norm": 0.9873075485229492, "grad_norm_var": 0.25401331385406956, "learning_rate": 0.0001, "loss": 47285.4258, "loss/crossentropy": 2.3226237297058105, "loss/hidden": 0.419921875, "loss/logits": 0.09147959202528, "loss/reg": 47284.9140625, "step": 756 }, { "epoch": 0.018925, "grad_norm": 0.8810727596282959, "grad_norm_var": 0.2567726447010756, "learning_rate": 0.0001, "loss": 47116.3984, "loss/crossentropy": 2.390197992324829, "loss/hidden": 0.392578125, "loss/logits": 0.08687802404165268, "loss/reg": 47115.91796875, "step": 757 }, { "epoch": 0.01895, "grad_norm": 0.9770220518112183, "grad_norm_var": 0.25501070736492004, "learning_rate": 0.0001, "loss": 46947.9883, "loss/crossentropy": 2.5187008380889893, "loss/hidden": 0.396484375, "loss/logits": 0.0867588073015213, "loss/reg": 46947.50390625, "step": 758 }, { "epoch": 0.018975, "grad_norm": 1.1931730508804321, "grad_norm_var": 0.25210469699044175, "learning_rate": 0.0001, "loss": 46780.5078, "loss/crossentropy": 2.5223076343536377, "loss/hidden": 0.388671875, "loss/logits": 0.09107695519924164, "loss/reg": 46780.02734375, "step": 759 }, { "epoch": 0.019, "grad_norm": 0.8697965145111084, "grad_norm_var": 0.25669575978909187, "learning_rate": 0.0001, "loss": 46615.582, "loss/crossentropy": 2.593898296356201, "loss/hidden": 0.416015625, "loss/logits": 0.09482245147228241, "loss/reg": 46615.0703125, "step": 760 }, { "epoch": 0.019025, "grad_norm": 0.9785832166671753, "grad_norm_var": 0.2533926460845291, "learning_rate": 0.0001, "loss": 46452.4062, "loss/crossentropy": 2.3558356761932373, "loss/hidden": 0.4140625, "loss/logits": 0.09463217854499817, "loss/reg": 46451.8984375, "step": 761 }, { "epoch": 0.01905, "grad_norm": 0.8423299193382263, "grad_norm_var": 0.258754260549059, "learning_rate": 0.0001, "loss": 46287.6953, "loss/crossentropy": 2.4062812328338623, "loss/hidden": 0.40625, "loss/logits": 0.09871132671833038, "loss/reg": 46287.19140625, "step": 762 }, { "epoch": 0.019075, "grad_norm": 0.9173998236656189, "grad_norm_var": 0.25775734273993856, "learning_rate": 0.0001, "loss": 46124.8672, "loss/crossentropy": 2.4662153720855713, "loss/hidden": 0.396484375, "loss/logits": 0.10506129264831543, "loss/reg": 46124.3671875, "step": 763 }, { "epoch": 0.0191, "grad_norm": 0.8769504427909851, "grad_norm_var": 0.014563933054716201, "learning_rate": 0.0001, "loss": 45961.957, "loss/crossentropy": 2.389399290084839, "loss/hidden": 0.380859375, "loss/logits": 0.08493934571743011, "loss/reg": 45961.4921875, "step": 764 }, { "epoch": 0.019125, "grad_norm": 0.8609285950660706, "grad_norm_var": 0.013578057378831923, "learning_rate": 0.0001, "loss": 45799.5195, "loss/crossentropy": 2.642210006713867, "loss/hidden": 0.396484375, "loss/logits": 0.09388473629951477, "loss/reg": 45799.02734375, "step": 765 }, { "epoch": 0.01915, "grad_norm": 0.8592740297317505, "grad_norm_var": 0.014022235668175101, "learning_rate": 0.0001, "loss": 45639.2656, "loss/crossentropy": 2.7636122703552246, "loss/hidden": 0.400390625, "loss/logits": 0.10290414094924927, "loss/reg": 45638.76171875, "step": 766 }, { "epoch": 0.019175, "grad_norm": 1.0137566328048706, "grad_norm_var": 0.014123379321947441, "learning_rate": 0.0001, "loss": 45479.9531, "loss/crossentropy": 2.917463779449463, "loss/hidden": 0.423828125, "loss/logits": 0.09564699232578278, "loss/reg": 45479.43359375, "step": 767 }, { "epoch": 0.0192, "grad_norm": 0.886622965335846, "grad_norm_var": 0.014585137676438467, "learning_rate": 0.0001, "loss": 45319.2539, "loss/crossentropy": 2.5581212043762207, "loss/hidden": 0.392578125, "loss/logits": 0.09170180559158325, "loss/reg": 45318.76953125, "step": 768 }, { "epoch": 0.019225, "grad_norm": 0.9484365582466125, "grad_norm_var": 0.009493744584728781, "learning_rate": 0.0001, "loss": 45158.2812, "loss/crossentropy": 2.7654075622558594, "loss/hidden": 0.38671875, "loss/logits": 0.08669186383485794, "loss/reg": 45157.80859375, "step": 769 }, { "epoch": 0.01925, "grad_norm": 1.2060688734054565, "grad_norm_var": 0.013593108736964095, "learning_rate": 0.0001, "loss": 44999.1328, "loss/crossentropy": 2.64562726020813, "loss/hidden": 0.40234375, "loss/logits": 0.09077286720275879, "loss/reg": 44998.640625, "step": 770 }, { "epoch": 0.019275, "grad_norm": 1.0256370306015015, "grad_norm_var": 0.012388143634409174, "learning_rate": 0.0001, "loss": 44841.543, "loss/crossentropy": 2.2263450622558594, "loss/hidden": 0.4140625, "loss/logits": 0.09414076805114746, "loss/reg": 44841.03515625, "step": 771 }, { "epoch": 0.0193, "grad_norm": 0.8258554935455322, "grad_norm_var": 0.013381517293385947, "learning_rate": 0.0001, "loss": 44683.5742, "loss/crossentropy": 2.7400131225585938, "loss/hidden": 0.39453125, "loss/logits": 0.10209719091653824, "loss/reg": 44683.078125, "step": 772 }, { "epoch": 0.019325, "grad_norm": 1.02664315700531, "grad_norm_var": 0.013413099005024283, "learning_rate": 0.0001, "loss": 44527.4648, "loss/crossentropy": 2.6620962619781494, "loss/hidden": 0.419921875, "loss/logits": 0.10463500022888184, "loss/reg": 44526.94140625, "step": 773 }, { "epoch": 0.01935, "grad_norm": 0.7048200964927673, "grad_norm_var": 0.017309307493440434, "learning_rate": 0.0001, "loss": 44371.5391, "loss/crossentropy": 2.5526435375213623, "loss/hidden": 0.3828125, "loss/logits": 0.08704051375389099, "loss/reg": 44371.0703125, "step": 774 }, { "epoch": 0.019375, "grad_norm": 1.1765838861465454, "grad_norm_var": 0.01676600216286676, "learning_rate": 0.0001, "loss": 44216.4805, "loss/crossentropy": 2.7540805339813232, "loss/hidden": 0.3984375, "loss/logits": 0.09850931912660599, "loss/reg": 44215.984375, "step": 775 }, { "epoch": 0.0194, "grad_norm": 0.9540759921073914, "grad_norm_var": 0.01643531272497493, "learning_rate": 0.0001, "loss": 44060.5078, "loss/crossentropy": 2.4934980869293213, "loss/hidden": 0.384765625, "loss/logits": 0.09054020047187805, "loss/reg": 44060.03125, "step": 776 }, { "epoch": 0.019425, "grad_norm": 1.2347862720489502, "grad_norm_var": 0.021719260912863508, "learning_rate": 0.0001, "loss": 43907.1484, "loss/crossentropy": 2.648806571960449, "loss/hidden": 0.373046875, "loss/logits": 0.07670263200998306, "loss/reg": 43906.69921875, "step": 777 }, { "epoch": 0.01945, "grad_norm": 1.1064521074295044, "grad_norm_var": 0.021935016499496845, "learning_rate": 0.0001, "loss": 43753.8047, "loss/crossentropy": 2.6057424545288086, "loss/hidden": 0.41796875, "loss/logits": 0.1008932888507843, "loss/reg": 43753.28515625, "step": 778 }, { "epoch": 0.019475, "grad_norm": 0.8709408640861511, "grad_norm_var": 0.02243612940938117, "learning_rate": 0.0001, "loss": 43601.9141, "loss/crossentropy": 2.2285828590393066, "loss/hidden": 0.40234375, "loss/logits": 0.09650826454162598, "loss/reg": 43601.4140625, "step": 779 }, { "epoch": 0.0195, "grad_norm": 0.7791092991828918, "grad_norm_var": 0.024295465395644304, "learning_rate": 0.0001, "loss": 43450.8594, "loss/crossentropy": 2.456794500350952, "loss/hidden": 0.3828125, "loss/logits": 0.08525663614273071, "loss/reg": 43450.390625, "step": 780 }, { "epoch": 0.019525, "grad_norm": 0.9531808495521545, "grad_norm_var": 0.023516516246377708, "learning_rate": 0.0001, "loss": 43299.4375, "loss/crossentropy": 2.855341672897339, "loss/hidden": 0.40625, "loss/logits": 0.09789630770683289, "loss/reg": 43298.93359375, "step": 781 }, { "epoch": 0.01955, "grad_norm": 0.9563007354736328, "grad_norm_var": 0.02263021016615583, "learning_rate": 0.0001, "loss": 43147.2109, "loss/crossentropy": 2.42067813873291, "loss/hidden": 0.4140625, "loss/logits": 0.10505105555057526, "loss/reg": 43146.69140625, "step": 782 }, { "epoch": 0.019575, "grad_norm": 1.034609079360962, "grad_norm_var": 0.02275310556488711, "learning_rate": 0.0001, "loss": 42994.3594, "loss/crossentropy": 2.3903796672821045, "loss/hidden": 0.404296875, "loss/logits": 0.08946970105171204, "loss/reg": 42993.8671875, "step": 783 }, { "epoch": 0.0196, "grad_norm": 0.997693657875061, "grad_norm_var": 0.02213191900271004, "learning_rate": 0.0001, "loss": 42843.2031, "loss/crossentropy": 2.542088508605957, "loss/hidden": 0.40625, "loss/logits": 0.11267843097448349, "loss/reg": 42842.68359375, "step": 784 }, { "epoch": 0.019625, "grad_norm": 0.8993119597434998, "grad_norm_var": 0.02253909786148013, "learning_rate": 0.0001, "loss": 42692.3203, "loss/crossentropy": 2.712129592895508, "loss/hidden": 0.3984375, "loss/logits": 0.08530149608850479, "loss/reg": 42691.8359375, "step": 785 }, { "epoch": 0.01965, "grad_norm": 0.7412862181663513, "grad_norm_var": 0.0223099495310689, "learning_rate": 0.0001, "loss": 42542.2109, "loss/crossentropy": 2.539766788482666, "loss/hidden": 0.40234375, "loss/logits": 0.10429234802722931, "loss/reg": 42541.703125, "step": 786 }, { "epoch": 0.019675, "grad_norm": 0.8172112703323364, "grad_norm_var": 0.023074676517893862, "learning_rate": 0.0001, "loss": 42390.5508, "loss/crossentropy": 2.2713160514831543, "loss/hidden": 0.400390625, "loss/logits": 0.09550666809082031, "loss/reg": 42390.0546875, "step": 787 }, { "epoch": 0.0197, "grad_norm": 0.9562167525291443, "grad_norm_var": 0.02211058591684741, "learning_rate": 0.0001, "loss": 42238.7461, "loss/crossentropy": 2.5588197708129883, "loss/hidden": 0.388671875, "loss/logits": 0.09026595205068588, "loss/reg": 42238.265625, "step": 788 }, { "epoch": 0.019725, "grad_norm": 1.0070997476577759, "grad_norm_var": 0.021936243600945163, "learning_rate": 0.0001, "loss": 42086.7109, "loss/crossentropy": 2.6279189586639404, "loss/hidden": 0.39453125, "loss/logits": 0.08974668383598328, "loss/reg": 42086.2265625, "step": 789 }, { "epoch": 0.01975, "grad_norm": 0.924180269241333, "grad_norm_var": 0.017791513662725324, "learning_rate": 0.0001, "loss": 41936.4023, "loss/crossentropy": 2.5648348331451416, "loss/hidden": 0.388671875, "loss/logits": 0.08369086682796478, "loss/reg": 41935.9296875, "step": 790 }, { "epoch": 0.019775, "grad_norm": 0.8046944737434387, "grad_norm_var": 0.015847980469538867, "learning_rate": 0.0001, "loss": 41785.668, "loss/crossentropy": 2.410979986190796, "loss/hidden": 0.38671875, "loss/logits": 0.09604763239622116, "loss/reg": 41785.18359375, "step": 791 }, { "epoch": 0.0198, "grad_norm": 0.7474042177200317, "grad_norm_var": 0.018124766264193903, "learning_rate": 0.0001, "loss": 41635.7461, "loss/crossentropy": 2.548205852508545, "loss/hidden": 0.37890625, "loss/logits": 0.08188501000404358, "loss/reg": 41635.28515625, "step": 792 }, { "epoch": 0.019825, "grad_norm": 0.8547723889350891, "grad_norm_var": 0.01155053097809707, "learning_rate": 0.0001, "loss": 41487.7812, "loss/crossentropy": 2.542738199234009, "loss/hidden": 0.3828125, "loss/logits": 0.08695337921380997, "loss/reg": 41487.3125, "step": 793 }, { "epoch": 0.01985, "grad_norm": 1.011048674583435, "grad_norm_var": 0.009533349350237034, "learning_rate": 0.0001, "loss": 41338.7852, "loss/crossentropy": 2.647214889526367, "loss/hidden": 0.421875, "loss/logits": 0.10188287496566772, "loss/reg": 41338.26171875, "step": 794 }, { "epoch": 0.019875, "grad_norm": 0.9825843572616577, "grad_norm_var": 0.009921607801237053, "learning_rate": 0.0001, "loss": 41189.8125, "loss/crossentropy": 2.245267629623413, "loss/hidden": 0.439453125, "loss/logits": 0.10342052578926086, "loss/reg": 41189.26953125, "step": 795 }, { "epoch": 0.0199, "grad_norm": 0.9716632962226868, "grad_norm_var": 0.00902815695549628, "learning_rate": 0.0001, "loss": 41040.1719, "loss/crossentropy": 2.4565536975860596, "loss/hidden": 0.4296875, "loss/logits": 0.10906583070755005, "loss/reg": 41039.6328125, "step": 796 }, { "epoch": 0.019925, "grad_norm": 0.9376354813575745, "grad_norm_var": 0.008966617327267067, "learning_rate": 0.0001, "loss": 40893.4688, "loss/crossentropy": 2.5155081748962402, "loss/hidden": 0.427734375, "loss/logits": 0.09355796873569489, "loss/reg": 40892.94921875, "step": 797 }, { "epoch": 0.01995, "grad_norm": 1.3822612762451172, "grad_norm_var": 0.022639252263949538, "learning_rate": 0.0001, "loss": 40746.293, "loss/crossentropy": 2.3780517578125, "loss/hidden": 0.416015625, "loss/logits": 0.11463729292154312, "loss/reg": 40745.76171875, "step": 798 }, { "epoch": 0.019975, "grad_norm": 0.8885180354118347, "grad_norm_var": 0.022166417254800307, "learning_rate": 0.0001, "loss": 40600.6367, "loss/crossentropy": 2.2522025108337402, "loss/hidden": 0.4140625, "loss/logits": 0.11032229661941528, "loss/reg": 40600.11328125, "step": 799 }, { "epoch": 0.02, "grad_norm": 1.1068843603134155, "grad_norm_var": 0.023857459775651315, "learning_rate": 0.0001, "loss": 40454.9258, "loss/crossentropy": 2.6258373260498047, "loss/hidden": 0.44140625, "loss/logits": 0.09626742452383041, "loss/reg": 40454.38671875, "step": 800 }, { "epoch": 0.020025, "grad_norm": 0.9436428546905518, "grad_norm_var": 0.023742458152267654, "learning_rate": 0.0001, "loss": 40309.0195, "loss/crossentropy": 2.503645896911621, "loss/hidden": 0.384765625, "loss/logits": 0.09005875140428543, "loss/reg": 40308.54296875, "step": 801 }, { "epoch": 0.02005, "grad_norm": 1.5282552242279053, "grad_norm_var": 0.04135576468478863, "learning_rate": 0.0001, "loss": 40164.7656, "loss/crossentropy": 2.4166529178619385, "loss/hidden": 0.44140625, "loss/logits": 0.108254574239254, "loss/reg": 40164.21484375, "step": 802 }, { "epoch": 0.020075, "grad_norm": 1.1386072635650635, "grad_norm_var": 0.04034277163636801, "learning_rate": 0.0001, "loss": 40020.0742, "loss/crossentropy": 2.5895628929138184, "loss/hidden": 0.3984375, "loss/logits": 0.09454331547021866, "loss/reg": 40019.58203125, "step": 803 }, { "epoch": 0.0201, "grad_norm": 0.9719331860542297, "grad_norm_var": 0.04024216977857732, "learning_rate": 0.0001, "loss": 39875.0859, "loss/crossentropy": 2.553748846054077, "loss/hidden": 0.4140625, "loss/logits": 0.08906621485948563, "loss/reg": 39874.58203125, "step": 804 }, { "epoch": 0.020125, "grad_norm": 1.075038194656372, "grad_norm_var": 0.04048105789431939, "learning_rate": 0.0001, "loss": 39732.1992, "loss/crossentropy": 2.573615789413452, "loss/hidden": 0.416015625, "loss/logits": 0.09945542365312576, "loss/reg": 39731.68359375, "step": 805 }, { "epoch": 0.02015, "grad_norm": 1.2552541494369507, "grad_norm_var": 0.043242255039610634, "learning_rate": 0.0001, "loss": 39588.8477, "loss/crossentropy": 2.787565231323242, "loss/hidden": 0.4296875, "loss/logits": 0.1017235741019249, "loss/reg": 39588.31640625, "step": 806 }, { "epoch": 0.020175, "grad_norm": 0.9056074023246765, "grad_norm_var": 0.0407461410468116, "learning_rate": 0.0001, "loss": 39446.2148, "loss/crossentropy": 2.705078601837158, "loss/hidden": 0.408203125, "loss/logits": 0.09658461809158325, "loss/reg": 39445.7109375, "step": 807 }, { "epoch": 0.0202, "grad_norm": 0.9769497513771057, "grad_norm_var": 0.0349672339588764, "learning_rate": 0.0001, "loss": 39305.0352, "loss/crossentropy": 2.384089708328247, "loss/hidden": 0.42578125, "loss/logits": 0.1048816367983818, "loss/reg": 39304.50390625, "step": 808 }, { "epoch": 0.020225, "grad_norm": 0.9730967879295349, "grad_norm_var": 0.03263341861796503, "learning_rate": 0.0001, "loss": 39162.4141, "loss/crossentropy": 2.619083881378174, "loss/hidden": 0.423828125, "loss/logits": 0.1224329024553299, "loss/reg": 39161.8671875, "step": 809 }, { "epoch": 0.02025, "grad_norm": 0.903662919998169, "grad_norm_var": 0.03413466668993359, "learning_rate": 0.0001, "loss": 39020.543, "loss/crossentropy": 2.0379161834716797, "loss/hidden": 0.38671875, "loss/logits": 0.07385985553264618, "loss/reg": 39020.08203125, "step": 810 }, { "epoch": 0.020275, "grad_norm": 0.8019759654998779, "grad_norm_var": 0.038009932433176734, "learning_rate": 0.0001, "loss": 38882.8398, "loss/crossentropy": 2.404892921447754, "loss/hidden": 0.392578125, "loss/logits": 0.09625999629497528, "loss/reg": 38882.3515625, "step": 811 }, { "epoch": 0.0203, "grad_norm": 1.1033836603164673, "grad_norm_var": 0.03776134243067393, "learning_rate": 0.0001, "loss": 38746.8789, "loss/crossentropy": 2.6194732189178467, "loss/hidden": 0.4140625, "loss/logits": 0.08940684050321579, "loss/reg": 38746.375, "step": 812 }, { "epoch": 0.020325, "grad_norm": 0.9457865357398987, "grad_norm_var": 0.037637079205596, "learning_rate": 0.0001, "loss": 38602.4922, "loss/crossentropy": 2.4452311992645264, "loss/hidden": 0.400390625, "loss/logits": 0.099249929189682, "loss/reg": 38601.9921875, "step": 813 }, { "epoch": 0.02035, "grad_norm": 0.9055691957473755, "grad_norm_var": 0.031121762090745856, "learning_rate": 0.0001, "loss": 38465.1914, "loss/crossentropy": 2.4715399742126465, "loss/hidden": 0.40234375, "loss/logits": 0.10231912136077881, "loss/reg": 38464.6875, "step": 814 }, { "epoch": 0.020375, "grad_norm": 1.168678879737854, "grad_norm_var": 0.03087272107349374, "learning_rate": 0.0001, "loss": 38328.918, "loss/crossentropy": 2.5399720668792725, "loss/hidden": 0.42578125, "loss/logits": 0.10657870769500732, "loss/reg": 38328.38671875, "step": 815 }, { "epoch": 0.0204, "grad_norm": 1.066002368927002, "grad_norm_var": 0.03063451244188884, "learning_rate": 0.0001, "loss": 38191.5703, "loss/crossentropy": 2.6654410362243652, "loss/hidden": 0.431640625, "loss/logits": 0.10600557923316956, "loss/reg": 38191.03125, "step": 816 }, { "epoch": 0.020425, "grad_norm": 1.0306800603866577, "grad_norm_var": 0.02997275433461438, "learning_rate": 0.0001, "loss": 38056.9844, "loss/crossentropy": 2.3844645023345947, "loss/hidden": 0.39453125, "loss/logits": 0.08257506787776947, "loss/reg": 38056.5078125, "step": 817 }, { "epoch": 0.02045, "grad_norm": 1.1823235750198364, "grad_norm_var": 0.015250149322878655, "learning_rate": 0.0001, "loss": 37920.3672, "loss/crossentropy": 2.38747501373291, "loss/hidden": 0.412109375, "loss/logits": 0.10297746956348419, "loss/reg": 37919.8515625, "step": 818 }, { "epoch": 0.020475, "grad_norm": 1.275691270828247, "grad_norm_var": 0.018495951763542337, "learning_rate": 0.0001, "loss": 37784.2227, "loss/crossentropy": 3.0008273124694824, "loss/hidden": 0.4296875, "loss/logits": 0.09561491012573242, "loss/reg": 37783.69921875, "step": 819 }, { "epoch": 0.0205, "grad_norm": 0.8513725996017456, "grad_norm_var": 0.020399711250510145, "learning_rate": 0.0001, "loss": 37646.6133, "loss/crossentropy": 2.615851640701294, "loss/hidden": 0.423828125, "loss/logits": 0.11105596274137497, "loss/reg": 37646.078125, "step": 820 }, { "epoch": 0.020525, "grad_norm": 1.0822184085845947, "grad_norm_var": 0.020449577202075336, "learning_rate": 0.0001, "loss": 37501.4727, "loss/crossentropy": 2.793099880218506, "loss/hidden": 0.423828125, "loss/logits": 0.10729962587356567, "loss/reg": 37500.94140625, "step": 821 }, { "epoch": 0.02055, "grad_norm": 0.9906818866729736, "grad_norm_var": 0.016764260003671202, "learning_rate": 0.0001, "loss": 37365.0977, "loss/crossentropy": 2.5842463970184326, "loss/hidden": 0.419921875, "loss/logits": 0.09130015224218369, "loss/reg": 37364.5859375, "step": 822 }, { "epoch": 0.020575, "grad_norm": 0.7514151334762573, "grad_norm_var": 0.02040114788076412, "learning_rate": 0.0001, "loss": 37230.4766, "loss/crossentropy": 2.2744431495666504, "loss/hidden": 0.380859375, "loss/logits": 0.07892940938472748, "loss/reg": 37230.015625, "step": 823 }, { "epoch": 0.0206, "grad_norm": 1.0638694763183594, "grad_norm_var": 0.020599328445768526, "learning_rate": 0.0001, "loss": 37097.5586, "loss/crossentropy": 2.402249336242676, "loss/hidden": 0.42578125, "loss/logits": 0.10117131471633911, "loss/reg": 37097.03125, "step": 824 }, { "epoch": 0.020625, "grad_norm": 0.8464297652244568, "grad_norm_var": 0.022158243548479048, "learning_rate": 0.0001, "loss": 36962.1289, "loss/crossentropy": 2.519179344177246, "loss/hidden": 0.419921875, "loss/logits": 0.1031673401594162, "loss/reg": 36961.60546875, "step": 825 }, { "epoch": 0.02065, "grad_norm": 1.0223678350448608, "grad_norm_var": 0.021544095854699682, "learning_rate": 0.0001, "loss": 36831.0234, "loss/crossentropy": 2.6199722290039062, "loss/hidden": 0.42578125, "loss/logits": 0.1116126999258995, "loss/reg": 36830.484375, "step": 826 }, { "epoch": 0.020675, "grad_norm": 0.7614914178848267, "grad_norm_var": 0.02274529440476408, "learning_rate": 0.0001, "loss": 36698.6719, "loss/crossentropy": 2.5858829021453857, "loss/hidden": 0.388671875, "loss/logits": 0.07992447167634964, "loss/reg": 36698.203125, "step": 827 }, { "epoch": 0.0207, "grad_norm": 0.8607364892959595, "grad_norm_var": 0.023177362648318874, "learning_rate": 0.0001, "loss": 36569.9805, "loss/crossentropy": 2.647308588027954, "loss/hidden": 0.41015625, "loss/logits": 0.09774725884199142, "loss/reg": 36569.47265625, "step": 828 }, { "epoch": 0.020725, "grad_norm": 0.7578085660934448, "grad_norm_var": 0.02643966563970855, "learning_rate": 0.0001, "loss": 36439.7461, "loss/crossentropy": 2.263488531112671, "loss/hidden": 0.392578125, "loss/logits": 0.08892738819122314, "loss/reg": 36439.265625, "step": 829 }, { "epoch": 0.02075, "grad_norm": 1.2861242294311523, "grad_norm_var": 0.03191310292394432, "learning_rate": 0.0001, "loss": 36311.9961, "loss/crossentropy": 2.9907784461975098, "loss/hidden": 0.41796875, "loss/logits": 0.10988874733448029, "loss/reg": 36311.46875, "step": 830 }, { "epoch": 0.020775, "grad_norm": 1.1022545099258423, "grad_norm_var": 0.030693779916705907, "learning_rate": 0.0001, "loss": 36182.2773, "loss/crossentropy": 2.7796716690063477, "loss/hidden": 0.412109375, "loss/logits": 0.11308954656124115, "loss/reg": 36181.75390625, "step": 831 }, { "epoch": 0.0208, "grad_norm": 0.8730552792549133, "grad_norm_var": 0.031212378059443265, "learning_rate": 0.0001, "loss": 36055.8984, "loss/crossentropy": 2.366497039794922, "loss/hidden": 0.390625, "loss/logits": 0.09841403365135193, "loss/reg": 36055.41015625, "step": 832 }, { "epoch": 0.020825, "grad_norm": 0.9390790462493896, "grad_norm_var": 0.03116249149287403, "learning_rate": 0.0001, "loss": 35918.8398, "loss/crossentropy": 2.5822160243988037, "loss/hidden": 0.408203125, "loss/logits": 0.09956912696361542, "loss/reg": 35918.33203125, "step": 833 }, { "epoch": 0.02085, "grad_norm": 1.1822123527526855, "grad_norm_var": 0.03115946122038332, "learning_rate": 0.0001, "loss": 35790.5547, "loss/crossentropy": 2.655499219894409, "loss/hidden": 0.421875, "loss/logits": 0.09676744788885117, "loss/reg": 35790.03515625, "step": 834 }, { "epoch": 0.020875, "grad_norm": 1.1304856538772583, "grad_norm_var": 0.026712285164661863, "learning_rate": 0.0001, "loss": 35663.8516, "loss/crossentropy": 2.5775606632232666, "loss/hidden": 0.421875, "loss/logits": 0.09897801280021667, "loss/reg": 35663.33203125, "step": 835 }, { "epoch": 0.0209, "grad_norm": 0.9108983278274536, "grad_norm_var": 0.026001350495837273, "learning_rate": 0.0001, "loss": 35537.7773, "loss/crossentropy": 2.584146022796631, "loss/hidden": 0.392578125, "loss/logits": 0.08021892607212067, "loss/reg": 35537.3046875, "step": 836 }, { "epoch": 0.020925, "grad_norm": 0.7505898475646973, "grad_norm_var": 0.028026628359788693, "learning_rate": 0.0001, "loss": 35412.0938, "loss/crossentropy": 2.5469448566436768, "loss/hidden": 0.39453125, "loss/logits": 0.09684520214796066, "loss/reg": 35411.6015625, "step": 837 }, { "epoch": 0.02095, "grad_norm": 0.981681764125824, "grad_norm_var": 0.027985084585314117, "learning_rate": 0.0001, "loss": 35287.625, "loss/crossentropy": 2.333712339401245, "loss/hidden": 0.4375, "loss/logits": 0.12408019602298737, "loss/reg": 35287.0625, "step": 838 }, { "epoch": 0.020975, "grad_norm": 0.7908355593681335, "grad_norm_var": 0.027031700151352804, "learning_rate": 0.0001, "loss": 35161.8086, "loss/crossentropy": 2.6098623275756836, "loss/hidden": 0.404296875, "loss/logits": 0.1000092625617981, "loss/reg": 35161.3046875, "step": 839 }, { "epoch": 0.021, "grad_norm": 0.7975010871887207, "grad_norm_var": 0.02755505059063334, "learning_rate": 0.0001, "loss": 35037.0391, "loss/crossentropy": 2.4856231212615967, "loss/hidden": 0.390625, "loss/logits": 0.08976828306913376, "loss/reg": 35036.55859375, "step": 840 }, { "epoch": 0.021025, "grad_norm": 0.8434867262840271, "grad_norm_var": 0.02759117022010192, "learning_rate": 0.0001, "loss": 34913.6719, "loss/crossentropy": 2.45410418510437, "loss/hidden": 0.41796875, "loss/logits": 0.11138058453798294, "loss/reg": 34913.140625, "step": 841 }, { "epoch": 0.02105, "grad_norm": 0.7404183745384216, "grad_norm_var": 0.029347114537091306, "learning_rate": 0.0001, "loss": 34790.2539, "loss/crossentropy": 2.7616987228393555, "loss/hidden": 0.40234375, "loss/logits": 0.10481090843677521, "loss/reg": 34789.74609375, "step": 842 }, { "epoch": 0.021075, "grad_norm": 0.7900260090827942, "grad_norm_var": 0.028797636490651493, "learning_rate": 0.0001, "loss": 34668.2109, "loss/crossentropy": 2.348297357559204, "loss/hidden": 0.3984375, "loss/logits": 0.10565009713172913, "loss/reg": 34667.70703125, "step": 843 }, { "epoch": 0.0211, "grad_norm": 0.8611040115356445, "grad_norm_var": 0.02879468818566166, "learning_rate": 0.0001, "loss": 34545.125, "loss/crossentropy": 2.540072202682495, "loss/hidden": 0.416015625, "loss/logits": 0.085359126329422, "loss/reg": 34544.625, "step": 844 }, { "epoch": 0.021125, "grad_norm": 0.8963682651519775, "grad_norm_var": 0.026977908947997755, "learning_rate": 0.0001, "loss": 34422.8633, "loss/crossentropy": 2.5841212272644043, "loss/hidden": 0.3984375, "loss/logits": 0.10156680643558502, "loss/reg": 34422.36328125, "step": 845 }, { "epoch": 0.02115, "grad_norm": 1.0223283767700195, "grad_norm_var": 0.01879276828590027, "learning_rate": 0.0001, "loss": 34301.707, "loss/crossentropy": 2.905853509902954, "loss/hidden": 0.4140625, "loss/logits": 0.09710683673620224, "loss/reg": 34301.1953125, "step": 846 }, { "epoch": 0.021175, "grad_norm": 1.0493470430374146, "grad_norm_var": 0.01763456168458267, "learning_rate": 0.0001, "loss": 34179.6914, "loss/crossentropy": 2.8628482818603516, "loss/hidden": 0.3984375, "loss/logits": 0.0946056917309761, "loss/reg": 34179.19921875, "step": 847 }, { "epoch": 0.0212, "grad_norm": 1.0956487655639648, "grad_norm_var": 0.019635895807511957, "learning_rate": 0.0001, "loss": 34060.0234, "loss/crossentropy": 2.604365110397339, "loss/hidden": 0.4296875, "loss/logits": 0.1112070232629776, "loss/reg": 34059.484375, "step": 848 }, { "epoch": 0.021225, "grad_norm": 1.0503216981887817, "grad_norm_var": 0.020634830180398453, "learning_rate": 0.0001, "loss": 33938.5898, "loss/crossentropy": 2.8042867183685303, "loss/hidden": 0.427734375, "loss/logits": 0.10720456391572952, "loss/reg": 33938.0546875, "step": 849 }, { "epoch": 0.02125, "grad_norm": 0.9675441384315491, "grad_norm_var": 0.016319762626638538, "learning_rate": 0.0001, "loss": 33818.2266, "loss/crossentropy": 2.4751031398773193, "loss/hidden": 0.41796875, "loss/logits": 0.10572968423366547, "loss/reg": 33817.703125, "step": 850 }, { "epoch": 0.021275, "grad_norm": 1.0344288349151611, "grad_norm_var": 0.014167482585636293, "learning_rate": 0.0001, "loss": 33699.5742, "loss/crossentropy": 2.600318670272827, "loss/hidden": 0.39453125, "loss/logits": 0.08344602584838867, "loss/reg": 33699.09765625, "step": 851 }, { "epoch": 0.0213, "grad_norm": 1.407858967781067, "grad_norm_var": 0.029569324902826346, "learning_rate": 0.0001, "loss": 33581.5234, "loss/crossentropy": 2.62861704826355, "loss/hidden": 0.4375, "loss/logits": 0.11908341199159622, "loss/reg": 33580.96875, "step": 852 }, { "epoch": 0.021325, "grad_norm": 0.8948066830635071, "grad_norm_var": 0.02717962112157412, "learning_rate": 0.0001, "loss": 33462.6289, "loss/crossentropy": 2.597301721572876, "loss/hidden": 0.423828125, "loss/logits": 0.10772041976451874, "loss/reg": 33462.09765625, "step": 853 }, { "epoch": 0.02135, "grad_norm": 0.8743274211883545, "grad_norm_var": 0.02746764893301516, "learning_rate": 0.0001, "loss": 33344.4766, "loss/crossentropy": 2.502575397491455, "loss/hidden": 0.41796875, "loss/logits": 0.09394825994968414, "loss/reg": 33343.96484375, "step": 854 }, { "epoch": 0.021375, "grad_norm": 0.7814751267433167, "grad_norm_var": 0.027665246593712566, "learning_rate": 0.0001, "loss": 33217.1211, "loss/crossentropy": 2.4343154430389404, "loss/hidden": 0.392578125, "loss/logits": 0.09242767840623856, "loss/reg": 33216.63671875, "step": 855 }, { "epoch": 0.0214, "grad_norm": 0.8254016637802124, "grad_norm_var": 0.027168216466293357, "learning_rate": 0.0001, "loss": 33098.625, "loss/crossentropy": 2.597825527191162, "loss/hidden": 0.408203125, "loss/logits": 0.09620468318462372, "loss/reg": 33098.12109375, "step": 856 }, { "epoch": 0.021425, "grad_norm": 1.0467920303344727, "grad_norm_var": 0.026974543250588436, "learning_rate": 0.0001, "loss": 32981.0117, "loss/crossentropy": 2.531338691711426, "loss/hidden": 0.4140625, "loss/logits": 0.09141959249973297, "loss/reg": 32980.5078125, "step": 857 }, { "epoch": 0.02145, "grad_norm": 1.297360897064209, "grad_norm_var": 0.030156381415047187, "learning_rate": 0.0001, "loss": 32864.7031, "loss/crossentropy": 2.7989187240600586, "loss/hidden": 0.4296875, "loss/logits": 0.10818947851657867, "loss/reg": 32864.1640625, "step": 858 }, { "epoch": 0.021475, "grad_norm": 0.8410242795944214, "grad_norm_var": 0.02893572218853119, "learning_rate": 0.0001, "loss": 32750.0977, "loss/crossentropy": 2.44685697555542, "loss/hidden": 0.416015625, "loss/logits": 0.09526005387306213, "loss/reg": 32749.5859375, "step": 859 }, { "epoch": 0.0215, "grad_norm": 0.8180389404296875, "grad_norm_var": 0.029829847108710426, "learning_rate": 0.0001, "loss": 32628.207, "loss/crossentropy": 2.5319130420684814, "loss/hidden": 0.39453125, "loss/logits": 0.08277040719985962, "loss/reg": 32627.73046875, "step": 860 }, { "epoch": 0.021525, "grad_norm": 0.878432035446167, "grad_norm_var": 0.030083301371783343, "learning_rate": 0.0001, "loss": 32512.7188, "loss/crossentropy": 2.694110870361328, "loss/hidden": 0.412109375, "loss/logits": 0.0938844084739685, "loss/reg": 32512.212890625, "step": 861 }, { "epoch": 0.02155, "grad_norm": 0.8658337593078613, "grad_norm_var": 0.030998263460790657, "learning_rate": 0.0001, "loss": 32397.1641, "loss/crossentropy": 2.352792739868164, "loss/hidden": 0.423828125, "loss/logits": 0.1003153845667839, "loss/reg": 32396.640625, "step": 862 }, { "epoch": 0.021575, "grad_norm": 1.118543267250061, "grad_norm_var": 0.031909279085837135, "learning_rate": 0.0001, "loss": 32281.918, "loss/crossentropy": 2.7677767276763916, "loss/hidden": 0.439453125, "loss/logits": 0.10245128720998764, "loss/reg": 32281.376953125, "step": 863 }, { "epoch": 0.0216, "grad_norm": 0.9408777356147766, "grad_norm_var": 0.031171847988469815, "learning_rate": 0.0001, "loss": 32169.8145, "loss/crossentropy": 2.556101083755493, "loss/hidden": 0.3984375, "loss/logits": 0.09347447752952576, "loss/reg": 32169.322265625, "step": 864 }, { "epoch": 0.021625, "grad_norm": 0.9442301988601685, "grad_norm_var": 0.0308479210707351, "learning_rate": 0.0001, "loss": 32055.9531, "loss/crossentropy": 2.5342166423797607, "loss/hidden": 0.431640625, "loss/logits": 0.10387556254863739, "loss/reg": 32055.41796875, "step": 865 }, { "epoch": 0.02165, "grad_norm": 0.8331806659698486, "grad_norm_var": 0.03203927249987128, "learning_rate": 0.0001, "loss": 31928.8379, "loss/crossentropy": 2.78352952003479, "loss/hidden": 0.412109375, "loss/logits": 0.08486774563789368, "loss/reg": 31928.341796875, "step": 866 }, { "epoch": 0.021675, "grad_norm": 0.9290958642959595, "grad_norm_var": 0.03172480831427588, "learning_rate": 0.0001, "loss": 31813.7832, "loss/crossentropy": 2.6741678714752197, "loss/hidden": 0.41796875, "loss/logits": 0.10052283853292465, "loss/reg": 31813.265625, "step": 867 }, { "epoch": 0.0217, "grad_norm": 0.6767722368240356, "grad_norm_var": 0.021091679013798256, "learning_rate": 0.0001, "loss": 31700.1758, "loss/crossentropy": 2.521838665008545, "loss/hidden": 0.392578125, "loss/logits": 0.09481383860111237, "loss/reg": 31699.6875, "step": 868 }, { "epoch": 0.021725, "grad_norm": 0.7836902737617493, "grad_norm_var": 0.02209418880242387, "learning_rate": 0.0001, "loss": 31586.8145, "loss/crossentropy": 2.4294605255126953, "loss/hidden": 0.400390625, "loss/logits": 0.10107098519802094, "loss/reg": 31586.3125, "step": 869 }, { "epoch": 0.02175, "grad_norm": 0.7529267072677612, "grad_norm_var": 0.023486597337748435, "learning_rate": 0.0001, "loss": 31472.9453, "loss/crossentropy": 2.4297895431518555, "loss/hidden": 0.390625, "loss/logits": 0.09175887703895569, "loss/reg": 31472.462890625, "step": 870 }, { "epoch": 0.021775, "grad_norm": 0.7665960788726807, "grad_norm_var": 0.023727348579464073, "learning_rate": 0.0001, "loss": 31361.6562, "loss/crossentropy": 2.502922296524048, "loss/hidden": 0.404296875, "loss/logits": 0.10530885308980942, "loss/reg": 31361.146484375, "step": 871 }, { "epoch": 0.0218, "grad_norm": 0.8659310936927795, "grad_norm_var": 0.023454315559804078, "learning_rate": 0.0001, "loss": 31250.8555, "loss/crossentropy": 2.400033473968506, "loss/hidden": 0.419921875, "loss/logits": 0.10294472426176071, "loss/reg": 31250.33203125, "step": 872 }, { "epoch": 0.021825, "grad_norm": 0.857296884059906, "grad_norm_var": 0.021925511634459778, "learning_rate": 0.0001, "loss": 31138.9648, "loss/crossentropy": 2.500699281692505, "loss/hidden": 0.4140625, "loss/logits": 0.09416045993566513, "loss/reg": 31138.45703125, "step": 873 }, { "epoch": 0.02185, "grad_norm": 1.3556040525436401, "grad_norm_var": 0.025335049988287513, "learning_rate": 0.0001, "loss": 31028.1133, "loss/crossentropy": 2.4126169681549072, "loss/hidden": 0.447265625, "loss/logits": 0.09992693364620209, "loss/reg": 31027.56640625, "step": 874 }, { "epoch": 0.021875, "grad_norm": 1.1106455326080322, "grad_norm_var": 0.028144669255741366, "learning_rate": 0.0001, "loss": 30916.2793, "loss/crossentropy": 2.682753801345825, "loss/hidden": 0.40234375, "loss/logits": 0.09390898793935776, "loss/reg": 30915.783203125, "step": 875 }, { "epoch": 0.0219, "grad_norm": 0.9540243148803711, "grad_norm_var": 0.027703643787985694, "learning_rate": 0.0001, "loss": 30805.8301, "loss/crossentropy": 2.50162410736084, "loss/hidden": 0.42578125, "loss/logits": 0.10138815641403198, "loss/reg": 30805.302734375, "step": 876 }, { "epoch": 0.021925, "grad_norm": 0.9050595760345459, "grad_norm_var": 0.027619531474660106, "learning_rate": 0.0001, "loss": 30695.3926, "loss/crossentropy": 2.7011783123016357, "loss/hidden": 0.392578125, "loss/logits": 0.08969256281852722, "loss/reg": 30694.91015625, "step": 877 }, { "epoch": 0.02195, "grad_norm": 1.171053171157837, "grad_norm_var": 0.03138944267110251, "learning_rate": 0.0001, "loss": 30585.9648, "loss/crossentropy": 2.6654486656188965, "loss/hidden": 0.41796875, "loss/logits": 0.110439732670784, "loss/reg": 30585.435546875, "step": 878 }, { "epoch": 0.021975, "grad_norm": 0.8103191256523132, "grad_norm_var": 0.02979827776598006, "learning_rate": 0.0001, "loss": 30475.1992, "loss/crossentropy": 2.520002603530884, "loss/hidden": 0.392578125, "loss/logits": 0.08766569197177887, "loss/reg": 30474.71875, "step": 879 }, { "epoch": 0.022, "grad_norm": 0.8522130846977234, "grad_norm_var": 0.029996476232309735, "learning_rate": 0.0001, "loss": 30366.8828, "loss/crossentropy": 2.460566997528076, "loss/hidden": 0.404296875, "loss/logits": 0.096153125166893, "loss/reg": 30366.3828125, "step": 880 }, { "epoch": 0.022025, "grad_norm": 0.9695855379104614, "grad_norm_var": 0.030150554154386323, "learning_rate": 0.0001, "loss": 30258.8926, "loss/crossentropy": 2.2949836254119873, "loss/hidden": 0.4140625, "loss/logits": 0.08789625763893127, "loss/reg": 30258.390625, "step": 881 }, { "epoch": 0.02205, "grad_norm": 0.9067624807357788, "grad_norm_var": 0.029714434778758524, "learning_rate": 0.0001, "loss": 30152.0566, "loss/crossentropy": 2.5709807872772217, "loss/hidden": 0.404296875, "loss/logits": 0.08980227261781693, "loss/reg": 30151.5625, "step": 882 }, { "epoch": 0.022075, "grad_norm": 3.0597167015075684, "grad_norm_var": 0.31695078213330063, "learning_rate": 0.0001, "loss": 30043.8809, "loss/crossentropy": 2.7180075645446777, "loss/hidden": 0.46875, "loss/logits": 0.1211073249578476, "loss/reg": 30043.291015625, "step": 883 }, { "epoch": 0.0221, "grad_norm": 0.949224591255188, "grad_norm_var": 0.308036031196324, "learning_rate": 0.0001, "loss": 29936.375, "loss/crossentropy": 2.387786388397217, "loss/hidden": 0.4296875, "loss/logits": 0.09997786581516266, "loss/reg": 29935.845703125, "step": 884 }, { "epoch": 0.022125, "grad_norm": 1.128546118736267, "grad_norm_var": 0.3024459582136742, "learning_rate": 0.0001, "loss": 29830.6816, "loss/crossentropy": 2.5146074295043945, "loss/hidden": 0.44140625, "loss/logits": 0.12338151037693024, "loss/reg": 29830.1171875, "step": 885 }, { "epoch": 0.02215, "grad_norm": 0.7824976444244385, "grad_norm_var": 0.3011776371345623, "learning_rate": 0.0001, "loss": 29722.6387, "loss/crossentropy": 2.3945186138153076, "loss/hidden": 0.3984375, "loss/logits": 0.08712532371282578, "loss/reg": 29722.15234375, "step": 886 }, { "epoch": 0.022175, "grad_norm": 1.2043960094451904, "grad_norm_var": 0.2942602550435301, "learning_rate": 0.0001, "loss": 29617.1387, "loss/crossentropy": 2.932788372039795, "loss/hidden": 0.4296875, "loss/logits": 0.09943626821041107, "loss/reg": 29616.609375, "step": 887 }, { "epoch": 0.0222, "grad_norm": 1.1389050483703613, "grad_norm_var": 0.28975465213318613, "learning_rate": 0.0001, "loss": 29510.8145, "loss/crossentropy": 2.4214000701904297, "loss/hidden": 0.419921875, "loss/logits": 0.09680774062871933, "loss/reg": 29510.296875, "step": 888 }, { "epoch": 0.022225, "grad_norm": 0.7648321986198425, "grad_norm_var": 0.2937095091789043, "learning_rate": 0.0001, "loss": 29406.875, "loss/crossentropy": 2.5498549938201904, "loss/hidden": 0.40234375, "loss/logits": 0.09309667348861694, "loss/reg": 29406.37890625, "step": 889 }, { "epoch": 0.02225, "grad_norm": 1.1869903802871704, "grad_norm_var": 0.2903910837602872, "learning_rate": 0.0001, "loss": 29301.8965, "loss/crossentropy": 2.93878436088562, "loss/hidden": 0.443359375, "loss/logits": 0.10722556710243225, "loss/reg": 29301.345703125, "step": 890 }, { "epoch": 0.022275, "grad_norm": 0.8767397999763489, "grad_norm_var": 0.29405314307700603, "learning_rate": 0.0001, "loss": 29197.5547, "loss/crossentropy": 2.410872459411621, "loss/hidden": 0.431640625, "loss/logits": 0.10561473667621613, "loss/reg": 29197.017578125, "step": 891 }, { "epoch": 0.0223, "grad_norm": 0.821043848991394, "grad_norm_var": 0.29781408579728175, "learning_rate": 0.0001, "loss": 29093.6172, "loss/crossentropy": 2.512385845184326, "loss/hidden": 0.4140625, "loss/logits": 0.09107698500156403, "loss/reg": 29093.111328125, "step": 892 }, { "epoch": 0.022325, "grad_norm": 0.9101580381393433, "grad_norm_var": 0.2976862548765704, "learning_rate": 0.0001, "loss": 28989.8184, "loss/crossentropy": 2.5767509937286377, "loss/hidden": 0.396484375, "loss/logits": 0.10410018265247345, "loss/reg": 28989.318359375, "step": 893 }, { "epoch": 0.02235, "grad_norm": 0.9581095576286316, "grad_norm_var": 0.2983840181156954, "learning_rate": 0.0001, "loss": 28887.0391, "loss/crossentropy": 2.3874075412750244, "loss/hidden": 0.423828125, "loss/logits": 0.09900985658168793, "loss/reg": 28886.515625, "step": 894 }, { "epoch": 0.022375, "grad_norm": 0.9015693664550781, "grad_norm_var": 0.29559285776519056, "learning_rate": 0.0001, "loss": 28784.6133, "loss/crossentropy": 2.7540664672851562, "loss/hidden": 0.427734375, "loss/logits": 0.11583462357521057, "loss/reg": 28784.0703125, "step": 895 }, { "epoch": 0.0224, "grad_norm": 0.956749439239502, "grad_norm_var": 0.2929865413509854, "learning_rate": 0.0001, "loss": 28682.1367, "loss/crossentropy": 2.338555335998535, "loss/hidden": 0.404296875, "loss/logits": 0.11809753626585007, "loss/reg": 28681.615234375, "step": 896 }, { "epoch": 0.022425, "grad_norm": 1.0482844114303589, "grad_norm_var": 0.29206037588530465, "learning_rate": 0.0001, "loss": 28580.9648, "loss/crossentropy": 2.7794132232666016, "loss/hidden": 0.419921875, "loss/logits": 0.11443160474300385, "loss/reg": 28580.4296875, "step": 897 }, { "epoch": 0.02245, "grad_norm": 0.7482628226280212, "grad_norm_var": 0.29770702312698105, "learning_rate": 0.0001, "loss": 28479.0078, "loss/crossentropy": 2.2662155628204346, "loss/hidden": 0.40234375, "loss/logits": 0.08552417904138565, "loss/reg": 28478.51953125, "step": 898 }, { "epoch": 0.022475, "grad_norm": 0.782559335231781, "grad_norm_var": 0.023674645747053195, "learning_rate": 0.0001, "loss": 28379.8262, "loss/crossentropy": 2.4198241233825684, "loss/hidden": 0.3984375, "loss/logits": 0.09736079722642899, "loss/reg": 28379.330078125, "step": 899 }, { "epoch": 0.0225, "grad_norm": 0.8299167156219482, "grad_norm_var": 0.024535734648487056, "learning_rate": 0.0001, "loss": 28279.8398, "loss/crossentropy": 2.6819629669189453, "loss/hidden": 0.408203125, "loss/logits": 0.08226903527975082, "loss/reg": 28279.349609375, "step": 900 }, { "epoch": 0.022525, "grad_norm": 1.000473976135254, "grad_norm_var": 0.022340753054793307, "learning_rate": 0.0001, "loss": 28180.084, "loss/crossentropy": 2.6211347579956055, "loss/hidden": 0.41796875, "loss/logits": 0.12049969285726547, "loss/reg": 28179.544921875, "step": 901 }, { "epoch": 0.02255, "grad_norm": 1.1626721620559692, "grad_norm_var": 0.023797399957489286, "learning_rate": 0.0001, "loss": 28080.8613, "loss/crossentropy": 2.5171658992767334, "loss/hidden": 0.43359375, "loss/logits": 0.11568973958492279, "loss/reg": 28080.3125, "step": 902 }, { "epoch": 0.022575, "grad_norm": 0.8448003530502319, "grad_norm_var": 0.01995660184935984, "learning_rate": 0.0001, "loss": 27981.7832, "loss/crossentropy": 2.484351396560669, "loss/hidden": 0.392578125, "loss/logits": 0.08854812383651733, "loss/reg": 27981.302734375, "step": 903 }, { "epoch": 0.0226, "grad_norm": 0.9268690347671509, "grad_norm_var": 0.016952505261794554, "learning_rate": 0.0001, "loss": 27883.5234, "loss/crossentropy": 2.4348769187927246, "loss/hidden": 0.421875, "loss/logits": 0.1028188019990921, "loss/reg": 27882.998046875, "step": 904 }, { "epoch": 0.022625, "grad_norm": 1.214880347251892, "grad_norm_var": 0.020300281621865146, "learning_rate": 0.0001, "loss": 27786.293, "loss/crossentropy": 2.2924892902374268, "loss/hidden": 0.421875, "loss/logits": 0.09746156632900238, "loss/reg": 27785.7734375, "step": 905 }, { "epoch": 0.02265, "grad_norm": 1.4136064052581787, "grad_norm_var": 0.03072723747541867, "learning_rate": 0.0001, "loss": 27689.4258, "loss/crossentropy": 2.643136978149414, "loss/hidden": 0.42578125, "loss/logits": 0.09651921689510345, "loss/reg": 27688.904296875, "step": 906 }, { "epoch": 0.022675, "grad_norm": 0.8881863355636597, "grad_norm_var": 0.03060485399788484, "learning_rate": 0.0001, "loss": 27592.2148, "loss/crossentropy": 2.507758140563965, "loss/hidden": 0.419921875, "loss/logits": 0.09700015187263489, "loss/reg": 27591.697265625, "step": 907 }, { "epoch": 0.0227, "grad_norm": 0.7471372485160828, "grad_norm_var": 0.03234519435069997, "learning_rate": 0.0001, "loss": 27496.6816, "loss/crossentropy": 2.585388422012329, "loss/hidden": 0.384765625, "loss/logits": 0.08749718219041824, "loss/reg": 27496.208984375, "step": 908 }, { "epoch": 0.022725, "grad_norm": 0.8215914368629456, "grad_norm_var": 0.033405009202436026, "learning_rate": 0.0001, "loss": 27400.1914, "loss/crossentropy": 2.6576366424560547, "loss/hidden": 0.42578125, "loss/logits": 0.09847737848758698, "loss/reg": 27399.66796875, "step": 909 }, { "epoch": 0.02275, "grad_norm": 0.7689388990402222, "grad_norm_var": 0.03550905349180364, "learning_rate": 0.0001, "loss": 27299.5469, "loss/crossentropy": 2.4018616676330566, "loss/hidden": 0.400390625, "loss/logits": 0.09672288596630096, "loss/reg": 27299.048828125, "step": 910 }, { "epoch": 0.022775, "grad_norm": 1.1210814714431763, "grad_norm_var": 0.037365672903085984, "learning_rate": 0.0001, "loss": 27203.0352, "loss/crossentropy": 2.9312374591827393, "loss/hidden": 0.427734375, "loss/logits": 0.09397996217012405, "loss/reg": 27202.513671875, "step": 911 }, { "epoch": 0.0228, "grad_norm": 1.0522981882095337, "grad_norm_var": 0.03796173486365054, "learning_rate": 0.0001, "loss": 27104.9355, "loss/crossentropy": 2.7311244010925293, "loss/hidden": 0.41796875, "loss/logits": 0.09144529700279236, "loss/reg": 27104.42578125, "step": 912 }, { "epoch": 0.022825, "grad_norm": 2.082756757736206, "grad_norm_var": 0.11692244041732346, "learning_rate": 0.0001, "loss": 27008.9414, "loss/crossentropy": 2.760934352874756, "loss/hidden": 0.455078125, "loss/logits": 0.109530970454216, "loss/reg": 27008.376953125, "step": 913 }, { "epoch": 0.02285, "grad_norm": 1.009446382522583, "grad_norm_var": 0.1115356391304008, "learning_rate": 0.0001, "loss": 26912.2383, "loss/crossentropy": 2.480273962020874, "loss/hidden": 0.427734375, "loss/logits": 0.08872679620981216, "loss/reg": 26911.72265625, "step": 914 }, { "epoch": 0.022875, "grad_norm": 1.0722192525863647, "grad_norm_var": 0.10677117689794256, "learning_rate": 0.0001, "loss": 26816.4785, "loss/crossentropy": 2.6407546997070312, "loss/hidden": 0.4609375, "loss/logits": 0.11386647820472717, "loss/reg": 26815.904296875, "step": 915 }, { "epoch": 0.0229, "grad_norm": 0.8702617883682251, "grad_norm_var": 0.10563626345524095, "learning_rate": 0.0001, "loss": 26718.4863, "loss/crossentropy": 2.5538382530212402, "loss/hidden": 0.4140625, "loss/logits": 0.08729642629623413, "loss/reg": 26717.984375, "step": 916 }, { "epoch": 0.022925, "grad_norm": 1.1217052936553955, "grad_norm_var": 0.10555503691690002, "learning_rate": 0.0001, "loss": 26623.6973, "loss/crossentropy": 2.3956480026245117, "loss/hidden": 0.462890625, "loss/logits": 0.10590574890375137, "loss/reg": 26623.12890625, "step": 917 }, { "epoch": 0.02295, "grad_norm": 0.9607998728752136, "grad_norm_var": 0.10560506580399846, "learning_rate": 0.0001, "loss": 26527.5898, "loss/crossentropy": 2.681879758834839, "loss/hidden": 0.39453125, "loss/logits": 0.09322258085012436, "loss/reg": 26527.1015625, "step": 918 }, { "epoch": 0.022975, "grad_norm": 1.241347074508667, "grad_norm_var": 0.10419840550631591, "learning_rate": 0.0001, "loss": 26432.4277, "loss/crossentropy": 2.335767984390259, "loss/hidden": 0.4296875, "loss/logits": 0.09113731980323792, "loss/reg": 26431.90625, "step": 919 }, { "epoch": 0.023, "grad_norm": 0.9808320999145508, "grad_norm_var": 0.1032637210382731, "learning_rate": 0.0001, "loss": 26331.9551, "loss/crossentropy": 2.3959317207336426, "loss/hidden": 0.427734375, "loss/logits": 0.09294003993272781, "loss/reg": 26331.43359375, "step": 920 }, { "epoch": 0.023025, "grad_norm": 1.0856369733810425, "grad_norm_var": 0.10207718996882348, "learning_rate": 0.0001, "loss": 26237.1758, "loss/crossentropy": 2.653390407562256, "loss/hidden": 0.431640625, "loss/logits": 0.10660263895988464, "loss/reg": 26236.63671875, "step": 921 }, { "epoch": 0.02305, "grad_norm": 1.050618052482605, "grad_norm_var": 0.09403867871835095, "learning_rate": 0.0001, "loss": 26142.5957, "loss/crossentropy": 2.7401986122131348, "loss/hidden": 0.423828125, "loss/logits": 0.1003125011920929, "loss/reg": 26142.072265625, "step": 922 }, { "epoch": 0.023075, "grad_norm": 0.8106462359428406, "grad_norm_var": 0.09613576783978507, "learning_rate": 0.0001, "loss": 26050.1152, "loss/crossentropy": 2.6547510623931885, "loss/hidden": 0.421875, "loss/logits": 0.09801352024078369, "loss/reg": 26049.595703125, "step": 923 }, { "epoch": 0.0231, "grad_norm": 0.7937747240066528, "grad_norm_var": 0.0943894508943714, "learning_rate": 0.0001, "loss": 25953.9883, "loss/crossentropy": 2.4390735626220703, "loss/hidden": 0.423828125, "loss/logits": 0.0942004844546318, "loss/reg": 25953.470703125, "step": 924 }, { "epoch": 0.023125, "grad_norm": 0.851125955581665, "grad_norm_var": 0.09353369249125756, "learning_rate": 0.0001, "loss": 25861.0723, "loss/crossentropy": 2.7774431705474854, "loss/hidden": 0.4296875, "loss/logits": 0.1062496230006218, "loss/reg": 25860.537109375, "step": 925 }, { "epoch": 0.02315, "grad_norm": 0.9323251843452454, "grad_norm_var": 0.08897920516345294, "learning_rate": 0.0001, "loss": 25768.9844, "loss/crossentropy": 2.5458085536956787, "loss/hidden": 0.43359375, "loss/logits": 0.10097949951887131, "loss/reg": 25768.44921875, "step": 926 }, { "epoch": 0.023175, "grad_norm": 1.0372633934020996, "grad_norm_var": 0.08878936235039156, "learning_rate": 0.0001, "loss": 25677.8711, "loss/crossentropy": 2.54379940032959, "loss/hidden": 0.431640625, "loss/logits": 0.09122653305530548, "loss/reg": 25677.34765625, "step": 927 }, { "epoch": 0.0232, "grad_norm": 1.5073013305664062, "grad_norm_var": 0.10128768195023512, "learning_rate": 0.0001, "loss": 25585.4941, "loss/crossentropy": 2.9681735038757324, "loss/hidden": 0.451171875, "loss/logits": 0.09525079280138016, "loss/reg": 25584.947265625, "step": 928 }, { "epoch": 0.023225, "grad_norm": 1.118074655532837, "grad_norm_var": 0.031501517644075355, "learning_rate": 0.0001, "loss": 25485.7266, "loss/crossentropy": 2.705671548843384, "loss/hidden": 0.4296875, "loss/logits": 0.09706344455480576, "loss/reg": 25485.19921875, "step": 929 }, { "epoch": 0.02325, "grad_norm": 0.792195200920105, "grad_norm_var": 0.03498046946117814, "learning_rate": 0.0001, "loss": 25394.082, "loss/crossentropy": 2.669508218765259, "loss/hidden": 0.42578125, "loss/logits": 0.09703344851732254, "loss/reg": 25393.55859375, "step": 930 }, { "epoch": 0.023275, "grad_norm": 0.9936244487762451, "grad_norm_var": 0.03475783662954665, "learning_rate": 0.0001, "loss": 25303.7051, "loss/crossentropy": 2.5820255279541016, "loss/hidden": 0.439453125, "loss/logits": 0.09157466143369675, "loss/reg": 25303.173828125, "step": 931 }, { "epoch": 0.0233, "grad_norm": 0.9209782481193542, "grad_norm_var": 0.03397892892039251, "learning_rate": 0.0001, "loss": 25212.748, "loss/crossentropy": 2.3748815059661865, "loss/hidden": 0.4140625, "loss/logits": 0.09344466030597687, "loss/reg": 25212.240234375, "step": 932 }, { "epoch": 0.023325, "grad_norm": 1.307304859161377, "grad_norm_var": 0.038837048116851194, "learning_rate": 0.0001, "loss": 25124.3164, "loss/crossentropy": 2.3792972564697266, "loss/hidden": 0.466796875, "loss/logits": 0.12809084355831146, "loss/reg": 25123.720703125, "step": 933 }, { "epoch": 0.02335, "grad_norm": 0.8441806435585022, "grad_norm_var": 0.04066961683942717, "learning_rate": 0.0001, "loss": 25034.1055, "loss/crossentropy": 2.708909273147583, "loss/hidden": 0.423828125, "loss/logits": 0.0946684405207634, "loss/reg": 25033.587890625, "step": 934 }, { "epoch": 0.023375, "grad_norm": 1.0004233121871948, "grad_norm_var": 0.03708106564219798, "learning_rate": 0.0001, "loss": 24944.4883, "loss/crossentropy": 2.5024023056030273, "loss/hidden": 0.435546875, "loss/logits": 0.09347517043352127, "loss/reg": 24943.958984375, "step": 935 }, { "epoch": 0.0234, "grad_norm": 0.9585517644882202, "grad_norm_var": 0.037173917869692256, "learning_rate": 0.0001, "loss": 24855.9492, "loss/crossentropy": 2.396317958831787, "loss/hidden": 0.4296875, "loss/logits": 0.09736696630716324, "loss/reg": 24855.421875, "step": 936 }, { "epoch": 0.023425, "grad_norm": 0.9499222040176392, "grad_norm_var": 0.03677999916784411, "learning_rate": 0.0001, "loss": 24767.418, "loss/crossentropy": 2.8797192573547363, "loss/hidden": 0.439453125, "loss/logits": 0.10986952483654022, "loss/reg": 24766.869140625, "step": 937 }, { "epoch": 0.02345, "grad_norm": 0.7930538654327393, "grad_norm_var": 0.03890523162507543, "learning_rate": 0.0001, "loss": 24678.4062, "loss/crossentropy": 2.5853559970855713, "loss/hidden": 0.421875, "loss/logits": 0.09216812252998352, "loss/reg": 24677.892578125, "step": 938 }, { "epoch": 0.023475, "grad_norm": 1.0527880191802979, "grad_norm_var": 0.03724183284043681, "learning_rate": 0.0001, "loss": 24591.1797, "loss/crossentropy": 2.5778865814208984, "loss/hidden": 0.4375, "loss/logits": 0.10692226886749268, "loss/reg": 24590.634765625, "step": 939 }, { "epoch": 0.0235, "grad_norm": 1.0680421590805054, "grad_norm_var": 0.03473803065608056, "learning_rate": 0.0001, "loss": 24503.9961, "loss/crossentropy": 2.7273216247558594, "loss/hidden": 0.435546875, "loss/logits": 0.10270722210407257, "loss/reg": 24503.45703125, "step": 940 }, { "epoch": 0.023525, "grad_norm": 0.9111777544021606, "grad_norm_var": 0.033707766249989495, "learning_rate": 0.0001, "loss": 24417.6191, "loss/crossentropy": 2.6318414211273193, "loss/hidden": 0.419921875, "loss/logits": 0.09268413484096527, "loss/reg": 24417.107421875, "step": 941 }, { "epoch": 0.02355, "grad_norm": 1.0316659212112427, "grad_norm_var": 0.03327319310434256, "learning_rate": 0.0001, "loss": 24331.5488, "loss/crossentropy": 2.6445016860961914, "loss/hidden": 0.40234375, "loss/logits": 0.0935656726360321, "loss/reg": 24331.052734375, "step": 942 }, { "epoch": 0.023575, "grad_norm": 1.0706905126571655, "grad_norm_var": 0.03342928936908199, "learning_rate": 0.0001, "loss": 24245.291, "loss/crossentropy": 2.7278873920440674, "loss/hidden": 0.423828125, "loss/logits": 0.10185898840427399, "loss/reg": 24244.765625, "step": 943 }, { "epoch": 0.0236, "grad_norm": 0.9393065571784973, "grad_norm_var": 0.01668818496218057, "learning_rate": 0.0001, "loss": 24158.9805, "loss/crossentropy": 2.5154876708984375, "loss/hidden": 0.431640625, "loss/logits": 0.10179644078016281, "loss/reg": 24158.447265625, "step": 944 }, { "epoch": 0.023625, "grad_norm": 0.9378867149353027, "grad_norm_var": 0.015508247044760977, "learning_rate": 0.0001, "loss": 24072.2188, "loss/crossentropy": 2.459463119506836, "loss/hidden": 0.427734375, "loss/logits": 0.10269282758235931, "loss/reg": 24071.6875, "step": 945 }, { "epoch": 0.02365, "grad_norm": 0.9219256043434143, "grad_norm_var": 0.013428570358553079, "learning_rate": 0.0001, "loss": 23987.8887, "loss/crossentropy": 2.444058656692505, "loss/hidden": 0.43359375, "loss/logits": 0.10357135534286499, "loss/reg": 23987.3515625, "step": 946 }, { "epoch": 0.023675, "grad_norm": 0.8147755861282349, "grad_norm_var": 0.015134934386543305, "learning_rate": 0.0001, "loss": 23903.2949, "loss/crossentropy": 2.5425283908843994, "loss/hidden": 0.421875, "loss/logits": 0.09350921213626862, "loss/reg": 23902.779296875, "step": 947 }, { "epoch": 0.0237, "grad_norm": 1.0720521211624146, "grad_norm_var": 0.015570571374023704, "learning_rate": 0.0001, "loss": 23819.9102, "loss/crossentropy": 2.504666566848755, "loss/hidden": 0.4296875, "loss/logits": 0.1169600635766983, "loss/reg": 23819.36328125, "step": 948 }, { "epoch": 0.023725, "grad_norm": 1.1205313205718994, "grad_norm_var": 0.009590193786136429, "learning_rate": 0.0001, "loss": 23735.5586, "loss/crossentropy": 2.7423009872436523, "loss/hidden": 0.4296875, "loss/logits": 0.09194411337375641, "loss/reg": 23735.037109375, "step": 949 }, { "epoch": 0.02375, "grad_norm": 1.5967024564743042, "grad_norm_var": 0.03256612602235875, "learning_rate": 0.0001, "loss": 23652.8613, "loss/crossentropy": 2.4818520545959473, "loss/hidden": 0.439453125, "loss/logits": 0.10307301580905914, "loss/reg": 23652.318359375, "step": 950 }, { "epoch": 0.023775, "grad_norm": 1.3237601518630981, "grad_norm_var": 0.03847322984858147, "learning_rate": 0.0001, "loss": 23570.2109, "loss/crossentropy": 3.0349578857421875, "loss/hidden": 0.470703125, "loss/logits": 0.11495113372802734, "loss/reg": 23569.625, "step": 951 }, { "epoch": 0.0238, "grad_norm": 0.8153846859931946, "grad_norm_var": 0.041216976286479565, "learning_rate": 0.0001, "loss": 23485.4648, "loss/crossentropy": 2.4277260303497314, "loss/hidden": 0.4296875, "loss/logits": 0.10003092885017395, "loss/reg": 23484.935546875, "step": 952 }, { "epoch": 0.023825, "grad_norm": 1.0136277675628662, "grad_norm_var": 0.04082246968663174, "learning_rate": 0.0001, "loss": 23403.4863, "loss/crossentropy": 2.6772782802581787, "loss/hidden": 0.44140625, "loss/logits": 0.1092321127653122, "loss/reg": 23402.935546875, "step": 953 }, { "epoch": 0.02385, "grad_norm": 0.8215113878250122, "grad_norm_var": 0.039973231307667695, "learning_rate": 0.0001, "loss": 23320.4492, "loss/crossentropy": 2.495701789855957, "loss/hidden": 0.423828125, "loss/logits": 0.09646574407815933, "loss/reg": 23319.9296875, "step": 954 }, { "epoch": 0.023875, "grad_norm": 1.060551404953003, "grad_norm_var": 0.03999852732846573, "learning_rate": 0.0001, "loss": 23237.9766, "loss/crossentropy": 2.664013624191284, "loss/hidden": 0.44921875, "loss/logits": 0.11422315239906311, "loss/reg": 23237.4140625, "step": 955 }, { "epoch": 0.0239, "grad_norm": 0.9129847288131714, "grad_norm_var": 0.04076586557442507, "learning_rate": 0.0001, "loss": 23155.3926, "loss/crossentropy": 2.5501043796539307, "loss/hidden": 0.435546875, "loss/logits": 0.10178236663341522, "loss/reg": 23154.85546875, "step": 956 }, { "epoch": 0.023925, "grad_norm": 1.0732991695404053, "grad_norm_var": 0.03999608608578099, "learning_rate": 0.0001, "loss": 23073.875, "loss/crossentropy": 2.3961002826690674, "loss/hidden": 0.4453125, "loss/logits": 0.10903213918209076, "loss/reg": 23073.3203125, "step": 957 }, { "epoch": 0.02395, "grad_norm": 1.1402229070663452, "grad_norm_var": 0.04071453073050209, "learning_rate": 0.0001, "loss": 22992.459, "loss/crossentropy": 2.8703441619873047, "loss/hidden": 0.4453125, "loss/logits": 0.12756890058517456, "loss/reg": 22991.88671875, "step": 958 }, { "epoch": 0.023975, "grad_norm": 1.2905657291412354, "grad_norm_var": 0.04464461561989617, "learning_rate": 0.0001, "loss": 22911.1367, "loss/crossentropy": 2.1613481044769287, "loss/hidden": 0.4296875, "loss/logits": 0.10078255087137222, "loss/reg": 22910.60546875, "step": 959 }, { "epoch": 0.024, "grad_norm": 0.9392194151878357, "grad_norm_var": 0.04464594223829518, "learning_rate": 0.0001, "loss": 22831.9512, "loss/crossentropy": 2.4932808876037598, "loss/hidden": 0.423828125, "loss/logits": 0.08269266784191132, "loss/reg": 22831.4453125, "step": 960 }, { "epoch": 0.024025, "grad_norm": 1.8595917224884033, "grad_norm_var": 0.08354169265646237, "learning_rate": 0.0001, "loss": 22752.7676, "loss/crossentropy": 2.7113559246063232, "loss/hidden": 0.490234375, "loss/logits": 0.11296282708644867, "loss/reg": 22752.1640625, "step": 961 }, { "epoch": 0.02405, "grad_norm": 1.1560962200164795, "grad_norm_var": 0.08106413461811185, "learning_rate": 0.0001, "loss": 22672.8867, "loss/crossentropy": 2.613546848297119, "loss/hidden": 0.4296875, "loss/logits": 0.09537872672080994, "loss/reg": 22672.361328125, "step": 962 }, { "epoch": 0.024075, "grad_norm": 0.9497840404510498, "grad_norm_var": 0.07660671350333838, "learning_rate": 0.0001, "loss": 22593.0859, "loss/crossentropy": 2.448693037033081, "loss/hidden": 0.4296875, "loss/logits": 0.08571556955575943, "loss/reg": 22592.5703125, "step": 963 }, { "epoch": 0.0241, "grad_norm": 1.1213970184326172, "grad_norm_var": 0.0763505458177983, "learning_rate": 0.0001, "loss": 22512.8164, "loss/crossentropy": 2.826176166534424, "loss/hidden": 0.46875, "loss/logits": 0.10593421757221222, "loss/reg": 22512.2421875, "step": 964 }, { "epoch": 0.024125, "grad_norm": 1.1012262105941772, "grad_norm_var": 0.07641674904413306, "learning_rate": 0.0001, "loss": 22433.457, "loss/crossentropy": 2.9246420860290527, "loss/hidden": 0.435546875, "loss/logits": 0.09543861448764801, "loss/reg": 22432.92578125, "step": 965 }, { "epoch": 0.02415, "grad_norm": 1.0128264427185059, "grad_norm_var": 0.06185758565790037, "learning_rate": 0.0001, "loss": 22355.3984, "loss/crossentropy": 2.4664463996887207, "loss/hidden": 0.44140625, "loss/logits": 0.10364780575037003, "loss/reg": 22354.853515625, "step": 966 }, { "epoch": 0.024175, "grad_norm": 1.354954481124878, "grad_norm_var": 0.06285114347466039, "learning_rate": 0.0001, "loss": 22278.1406, "loss/crossentropy": 2.9408609867095947, "loss/hidden": 0.453125, "loss/logits": 0.11183148622512817, "loss/reg": 22277.576171875, "step": 967 }, { "epoch": 0.0242, "grad_norm": 1.3423038721084595, "grad_norm_var": 0.06010591884042792, "learning_rate": 0.0001, "loss": 22200.9824, "loss/crossentropy": 2.842505931854248, "loss/hidden": 0.462890625, "loss/logits": 0.09240270406007767, "loss/reg": 22200.427734375, "step": 968 }, { "epoch": 0.024225, "grad_norm": 0.9177575707435608, "grad_norm_var": 0.062223966885823205, "learning_rate": 0.0001, "loss": 22115.6211, "loss/crossentropy": 2.5726301670074463, "loss/hidden": 0.421875, "loss/logits": 0.08873818814754486, "loss/reg": 22115.111328125, "step": 969 }, { "epoch": 0.02425, "grad_norm": 0.8709549903869629, "grad_norm_var": 0.06035364597454086, "learning_rate": 0.0001, "loss": 22038.9238, "loss/crossentropy": 2.6931722164154053, "loss/hidden": 0.43359375, "loss/logits": 0.1035221666097641, "loss/reg": 22038.38671875, "step": 970 }, { "epoch": 0.024275, "grad_norm": 1.5222187042236328, "grad_norm_var": 0.0693084192185554, "learning_rate": 0.0001, "loss": 21961.6309, "loss/crossentropy": 2.6958839893341064, "loss/hidden": 0.466796875, "loss/logits": 0.11546257138252258, "loss/reg": 21961.048828125, "step": 971 }, { "epoch": 0.0243, "grad_norm": 1.1752046346664429, "grad_norm_var": 0.06495775773170204, "learning_rate": 0.0001, "loss": 21884.3262, "loss/crossentropy": 2.49680233001709, "loss/hidden": 0.46484375, "loss/logits": 0.11024495959281921, "loss/reg": 21883.751953125, "step": 972 }, { "epoch": 0.024325, "grad_norm": 0.93658846616745, "grad_norm_var": 0.06801115334468061, "learning_rate": 0.0001, "loss": 21807.9648, "loss/crossentropy": 2.5653347969055176, "loss/hidden": 0.427734375, "loss/logits": 0.08618831634521484, "loss/reg": 21807.451171875, "step": 973 }, { "epoch": 0.02435, "grad_norm": 1.0110540390014648, "grad_norm_var": 0.06953546700071542, "learning_rate": 0.0001, "loss": 21731.2305, "loss/crossentropy": 2.6715872287750244, "loss/hidden": 0.43359375, "loss/logits": 0.10801969468593597, "loss/reg": 21730.689453125, "step": 974 }, { "epoch": 0.024375, "grad_norm": 0.7632473111152649, "grad_norm_var": 0.07774221154281466, "learning_rate": 0.0001, "loss": 21654.6543, "loss/crossentropy": 2.3023362159729004, "loss/hidden": 0.39453125, "loss/logits": 0.08346745371818542, "loss/reg": 21654.17578125, "step": 975 }, { "epoch": 0.0244, "grad_norm": 0.892348051071167, "grad_norm_var": 0.07905400420977847, "learning_rate": 0.0001, "loss": 21579.6055, "loss/crossentropy": 2.7380852699279785, "loss/hidden": 0.423828125, "loss/logits": 0.09384720027446747, "loss/reg": 21579.087890625, "step": 976 }, { "epoch": 0.024425, "grad_norm": 1.179625153541565, "grad_norm_var": 0.04128092908533317, "learning_rate": 0.0001, "loss": 21505.2305, "loss/crossentropy": 2.5506160259246826, "loss/hidden": 0.43359375, "loss/logits": 0.11979290843009949, "loss/reg": 21504.677734375, "step": 977 }, { "epoch": 0.02445, "grad_norm": 0.9429466128349304, "grad_norm_var": 0.042006827373164884, "learning_rate": 0.0001, "loss": 21429.9922, "loss/crossentropy": 2.449470281600952, "loss/hidden": 0.443359375, "loss/logits": 0.11196134239435196, "loss/reg": 21429.4375, "step": 978 }, { "epoch": 0.024475, "grad_norm": 0.9729964733123779, "grad_norm_var": 0.04167338083207805, "learning_rate": 0.0001, "loss": 21356.1621, "loss/crossentropy": 2.6006290912628174, "loss/hidden": 0.435546875, "loss/logits": 0.09295792877674103, "loss/reg": 21355.6328125, "step": 979 }, { "epoch": 0.0245, "grad_norm": 1.66546630859375, "grad_norm_var": 0.06391321931942666, "learning_rate": 0.0001, "loss": 21283.2324, "loss/crossentropy": 2.4464926719665527, "loss/hidden": 0.49609375, "loss/logits": 0.15045657753944397, "loss/reg": 21282.5859375, "step": 980 }, { "epoch": 0.024525, "grad_norm": 1.159827709197998, "grad_norm_var": 0.06410729346084636, "learning_rate": 0.0001, "loss": 21208.8262, "loss/crossentropy": 2.473205089569092, "loss/hidden": 0.451171875, "loss/logits": 0.12011029571294785, "loss/reg": 21208.255859375, "step": 981 }, { "epoch": 0.02455, "grad_norm": 1.228529930114746, "grad_norm_var": 0.06429186090149078, "learning_rate": 0.0001, "loss": 21135.3398, "loss/crossentropy": 2.5950937271118164, "loss/hidden": 0.462890625, "loss/logits": 0.10239904373884201, "loss/reg": 21134.775390625, "step": 982 }, { "epoch": 0.024575, "grad_norm": 1.1297028064727783, "grad_norm_var": 0.06043656618327204, "learning_rate": 0.0001, "loss": 21061.5859, "loss/crossentropy": 2.5663630962371826, "loss/hidden": 0.44921875, "loss/logits": 0.11027559638023376, "loss/reg": 21061.02734375, "step": 983 }, { "epoch": 0.0246, "grad_norm": 0.8357163667678833, "grad_norm_var": 0.06057721585537804, "learning_rate": 0.0001, "loss": 20987.8066, "loss/crossentropy": 2.509295701980591, "loss/hidden": 0.435546875, "loss/logits": 0.0971388965845108, "loss/reg": 20987.2734375, "step": 984 }, { "epoch": 0.024625, "grad_norm": 0.9624331593513489, "grad_norm_var": 0.05976374893259659, "learning_rate": 0.0001, "loss": 20915.5469, "loss/crossentropy": 2.6645829677581787, "loss/hidden": 0.4296875, "loss/logits": 0.09732533991336823, "loss/reg": 20915.01953125, "step": 985 }, { "epoch": 0.02465, "grad_norm": 0.921421468257904, "grad_norm_var": 0.058529388388714056, "learning_rate": 0.0001, "loss": 20843.3809, "loss/crossentropy": 2.5907278060913086, "loss/hidden": 0.423828125, "loss/logits": 0.08527834713459015, "loss/reg": 20842.87109375, "step": 986 }, { "epoch": 0.024675, "grad_norm": 0.936118483543396, "grad_norm_var": 0.045535446772327605, "learning_rate": 0.0001, "loss": 20768.4512, "loss/crossentropy": 2.6672019958496094, "loss/hidden": 0.427734375, "loss/logits": 0.10828252136707306, "loss/reg": 20767.916015625, "step": 987 }, { "epoch": 0.0247, "grad_norm": 0.9817948341369629, "grad_norm_var": 0.04450477613690406, "learning_rate": 0.0001, "loss": 20696.8027, "loss/crossentropy": 2.604332447052002, "loss/hidden": 0.435546875, "loss/logits": 0.09101539850234985, "loss/reg": 20696.275390625, "step": 988 }, { "epoch": 0.024725, "grad_norm": 0.8676700592041016, "grad_norm_var": 0.045682872994934995, "learning_rate": 0.0001, "loss": 20624.293, "loss/crossentropy": 2.532348394393921, "loss/hidden": 0.421875, "loss/logits": 0.07284273207187653, "loss/reg": 20623.798828125, "step": 989 }, { "epoch": 0.02475, "grad_norm": 0.9078096151351929, "grad_norm_var": 0.04658485699583158, "learning_rate": 0.0001, "loss": 20553.4355, "loss/crossentropy": 2.8214199542999268, "loss/hidden": 0.4296875, "loss/logits": 0.09346893429756165, "loss/reg": 20552.912109375, "step": 990 }, { "epoch": 0.024775, "grad_norm": 0.9056034088134766, "grad_norm_var": 0.042945254504367, "learning_rate": 0.0001, "loss": 20482.8789, "loss/crossentropy": 2.4334371089935303, "loss/hidden": 0.44921875, "loss/logits": 0.0925212949514389, "loss/reg": 20482.337890625, "step": 991 }, { "epoch": 0.0248, "grad_norm": 1.03048837184906, "grad_norm_var": 0.041591031305633054, "learning_rate": 0.0001, "loss": 20411.1191, "loss/crossentropy": 2.5874595642089844, "loss/hidden": 0.466796875, "loss/logits": 0.08578906953334808, "loss/reg": 20410.56640625, "step": 992 }, { "epoch": 0.024825, "grad_norm": 0.9951749444007874, "grad_norm_var": 0.04026533365937889, "learning_rate": 0.0001, "loss": 20341.043, "loss/crossentropy": 2.9441099166870117, "loss/hidden": 0.431640625, "loss/logits": 0.10317258536815643, "loss/reg": 20340.5078125, "step": 993 }, { "epoch": 0.02485, "grad_norm": 0.7870285511016846, "grad_norm_var": 0.04354733117220501, "learning_rate": 0.0001, "loss": 20270.502, "loss/crossentropy": 2.565255641937256, "loss/hidden": 0.431640625, "loss/logits": 0.10767176747322083, "loss/reg": 20269.962890625, "step": 994 }, { "epoch": 0.024875, "grad_norm": 1.132638692855835, "grad_norm_var": 0.044182544542965836, "learning_rate": 0.0001, "loss": 20200.2891, "loss/crossentropy": 2.9740242958068848, "loss/hidden": 0.4453125, "loss/logits": 0.10422468930482864, "loss/reg": 20199.740234375, "step": 995 }, { "epoch": 0.0249, "grad_norm": 0.9809736013412476, "grad_norm_var": 0.015283599422464636, "learning_rate": 0.0001, "loss": 20130.6875, "loss/crossentropy": 2.6761510372161865, "loss/hidden": 0.443359375, "loss/logits": 0.11210547387599945, "loss/reg": 20130.1328125, "step": 996 }, { "epoch": 0.024925, "grad_norm": 1.2750529050827026, "grad_norm_var": 0.018796527883273944, "learning_rate": 0.0001, "loss": 20061.7246, "loss/crossentropy": 2.713996171951294, "loss/hidden": 0.4765625, "loss/logits": 0.12018194794654846, "loss/reg": 20061.126953125, "step": 997 }, { "epoch": 0.02495, "grad_norm": 1.0360372066497803, "grad_norm_var": 0.015051539828577971, "learning_rate": 0.0001, "loss": 19993.0957, "loss/crossentropy": 2.730985641479492, "loss/hidden": 0.431640625, "loss/logits": 0.08067913353443146, "loss/reg": 19992.583984375, "step": 998 }, { "epoch": 0.024975, "grad_norm": 0.8395718336105347, "grad_norm_var": 0.01453511121293259, "learning_rate": 0.0001, "loss": 19919.8027, "loss/crossentropy": 2.7646329402923584, "loss/hidden": 0.4296875, "loss/logits": 0.09436003118753433, "loss/reg": 19919.279296875, "step": 999 }, { "epoch": 0.025, "grad_norm": 1.1801493167877197, "grad_norm_var": 0.016140098413421084, "learning_rate": 0.0001, "loss": 19851.1016, "loss/crossentropy": 3.0086069107055664, "loss/hidden": 0.47265625, "loss/logits": 0.10071013867855072, "loss/reg": 19850.52734375, "step": 1000 }, { "epoch": 0.025025, "grad_norm": 0.9698167443275452, "grad_norm_var": 0.016122521849981183, "learning_rate": 0.0001, "loss": 19781.9707, "loss/crossentropy": 2.813955068588257, "loss/hidden": 0.427734375, "loss/logits": 0.0990939736366272, "loss/reg": 19781.443359375, "step": 1001 }, { "epoch": 0.02505, "grad_norm": 0.8948052525520325, "grad_norm_var": 0.016389621821141617, "learning_rate": 0.0001, "loss": 19714.3203, "loss/crossentropy": 2.843393325805664, "loss/hidden": 0.427734375, "loss/logits": 0.09265361726284027, "loss/reg": 19713.80078125, "step": 1002 }, { "epoch": 0.025075, "grad_norm": 0.9143559336662292, "grad_norm_var": 0.016553939447787987, "learning_rate": 0.0001, "loss": 19647.4551, "loss/crossentropy": 2.903536558151245, "loss/hidden": 0.419921875, "loss/logits": 0.08771992474794388, "loss/reg": 19646.947265625, "step": 1003 }, { "epoch": 0.0251, "grad_norm": 0.8601119518280029, "grad_norm_var": 0.017469476934411195, "learning_rate": 0.0001, "loss": 19580.8223, "loss/crossentropy": 2.6393449306488037, "loss/hidden": 0.42578125, "loss/logits": 0.08560352772474289, "loss/reg": 19580.310546875, "step": 1004 }, { "epoch": 0.025125, "grad_norm": 1.1371523141860962, "grad_norm_var": 0.01820280433152129, "learning_rate": 0.0001, "loss": 19514.7129, "loss/crossentropy": 2.5199880599975586, "loss/hidden": 0.43359375, "loss/logits": 0.08653436601161957, "loss/reg": 19514.193359375, "step": 1005 }, { "epoch": 0.02515, "grad_norm": 1.3092695474624634, "grad_norm_var": 0.023853796794785314, "learning_rate": 0.0001, "loss": 19447.8926, "loss/crossentropy": 2.913530111312866, "loss/hidden": 0.4296875, "loss/logits": 0.09430601447820663, "loss/reg": 19447.369140625, "step": 1006 }, { "epoch": 0.025175, "grad_norm": 1.0356576442718506, "grad_norm_var": 0.023005009456557406, "learning_rate": 0.0001, "loss": 19380.7793, "loss/crossentropy": 2.6900110244750977, "loss/hidden": 0.43359375, "loss/logits": 0.07939346134662628, "loss/reg": 19380.265625, "step": 1007 }, { "epoch": 0.0252, "grad_norm": 0.8629568219184875, "grad_norm_var": 0.024606272502219525, "learning_rate": 0.0001, "loss": 19315.0293, "loss/crossentropy": 3.0058045387268066, "loss/hidden": 0.419921875, "loss/logits": 0.08857347071170807, "loss/reg": 19314.521484375, "step": 1008 }, { "epoch": 0.025225, "grad_norm": 1.0065983533859253, "grad_norm_var": 0.024587016577246778, "learning_rate": 0.0001, "loss": 19249.4434, "loss/crossentropy": 2.8189473152160645, "loss/hidden": 0.46875, "loss/logits": 0.1279560625553131, "loss/reg": 19248.845703125, "step": 1009 }, { "epoch": 0.02525, "grad_norm": 0.8587822318077087, "grad_norm_var": 0.022738422167231778, "learning_rate": 0.0001, "loss": 19183.877, "loss/crossentropy": 2.836906671524048, "loss/hidden": 0.42578125, "loss/logits": 0.08942188322544098, "loss/reg": 19183.361328125, "step": 1010 }, { "epoch": 0.025275, "grad_norm": 0.9735855460166931, "grad_norm_var": 0.021896248833306705, "learning_rate": 0.0001, "loss": 19119.3652, "loss/crossentropy": 2.576018810272217, "loss/hidden": 0.44921875, "loss/logits": 0.09336192905902863, "loss/reg": 19118.822265625, "step": 1011 }, { "epoch": 0.0253, "grad_norm": 0.8476600646972656, "grad_norm_var": 0.02349506820003923, "learning_rate": 0.0001, "loss": 19054.9141, "loss/crossentropy": 2.7196743488311768, "loss/hidden": 0.43359375, "loss/logits": 0.08736908435821533, "loss/reg": 19054.392578125, "step": 1012 }, { "epoch": 0.025325, "grad_norm": 1.0779842138290405, "grad_norm_var": 0.018697648296842277, "learning_rate": 0.0001, "loss": 18990.8809, "loss/crossentropy": 2.90828537940979, "loss/hidden": 0.435546875, "loss/logits": 0.09901103377342224, "loss/reg": 18990.345703125, "step": 1013 }, { "epoch": 0.02535, "grad_norm": 1.0291345119476318, "grad_norm_var": 0.01865621313936613, "learning_rate": 0.0001, "loss": 18926.7422, "loss/crossentropy": 2.5710670948028564, "loss/hidden": 0.435546875, "loss/logits": 0.09601636230945587, "loss/reg": 18926.2109375, "step": 1014 }, { "epoch": 0.025375, "grad_norm": 1.037187933921814, "grad_norm_var": 0.017203204215706987, "learning_rate": 0.0001, "loss": 18863.2285, "loss/crossentropy": 2.9131431579589844, "loss/hidden": 0.435546875, "loss/logits": 0.09500589966773987, "loss/reg": 18862.697265625, "step": 1015 }, { "epoch": 0.0254, "grad_norm": 0.9352390766143799, "grad_norm_var": 0.015059517364305879, "learning_rate": 0.0001, "loss": 18796.5547, "loss/crossentropy": 2.563702344894409, "loss/hidden": 0.4296875, "loss/logits": 0.09276743978261948, "loss/reg": 18796.033203125, "step": 1016 }, { "epoch": 0.025425, "grad_norm": 1.01382577419281, "grad_norm_var": 0.015095031731177247, "learning_rate": 0.0001, "loss": 18733.0645, "loss/crossentropy": 2.8074276447296143, "loss/hidden": 0.4296875, "loss/logits": 0.09178042411804199, "loss/reg": 18732.54296875, "step": 1017 }, { "epoch": 0.02545, "grad_norm": 0.8776202201843262, "grad_norm_var": 0.01532506926756169, "learning_rate": 0.0001, "loss": 18668.4941, "loss/crossentropy": 2.913670063018799, "loss/hidden": 0.421875, "loss/logits": 0.0906159058213234, "loss/reg": 18667.982421875, "step": 1018 }, { "epoch": 0.025475, "grad_norm": 0.9331912994384766, "grad_norm_var": 0.015167140696689533, "learning_rate": 0.0001, "loss": 18606.2109, "loss/crossentropy": 3.2162771224975586, "loss/hidden": 0.443359375, "loss/logits": 0.09538498520851135, "loss/reg": 18605.671875, "step": 1019 }, { "epoch": 0.0255, "grad_norm": 0.7805931568145752, "grad_norm_var": 0.01691029687888581, "learning_rate": 0.0001, "loss": 18543.0703, "loss/crossentropy": 2.379256248474121, "loss/hidden": 0.41796875, "loss/logits": 0.08216649293899536, "loss/reg": 18542.5703125, "step": 1020 }, { "epoch": 0.025525, "grad_norm": 1.035799264907837, "grad_norm_var": 0.015459385370742717, "learning_rate": 0.0001, "loss": 18481.0234, "loss/crossentropy": 2.7980172634124756, "loss/hidden": 0.4296875, "loss/logits": 0.09035004675388336, "loss/reg": 18480.50390625, "step": 1021 }, { "epoch": 0.02555, "grad_norm": 1.1896371841430664, "grad_norm_var": 0.011036991252401155, "learning_rate": 0.0001, "loss": 18417.9668, "loss/crossentropy": 2.533511161804199, "loss/hidden": 0.5, "loss/logits": 0.10565692186355591, "loss/reg": 18417.361328125, "step": 1022 }, { "epoch": 0.025575, "grad_norm": 1.1064552068710327, "grad_norm_var": 0.011984528561376573, "learning_rate": 0.0001, "loss": 18354.793, "loss/crossentropy": 2.736083507537842, "loss/hidden": 0.435546875, "loss/logits": 0.08818010985851288, "loss/reg": 18354.26953125, "step": 1023 }, { "epoch": 0.0256, "grad_norm": 0.9937368631362915, "grad_norm_var": 0.011136538349077263, "learning_rate": 0.0001, "loss": 18292.7598, "loss/crossentropy": 2.8839433193206787, "loss/hidden": 0.43359375, "loss/logits": 0.08257324993610382, "loss/reg": 18292.244140625, "step": 1024 }, { "epoch": 0.025625, "grad_norm": 1.1220980882644653, "grad_norm_var": 0.012363521642834444, "learning_rate": 0.0001, "loss": 18230.7578, "loss/crossentropy": 2.7472383975982666, "loss/hidden": 0.466796875, "loss/logits": 0.09016968309879303, "loss/reg": 18230.201171875, "step": 1025 }, { "epoch": 0.02565, "grad_norm": 0.7697837948799133, "grad_norm_var": 0.014395284169400214, "learning_rate": 0.0001, "loss": 18169.1855, "loss/crossentropy": 2.7969915866851807, "loss/hidden": 0.423828125, "loss/logits": 0.0929093211889267, "loss/reg": 18168.66796875, "step": 1026 }, { "epoch": 0.025675, "grad_norm": 0.9489984512329102, "grad_norm_var": 0.014463014787380013, "learning_rate": 0.0001, "loss": 18107.4141, "loss/crossentropy": 2.691138505935669, "loss/hidden": 0.427734375, "loss/logits": 0.07996124029159546, "loss/reg": 18106.90625, "step": 1027 }, { "epoch": 0.0257, "grad_norm": 0.977716863155365, "grad_norm_var": 0.013204760693864634, "learning_rate": 0.0001, "loss": 18045.3945, "loss/crossentropy": 2.789412260055542, "loss/hidden": 0.4296875, "loss/logits": 0.08714571595191956, "loss/reg": 18044.876953125, "step": 1028 }, { "epoch": 0.025725, "grad_norm": 1.0606443881988525, "grad_norm_var": 0.013018545875161122, "learning_rate": 0.0001, "loss": 17983.6035, "loss/crossentropy": 2.911621570587158, "loss/hidden": 0.43359375, "loss/logits": 0.08721598237752914, "loss/reg": 17983.08203125, "step": 1029 }, { "epoch": 0.02575, "grad_norm": 1.029701590538025, "grad_norm_var": 0.013021658868217875, "learning_rate": 0.0001, "loss": 17921.9004, "loss/crossentropy": 2.683985948562622, "loss/hidden": 0.451171875, "loss/logits": 0.10340292751789093, "loss/reg": 17921.345703125, "step": 1030 }, { "epoch": 0.025775, "grad_norm": 0.9352401494979858, "grad_norm_var": 0.013006222781651882, "learning_rate": 0.0001, "loss": 17860.1621, "loss/crossentropy": 2.820894718170166, "loss/hidden": 0.423828125, "loss/logits": 0.09438331425189972, "loss/reg": 17859.64453125, "step": 1031 }, { "epoch": 0.0258, "grad_norm": 1.0464032888412476, "grad_norm_var": 0.013087071999497308, "learning_rate": 0.0001, "loss": 17799.5801, "loss/crossentropy": 2.778580665588379, "loss/hidden": 0.431640625, "loss/logits": 0.09593655169010162, "loss/reg": 17799.052734375, "step": 1032 }, { "epoch": 0.025825, "grad_norm": 0.9822232723236084, "grad_norm_var": 0.013044211622835651, "learning_rate": 0.0001, "loss": 17738.7188, "loss/crossentropy": 3.0002071857452393, "loss/hidden": 0.43359375, "loss/logits": 0.08786912262439728, "loss/reg": 17738.197265625, "step": 1033 }, { "epoch": 0.02585, "grad_norm": 0.7570947408676147, "grad_norm_var": 0.015707684759895278, "learning_rate": 0.0001, "loss": 17678.4766, "loss/crossentropy": 2.6046817302703857, "loss/hidden": 0.4140625, "loss/logits": 0.07064966857433319, "loss/reg": 17677.9921875, "step": 1034 }, { "epoch": 0.025875, "grad_norm": 1.0941604375839233, "grad_norm_var": 0.01633682153482544, "learning_rate": 0.0001, "loss": 17619.3457, "loss/crossentropy": 2.923370838165283, "loss/hidden": 0.427734375, "loss/logits": 0.0829104632139206, "loss/reg": 17618.8359375, "step": 1035 }, { "epoch": 0.0259, "grad_norm": 0.908995509147644, "grad_norm_var": 0.0137925518327591, "learning_rate": 0.0001, "loss": 17559.7441, "loss/crossentropy": 2.863892078399658, "loss/hidden": 0.427734375, "loss/logits": 0.09001794457435608, "loss/reg": 17559.2265625, "step": 1036 }, { "epoch": 0.025925, "grad_norm": 0.9213618040084839, "grad_norm_var": 0.014025414720603162, "learning_rate": 0.0001, "loss": 17500.082, "loss/crossentropy": 2.89579176902771, "loss/hidden": 0.423828125, "loss/logits": 0.08792084455490112, "loss/reg": 17499.5703125, "step": 1037 }, { "epoch": 0.02595, "grad_norm": 0.781911313533783, "grad_norm_var": 0.013576918695598754, "learning_rate": 0.0001, "loss": 17440.498, "loss/crossentropy": 2.9025070667266846, "loss/hidden": 0.41796875, "loss/logits": 0.08638165146112442, "loss/reg": 17439.994140625, "step": 1038 }, { "epoch": 0.025975, "grad_norm": 1.1047182083129883, "grad_norm_var": 0.013544295980080993, "learning_rate": 0.0001, "loss": 17382.502, "loss/crossentropy": 2.6538867950439453, "loss/hidden": 0.447265625, "loss/logits": 0.08192922174930573, "loss/reg": 17381.97265625, "step": 1039 }, { "epoch": 0.026, "grad_norm": 0.8481978178024292, "grad_norm_var": 0.014304181767292675, "learning_rate": 0.0001, "loss": 17323.1992, "loss/crossentropy": 2.7071163654327393, "loss/hidden": 0.44140625, "loss/logits": 0.08912265300750732, "loss/reg": 17322.66796875, "step": 1040 }, { "epoch": 0.026025, "grad_norm": 0.892241358757019, "grad_norm_var": 0.012502882263243704, "learning_rate": 0.0001, "loss": 17265.3945, "loss/crossentropy": 3.038763999938965, "loss/hidden": 0.431640625, "loss/logits": 0.08531604707241058, "loss/reg": 17264.876953125, "step": 1041 }, { "epoch": 0.02605, "grad_norm": 1.0308339595794678, "grad_norm_var": 0.010795231235618511, "learning_rate": 0.0001, "loss": 17199.1367, "loss/crossentropy": 2.9324772357940674, "loss/hidden": 0.4296875, "loss/logits": 0.08472186326980591, "loss/reg": 17198.623046875, "step": 1042 }, { "epoch": 0.026075, "grad_norm": 0.9274694919586182, "grad_norm_var": 0.010848683179298983, "learning_rate": 0.0001, "loss": 17139.7949, "loss/crossentropy": 2.8589069843292236, "loss/hidden": 0.4375, "loss/logits": 0.08558094501495361, "loss/reg": 17139.271484375, "step": 1043 }, { "epoch": 0.0261, "grad_norm": 1.3131765127182007, "grad_norm_var": 0.01884521092897805, "learning_rate": 0.0001, "loss": 17080.8555, "loss/crossentropy": 2.664062023162842, "loss/hidden": 0.427734375, "loss/logits": 0.08755569905042648, "loss/reg": 17080.33984375, "step": 1044 }, { "epoch": 0.026125, "grad_norm": 1.1463737487792969, "grad_norm_var": 0.020258964221098318, "learning_rate": 0.0001, "loss": 17022.043, "loss/crossentropy": 2.744938373565674, "loss/hidden": 0.439453125, "loss/logits": 0.09745976328849792, "loss/reg": 17021.505859375, "step": 1045 }, { "epoch": 0.02615, "grad_norm": 1.1437360048294067, "grad_norm_var": 0.02178928738922002, "learning_rate": 0.0001, "loss": 16964.625, "loss/crossentropy": 3.15686297416687, "loss/hidden": 0.43359375, "loss/logits": 0.10271066427230835, "loss/reg": 16964.087890625, "step": 1046 }, { "epoch": 0.026175, "grad_norm": 1.1650415658950806, "grad_norm_var": 0.023423204944099264, "learning_rate": 0.0001, "loss": 16906.3867, "loss/crossentropy": 2.66678786277771, "loss/hidden": 0.443359375, "loss/logits": 0.11273590475320816, "loss/reg": 16905.830078125, "step": 1047 }, { "epoch": 0.0262, "grad_norm": 1.231393814086914, "grad_norm_var": 0.026608036552323223, "learning_rate": 0.0001, "loss": 16848.3691, "loss/crossentropy": 2.6733105182647705, "loss/hidden": 0.466796875, "loss/logits": 0.09650056064128876, "loss/reg": 16847.806640625, "step": 1048 }, { "epoch": 0.026225, "grad_norm": 0.9987756609916687, "grad_norm_var": 0.026551590936127904, "learning_rate": 0.0001, "loss": 16789.7051, "loss/crossentropy": 2.7861099243164062, "loss/hidden": 0.455078125, "loss/logits": 0.08518000692129135, "loss/reg": 16789.1640625, "step": 1049 }, { "epoch": 0.02625, "grad_norm": 0.8940253853797913, "grad_norm_var": 0.022985704412560048, "learning_rate": 0.0001, "loss": 16732.5352, "loss/crossentropy": 2.9168479442596436, "loss/hidden": 0.4296875, "loss/logits": 0.08574089407920837, "loss/reg": 16732.01953125, "step": 1050 }, { "epoch": 0.026275, "grad_norm": 0.9486713409423828, "grad_norm_var": 0.022969959845412412, "learning_rate": 0.0001, "loss": 16675.8574, "loss/crossentropy": 2.9892466068267822, "loss/hidden": 0.435546875, "loss/logits": 0.09711939096450806, "loss/reg": 16675.32421875, "step": 1051 }, { "epoch": 0.0263, "grad_norm": 1.0788503885269165, "grad_norm_var": 0.02234845552669828, "learning_rate": 0.0001, "loss": 16619.1816, "loss/crossentropy": 3.05783748626709, "loss/hidden": 0.4609375, "loss/logits": 0.08326160907745361, "loss/reg": 16618.63671875, "step": 1052 }, { "epoch": 0.026325, "grad_norm": 0.9396947622299194, "grad_norm_var": 0.022112037925466396, "learning_rate": 0.0001, "loss": 16562.8984, "loss/crossentropy": 2.759321451187134, "loss/hidden": 0.41015625, "loss/logits": 0.07942262291908264, "loss/reg": 16562.408203125, "step": 1053 }, { "epoch": 0.02635, "grad_norm": 0.9998372197151184, "grad_norm_var": 0.017934968504518863, "learning_rate": 0.0001, "loss": 16507.1777, "loss/crossentropy": 2.9365243911743164, "loss/hidden": 0.43359375, "loss/logits": 0.09559677541255951, "loss/reg": 16506.6484375, "step": 1054 }, { "epoch": 0.026375, "grad_norm": 0.9944206476211548, "grad_norm_var": 0.017764722025300238, "learning_rate": 0.0001, "loss": 16451.0293, "loss/crossentropy": 2.7758195400238037, "loss/hidden": 0.48046875, "loss/logits": 0.10010755807161331, "loss/reg": 16450.44921875, "step": 1055 }, { "epoch": 0.0264, "grad_norm": 0.9697908163070679, "grad_norm_var": 0.015667620645066178, "learning_rate": 0.0001, "loss": 16395.4805, "loss/crossentropy": 2.854970693588257, "loss/hidden": 0.435546875, "loss/logits": 0.09074226021766663, "loss/reg": 16394.955078125, "step": 1056 }, { "epoch": 0.026425, "grad_norm": 0.9877775311470032, "grad_norm_var": 0.014328562051209323, "learning_rate": 0.0001, "loss": 16340.9678, "loss/crossentropy": 2.782637357711792, "loss/hidden": 0.4609375, "loss/logits": 0.1002429649233818, "loss/reg": 16340.40625, "step": 1057 }, { "epoch": 0.02645, "grad_norm": 0.8147690296173096, "grad_norm_var": 0.017744210942317786, "learning_rate": 0.0001, "loss": 16285.5225, "loss/crossentropy": 2.6116652488708496, "loss/hidden": 0.431640625, "loss/logits": 0.08781687915325165, "loss/reg": 16285.0029296875, "step": 1058 }, { "epoch": 0.026475, "grad_norm": 0.8014287352561951, "grad_norm_var": 0.020537691856836307, "learning_rate": 0.0001, "loss": 16232.4189, "loss/crossentropy": 2.857954263687134, "loss/hidden": 0.43359375, "loss/logits": 0.09286174923181534, "loss/reg": 16231.892578125, "step": 1059 }, { "epoch": 0.0265, "grad_norm": 0.8132032752037048, "grad_norm_var": 0.01706595369942955, "learning_rate": 0.0001, "loss": 16179.2793, "loss/crossentropy": 2.768078088760376, "loss/hidden": 0.43359375, "loss/logits": 0.0950530394911766, "loss/reg": 16178.7509765625, "step": 1060 }, { "epoch": 0.026525, "grad_norm": 0.8404484987258911, "grad_norm_var": 0.01676066406379461, "learning_rate": 0.0001, "loss": 16125.2832, "loss/crossentropy": 2.6537022590637207, "loss/hidden": 0.423828125, "loss/logits": 0.07667912542819977, "loss/reg": 16124.7822265625, "step": 1061 }, { "epoch": 0.02655, "grad_norm": 0.9588916897773743, "grad_norm_var": 0.014771155375485737, "learning_rate": 0.0001, "loss": 16071.2188, "loss/crossentropy": 2.582611322402954, "loss/hidden": 0.423828125, "loss/logits": 0.08356771618127823, "loss/reg": 16070.7109375, "step": 1062 }, { "epoch": 0.026575, "grad_norm": 0.9827629327774048, "grad_norm_var": 0.011981449334387865, "learning_rate": 0.0001, "loss": 16017.4893, "loss/crossentropy": 2.8175463676452637, "loss/hidden": 0.435546875, "loss/logits": 0.0889303982257843, "loss/reg": 16016.96484375, "step": 1063 }, { "epoch": 0.0266, "grad_norm": 0.7705737352371216, "grad_norm_var": 0.008174274018235516, "learning_rate": 0.0001, "loss": 15963.8389, "loss/crossentropy": 2.9675939083099365, "loss/hidden": 0.423828125, "loss/logits": 0.08410842716693878, "loss/reg": 15963.3310546875, "step": 1064 }, { "epoch": 0.026625, "grad_norm": 0.7642571926116943, "grad_norm_var": 0.009292926304177485, "learning_rate": 0.0001, "loss": 15910.2979, "loss/crossentropy": 2.987457752227783, "loss/hidden": 0.435546875, "loss/logits": 0.0909990519285202, "loss/reg": 15909.771484375, "step": 1065 }, { "epoch": 0.02665, "grad_norm": 0.812102198600769, "grad_norm_var": 0.009886474071495129, "learning_rate": 0.0001, "loss": 15856.9854, "loss/crossentropy": 2.946333885192871, "loss/hidden": 0.4296875, "loss/logits": 0.0913858711719513, "loss/reg": 15856.4638671875, "step": 1066 }, { "epoch": 0.026675, "grad_norm": 0.857896625995636, "grad_norm_var": 0.00987100371605288, "learning_rate": 0.0001, "loss": 15803.7031, "loss/crossentropy": 2.797611951828003, "loss/hidden": 0.416015625, "loss/logits": 0.07527288794517517, "loss/reg": 15803.2119140625, "step": 1067 }, { "epoch": 0.0267, "grad_norm": 1.0861632823944092, "grad_norm_var": 0.010049544829034574, "learning_rate": 0.0001, "loss": 15751.3408, "loss/crossentropy": 2.6403746604919434, "loss/hidden": 0.46875, "loss/logits": 0.08439454436302185, "loss/reg": 15750.7880859375, "step": 1068 }, { "epoch": 0.026725, "grad_norm": 0.9127049446105957, "grad_norm_var": 0.009950880314127098, "learning_rate": 0.0001, "loss": 15698.7949, "loss/crossentropy": 2.682400703430176, "loss/hidden": 0.42578125, "loss/logits": 0.08712255954742432, "loss/reg": 15698.2822265625, "step": 1069 }, { "epoch": 0.02675, "grad_norm": 0.9740332365036011, "grad_norm_var": 0.009641912604132451, "learning_rate": 0.0001, "loss": 15646.957, "loss/crossentropy": 2.686271905899048, "loss/hidden": 0.44921875, "loss/logits": 0.08891947567462921, "loss/reg": 15646.4189453125, "step": 1070 }, { "epoch": 0.026775, "grad_norm": 1.1772500276565552, "grad_norm_var": 0.01412233905048681, "learning_rate": 0.0001, "loss": 15595.2686, "loss/crossentropy": 2.958514928817749, "loss/hidden": 0.43359375, "loss/logits": 0.08424527198076248, "loss/reg": 15594.7509765625, "step": 1071 }, { "epoch": 0.0268, "grad_norm": 1.1505613327026367, "grad_norm_var": 0.017659985188438164, "learning_rate": 0.0001, "loss": 15543.4844, "loss/crossentropy": 2.8109195232391357, "loss/hidden": 0.458984375, "loss/logits": 0.10685601830482483, "loss/reg": 15542.9189453125, "step": 1072 }, { "epoch": 0.026825, "grad_norm": 0.9034889936447144, "grad_norm_var": 0.017331644670570214, "learning_rate": 0.0001, "loss": 15491.6006, "loss/crossentropy": 2.8358302116394043, "loss/hidden": 0.427734375, "loss/logits": 0.086671844124794, "loss/reg": 15491.0859375, "step": 1073 }, { "epoch": 0.02685, "grad_norm": 0.9926300048828125, "grad_norm_var": 0.01696069494076884, "learning_rate": 0.0001, "loss": 15440.167, "loss/crossentropy": 2.7901105880737305, "loss/hidden": 0.419921875, "loss/logits": 0.08056454360485077, "loss/reg": 15439.6669921875, "step": 1074 }, { "epoch": 0.026875, "grad_norm": 0.8253371715545654, "grad_norm_var": 0.016602820780788348, "learning_rate": 0.0001, "loss": 15389.0439, "loss/crossentropy": 2.8071136474609375, "loss/hidden": 0.42578125, "loss/logits": 0.08567114174365997, "loss/reg": 15388.5322265625, "step": 1075 }, { "epoch": 0.0269, "grad_norm": 0.8556614518165588, "grad_norm_var": 0.016074706019418968, "learning_rate": 0.0001, "loss": 15338.8643, "loss/crossentropy": 2.960524797439575, "loss/hidden": 0.421875, "loss/logits": 0.0796973779797554, "loss/reg": 15338.3623046875, "step": 1076 }, { "epoch": 0.026925, "grad_norm": 0.8220919370651245, "grad_norm_var": 0.016312616474285932, "learning_rate": 0.0001, "loss": 15288.1484, "loss/crossentropy": 2.902122735977173, "loss/hidden": 0.419921875, "loss/logits": 0.08749879151582718, "loss/reg": 15287.640625, "step": 1077 }, { "epoch": 0.02695, "grad_norm": 0.9153377413749695, "grad_norm_var": 0.016251203350715674, "learning_rate": 0.0001, "loss": 15239.1123, "loss/crossentropy": 2.5998146533966064, "loss/hidden": 0.455078125, "loss/logits": 0.07559297233819962, "loss/reg": 15238.58203125, "step": 1078 }, { "epoch": 0.026975, "grad_norm": 1.333935260772705, "grad_norm_var": 0.026655112509867, "learning_rate": 0.0001, "loss": 15190.1279, "loss/crossentropy": 2.974548101425171, "loss/hidden": 0.48046875, "loss/logits": 0.17960259318351746, "loss/reg": 15189.4677734375, "step": 1079 }, { "epoch": 0.027, "grad_norm": 0.8889738321304321, "grad_norm_var": 0.024744090894334634, "learning_rate": 0.0001, "loss": 15138.1201, "loss/crossentropy": 2.7455484867095947, "loss/hidden": 0.439453125, "loss/logits": 0.09583721309900284, "loss/reg": 15137.5849609375, "step": 1080 }, { "epoch": 0.027025, "grad_norm": 0.7190319895744324, "grad_norm_var": 0.026019252870833044, "learning_rate": 0.0001, "loss": 15087.3291, "loss/crossentropy": 2.798892021179199, "loss/hidden": 0.42578125, "loss/logits": 0.0866219699382782, "loss/reg": 15086.81640625, "step": 1081 }, { "epoch": 0.02705, "grad_norm": 0.7696980834007263, "grad_norm_var": 0.026920904193799312, "learning_rate": 0.0001, "loss": 15037.7148, "loss/crossentropy": 2.596310615539551, "loss/hidden": 0.423828125, "loss/logits": 0.07713499665260315, "loss/reg": 15037.2138671875, "step": 1082 }, { "epoch": 0.027075, "grad_norm": 0.9159994721412659, "grad_norm_var": 0.026425733071442047, "learning_rate": 0.0001, "loss": 14988.4238, "loss/crossentropy": 2.9121432304382324, "loss/hidden": 0.43359375, "loss/logits": 0.08162932097911835, "loss/reg": 14987.908203125, "step": 1083 }, { "epoch": 0.0271, "grad_norm": 0.910456657409668, "grad_norm_var": 0.025228125400286894, "learning_rate": 0.0001, "loss": 14938.9277, "loss/crossentropy": 2.826249837875366, "loss/hidden": 0.4296875, "loss/logits": 0.08516315370798111, "loss/reg": 14938.4130859375, "step": 1084 }, { "epoch": 0.027125, "grad_norm": 1.0477243661880493, "grad_norm_var": 0.025845538428886157, "learning_rate": 0.0001, "loss": 14889.8086, "loss/crossentropy": 2.8449409008026123, "loss/hidden": 0.462890625, "loss/logits": 0.10157535970211029, "loss/reg": 14889.244140625, "step": 1085 }, { "epoch": 0.02715, "grad_norm": 2.7801320552825928, "grad_norm_var": 0.23547433160522477, "learning_rate": 0.0001, "loss": 14840.0693, "loss/crossentropy": 3.2276058197021484, "loss/hidden": 0.470703125, "loss/logits": 0.08429836481809616, "loss/reg": 14839.5146484375, "step": 1086 }, { "epoch": 0.027175, "grad_norm": 1.2570152282714844, "grad_norm_var": 0.2370868709294562, "learning_rate": 0.0001, "loss": 14790.6309, "loss/crossentropy": 2.885909080505371, "loss/hidden": 0.455078125, "loss/logits": 0.10632868111133575, "loss/reg": 14790.0693359375, "step": 1087 }, { "epoch": 0.0272, "grad_norm": 0.9222579598426819, "grad_norm_var": 0.23783146250132223, "learning_rate": 0.0001, "loss": 14741.3857, "loss/crossentropy": 2.917914390563965, "loss/hidden": 0.435546875, "loss/logits": 0.08871741592884064, "loss/reg": 14740.861328125, "step": 1088 }, { "epoch": 0.027225, "grad_norm": 1.1597723960876465, "grad_norm_var": 0.23680243597076794, "learning_rate": 0.0001, "loss": 14691.9111, "loss/crossentropy": 3.21909499168396, "loss/hidden": 0.466796875, "loss/logits": 0.08862360566854477, "loss/reg": 14691.35546875, "step": 1089 }, { "epoch": 0.02725, "grad_norm": 1.7188750505447388, "grad_norm_var": 0.262298855826787, "learning_rate": 0.0001, "loss": 14641.9336, "loss/crossentropy": 2.998014450073242, "loss/hidden": 0.4453125, "loss/logits": 0.08976680040359497, "loss/reg": 14641.3984375, "step": 1090 }, { "epoch": 0.027275, "grad_norm": 0.9482136964797974, "grad_norm_var": 0.2584944634799588, "learning_rate": 0.0001, "loss": 14591.6689, "loss/crossentropy": 2.530045986175537, "loss/hidden": 0.431640625, "loss/logits": 0.07906611263751984, "loss/reg": 14591.158203125, "step": 1091 }, { "epoch": 0.0273, "grad_norm": 0.7725463509559631, "grad_norm_var": 0.26188691520636426, "learning_rate": 0.0001, "loss": 14542.3398, "loss/crossentropy": 2.439523935317993, "loss/hidden": 0.435546875, "loss/logits": 0.09834860265254974, "loss/reg": 14541.8056640625, "step": 1092 }, { "epoch": 0.027325, "grad_norm": 0.824009358882904, "grad_norm_var": 0.2618115891237431, "learning_rate": 0.0001, "loss": 14493.7295, "loss/crossentropy": 2.9190967082977295, "loss/hidden": 0.435546875, "loss/logits": 0.08553117513656616, "loss/reg": 14493.2080078125, "step": 1093 }, { "epoch": 0.02735, "grad_norm": 0.9745771288871765, "grad_norm_var": 0.26043216037856104, "learning_rate": 0.0001, "loss": 14444.9678, "loss/crossentropy": 2.7135884761810303, "loss/hidden": 0.45703125, "loss/logits": 0.12357691675424576, "loss/reg": 14444.38671875, "step": 1094 }, { "epoch": 0.027375, "grad_norm": 0.9248599410057068, "grad_norm_var": 0.25930147487457583, "learning_rate": 0.0001, "loss": 14395.5244, "loss/crossentropy": 2.629840850830078, "loss/hidden": 0.4375, "loss/logits": 0.09772805869579315, "loss/reg": 14394.9892578125, "step": 1095 }, { "epoch": 0.0274, "grad_norm": 0.9113340973854065, "grad_norm_var": 0.25871584829829997, "learning_rate": 0.0001, "loss": 14347.1514, "loss/crossentropy": 2.892179250717163, "loss/hidden": 0.435546875, "loss/logits": 0.0860610157251358, "loss/reg": 14346.6298828125, "step": 1096 }, { "epoch": 0.027425, "grad_norm": 1.135790467262268, "grad_norm_var": 0.24855283365572234, "learning_rate": 0.0001, "loss": 14299.7197, "loss/crossentropy": 2.764049768447876, "loss/hidden": 0.435546875, "loss/logits": 0.0889464020729065, "loss/reg": 14299.1953125, "step": 1097 }, { "epoch": 0.02745, "grad_norm": 1.0498467683792114, "grad_norm_var": 0.24024881145662946, "learning_rate": 0.0001, "loss": 14252.5498, "loss/crossentropy": 2.697889804840088, "loss/hidden": 0.4375, "loss/logits": 0.10231277346611023, "loss/reg": 14252.009765625, "step": 1098 }, { "epoch": 0.027475, "grad_norm": 0.903654158115387, "grad_norm_var": 0.24062843081926943, "learning_rate": 0.0001, "loss": 14204.5557, "loss/crossentropy": 2.7106425762176514, "loss/hidden": 0.43359375, "loss/logits": 0.09402255713939667, "loss/reg": 14204.0283203125, "step": 1099 }, { "epoch": 0.0275, "grad_norm": 0.7978147864341736, "grad_norm_var": 0.24486993530923176, "learning_rate": 0.0001, "loss": 14156.9307, "loss/crossentropy": 2.6107664108276367, "loss/hidden": 0.423828125, "loss/logits": 0.07774809002876282, "loss/reg": 14156.4287109375, "step": 1100 }, { "epoch": 0.027525, "grad_norm": 1.0636903047561646, "grad_norm_var": 0.2447042768004019, "learning_rate": 0.0001, "loss": 14110.3281, "loss/crossentropy": 2.850802421569824, "loss/hidden": 0.482421875, "loss/logits": 0.1095501109957695, "loss/reg": 14109.736328125, "step": 1101 }, { "epoch": 0.02755, "grad_norm": 0.854430615901947, "grad_norm_var": 0.05381972596157554, "learning_rate": 0.0001, "loss": 14061.6553, "loss/crossentropy": 2.9119491577148438, "loss/hidden": 0.44140625, "loss/logits": 0.09522178769111633, "loss/reg": 14061.1181640625, "step": 1102 }, { "epoch": 0.027575, "grad_norm": 1.118687391281128, "grad_norm_var": 0.05052741212680799, "learning_rate": 0.0001, "loss": 14015.6572, "loss/crossentropy": 2.852381944656372, "loss/hidden": 0.4765625, "loss/logits": 0.1372184008359909, "loss/reg": 14015.04296875, "step": 1103 }, { "epoch": 0.0276, "grad_norm": 0.9183930158615112, "grad_norm_var": 0.05057099646219327, "learning_rate": 0.0001, "loss": 13969.3428, "loss/crossentropy": 2.8287909030914307, "loss/hidden": 0.439453125, "loss/logits": 0.09352678060531616, "loss/reg": 13968.8095703125, "step": 1104 }, { "epoch": 0.027625, "grad_norm": 0.7984148859977722, "grad_norm_var": 0.05126455750925533, "learning_rate": 0.0001, "loss": 13924.2598, "loss/crossentropy": 2.8634836673736572, "loss/hidden": 0.435546875, "loss/logits": 0.08804089576005936, "loss/reg": 13923.736328125, "step": 1105 }, { "epoch": 0.02765, "grad_norm": 0.9173059463500977, "grad_norm_var": 0.012688413073267653, "learning_rate": 0.0001, "loss": 13879.0557, "loss/crossentropy": 2.5765674114227295, "loss/hidden": 0.419921875, "loss/logits": 0.074608713388443, "loss/reg": 13878.5615234375, "step": 1106 }, { "epoch": 0.027675, "grad_norm": 1.2418100833892822, "grad_norm_var": 0.01870670627212404, "learning_rate": 0.0001, "loss": 13833.5557, "loss/crossentropy": 2.7832295894622803, "loss/hidden": 0.435546875, "loss/logits": 0.09539151191711426, "loss/reg": 13833.0244140625, "step": 1107 }, { "epoch": 0.0277, "grad_norm": 1.3345333337783813, "grad_norm_var": 0.025115599668681852, "learning_rate": 0.0001, "loss": 13788.3779, "loss/crossentropy": 3.0659708976745605, "loss/hidden": 0.455078125, "loss/logits": 0.09437006711959839, "loss/reg": 13787.828125, "step": 1108 }, { "epoch": 0.027725, "grad_norm": 1.1107778549194336, "grad_norm_var": 0.024077882887884157, "learning_rate": 0.0001, "loss": 13743.627, "loss/crossentropy": 3.3273847103118896, "loss/hidden": 0.4375, "loss/logits": 0.09910184144973755, "loss/reg": 13743.0908203125, "step": 1109 }, { "epoch": 0.02775, "grad_norm": 0.7995766997337341, "grad_norm_var": 0.02666670789335961, "learning_rate": 0.0001, "loss": 13698.3145, "loss/crossentropy": 2.670424699783325, "loss/hidden": 0.431640625, "loss/logits": 0.08939287066459656, "loss/reg": 13697.79296875, "step": 1110 }, { "epoch": 0.027775, "grad_norm": 0.998966634273529, "grad_norm_var": 0.026341033002484915, "learning_rate": 0.0001, "loss": 13653.4668, "loss/crossentropy": 2.8975906372070312, "loss/hidden": 0.439453125, "loss/logits": 0.08185222744941711, "loss/reg": 13652.9453125, "step": 1111 }, { "epoch": 0.0278, "grad_norm": 0.9062837958335876, "grad_norm_var": 0.026400439646917667, "learning_rate": 0.0001, "loss": 13608.8438, "loss/crossentropy": 2.8450193405151367, "loss/hidden": 0.478515625, "loss/logits": 0.09909355640411377, "loss/reg": 13608.2666015625, "step": 1112 }, { "epoch": 0.027825, "grad_norm": 0.9913869500160217, "grad_norm_var": 0.025029034203488957, "learning_rate": 0.0001, "loss": 13564.2236, "loss/crossentropy": 2.94085955619812, "loss/hidden": 0.4765625, "loss/logits": 0.09758688509464264, "loss/reg": 13563.6494140625, "step": 1113 }, { "epoch": 0.02785, "grad_norm": 0.9357085824012756, "grad_norm_var": 0.024899735922832737, "learning_rate": 0.0001, "loss": 13519.4248, "loss/crossentropy": 2.7248375415802, "loss/hidden": 0.451171875, "loss/logits": 0.07659360766410828, "loss/reg": 13518.8974609375, "step": 1114 }, { "epoch": 0.027875, "grad_norm": 0.965599536895752, "grad_norm_var": 0.02450309028379388, "learning_rate": 0.0001, "loss": 13475.3721, "loss/crossentropy": 2.5047667026519775, "loss/hidden": 0.466796875, "loss/logits": 0.11319025605916977, "loss/reg": 13474.7919921875, "step": 1115 }, { "epoch": 0.0279, "grad_norm": 0.8854598999023438, "grad_norm_var": 0.02280057994939105, "learning_rate": 0.0001, "loss": 13431.3467, "loss/crossentropy": 2.8435001373291016, "loss/hidden": 0.421875, "loss/logits": 0.07650317251682281, "loss/reg": 13430.8486328125, "step": 1116 }, { "epoch": 0.027925, "grad_norm": 0.8546820878982544, "grad_norm_var": 0.0234790594300057, "learning_rate": 0.0001, "loss": 13388.2959, "loss/crossentropy": 2.8461828231811523, "loss/hidden": 0.43359375, "loss/logits": 0.0916643738746643, "loss/reg": 13387.7705078125, "step": 1117 }, { "epoch": 0.02795, "grad_norm": 0.9899805188179016, "grad_norm_var": 0.022412165134545116, "learning_rate": 0.0001, "loss": 13344.9209, "loss/crossentropy": 3.1090798377990723, "loss/hidden": 0.455078125, "loss/logits": 0.08351921290159225, "loss/reg": 13344.3818359375, "step": 1118 }, { "epoch": 0.027975, "grad_norm": 0.7054470777511597, "grad_norm_var": 0.025745193825458406, "learning_rate": 0.0001, "loss": 13301.3359, "loss/crossentropy": 2.9630494117736816, "loss/hidden": 0.421875, "loss/logits": 0.07988356798887253, "loss/reg": 13300.833984375, "step": 1119 }, { "epoch": 0.028, "grad_norm": 1.0503507852554321, "grad_norm_var": 0.026107686898817883, "learning_rate": 0.0001, "loss": 13259.3818, "loss/crossentropy": 2.9457664489746094, "loss/hidden": 0.48828125, "loss/logits": 0.10067403316497803, "loss/reg": 13258.79296875, "step": 1120 }, { "epoch": 0.028025, "grad_norm": 1.0407966375350952, "grad_norm_var": 0.024302380428266622, "learning_rate": 0.0001, "loss": 13216.4863, "loss/crossentropy": 2.856452703475952, "loss/hidden": 0.47265625, "loss/logits": 0.10159683227539062, "loss/reg": 13215.912109375, "step": 1121 }, { "epoch": 0.02805, "grad_norm": 0.8844307661056519, "grad_norm_var": 0.024658072108102166, "learning_rate": 0.0001, "loss": 13173.7305, "loss/crossentropy": 2.7508833408355713, "loss/hidden": 0.470703125, "loss/logits": 0.10203750431537628, "loss/reg": 13173.158203125, "step": 1122 }, { "epoch": 0.028075, "grad_norm": 0.8127800226211548, "grad_norm_var": 0.02124211819669591, "learning_rate": 0.0001, "loss": 13131.751, "loss/crossentropy": 2.946770668029785, "loss/hidden": 0.43359375, "loss/logits": 0.08815675228834152, "loss/reg": 13131.2294921875, "step": 1123 }, { "epoch": 0.0281, "grad_norm": 0.8686839938163757, "grad_norm_var": 0.011180152481760535, "learning_rate": 0.0001, "loss": 13087.5605, "loss/crossentropy": 2.7543065547943115, "loss/hidden": 0.44921875, "loss/logits": 0.10058388859033585, "loss/reg": 13087.0107421875, "step": 1124 }, { "epoch": 0.028125, "grad_norm": 0.895906388759613, "grad_norm_var": 0.008744946401899689, "learning_rate": 0.0001, "loss": 13045.1602, "loss/crossentropy": 3.002467155456543, "loss/hidden": 0.4609375, "loss/logits": 0.09118813276290894, "loss/reg": 13044.6083984375, "step": 1125 }, { "epoch": 0.02815, "grad_norm": 1.1829336881637573, "grad_norm_var": 0.012202712169516027, "learning_rate": 0.0001, "loss": 13002.5703, "loss/crossentropy": 3.0006842613220215, "loss/hidden": 0.435546875, "loss/logits": 0.09035210311412811, "loss/reg": 13002.0439453125, "step": 1126 }, { "epoch": 0.028175, "grad_norm": 1.4346967935562134, "grad_norm_var": 0.02775118000169859, "learning_rate": 0.0001, "loss": 12960.9385, "loss/crossentropy": 2.900606393814087, "loss/hidden": 0.484375, "loss/logits": 0.13488101959228516, "loss/reg": 12960.3193359375, "step": 1127 }, { "epoch": 0.0282, "grad_norm": 1.0146552324295044, "grad_norm_var": 0.02766827493378264, "learning_rate": 0.0001, "loss": 12919.1104, "loss/crossentropy": 3.1415045261383057, "loss/hidden": 0.4375, "loss/logits": 0.08919603377580643, "loss/reg": 12918.583984375, "step": 1128 }, { "epoch": 0.028225, "grad_norm": 1.0106333494186401, "grad_norm_var": 0.027747352010968028, "learning_rate": 0.0001, "loss": 12876.6816, "loss/crossentropy": 2.976088285446167, "loss/hidden": 0.458984375, "loss/logits": 0.09923793375492096, "loss/reg": 12876.123046875, "step": 1129 }, { "epoch": 0.02825, "grad_norm": 0.9112340211868286, "grad_norm_var": 0.02789929156606991, "learning_rate": 0.0001, "loss": 12834.8223, "loss/crossentropy": 2.673325538635254, "loss/hidden": 0.44921875, "loss/logits": 0.11222439259290695, "loss/reg": 12834.2607421875, "step": 1130 }, { "epoch": 0.028275, "grad_norm": 0.920089602470398, "grad_norm_var": 0.028050992378277077, "learning_rate": 0.0001, "loss": 12793.7109, "loss/crossentropy": 2.7545506954193115, "loss/hidden": 0.435546875, "loss/logits": 0.10355065017938614, "loss/reg": 12793.171875, "step": 1131 }, { "epoch": 0.0283, "grad_norm": 0.8590668439865112, "grad_norm_var": 0.028379443047076913, "learning_rate": 0.0001, "loss": 12752.7422, "loss/crossentropy": 2.784660577774048, "loss/hidden": 0.4296875, "loss/logits": 0.09297633171081543, "loss/reg": 12752.2197265625, "step": 1132 }, { "epoch": 0.028325, "grad_norm": 0.6936010718345642, "grad_norm_var": 0.03236561028099718, "learning_rate": 0.0001, "loss": 12712.0752, "loss/crossentropy": 2.6045994758605957, "loss/hidden": 0.4296875, "loss/logits": 0.07505609840154648, "loss/reg": 12711.5703125, "step": 1133 }, { "epoch": 0.02835, "grad_norm": 0.8183262944221497, "grad_norm_var": 0.033399834386414716, "learning_rate": 0.0001, "loss": 12671.4336, "loss/crossentropy": 2.6184515953063965, "loss/hidden": 0.421875, "loss/logits": 0.07417555153369904, "loss/reg": 12670.9375, "step": 1134 }, { "epoch": 0.028375, "grad_norm": 0.8885838985443115, "grad_norm_var": 0.02967154541324343, "learning_rate": 0.0001, "loss": 12630.6895, "loss/crossentropy": 3.2083587646484375, "loss/hidden": 0.419921875, "loss/logits": 0.0850653350353241, "loss/reg": 12630.1845703125, "step": 1135 }, { "epoch": 0.0284, "grad_norm": 0.8322352170944214, "grad_norm_var": 0.02988425096504533, "learning_rate": 0.0001, "loss": 12590.1465, "loss/crossentropy": 2.6865787506103516, "loss/hidden": 0.4296875, "loss/logits": 0.08578802645206451, "loss/reg": 12589.630859375, "step": 1136 }, { "epoch": 0.028425, "grad_norm": 0.8828990459442139, "grad_norm_var": 0.02935810929242463, "learning_rate": 0.0001, "loss": 12549.917, "loss/crossentropy": 2.778714179992676, "loss/hidden": 0.4375, "loss/logits": 0.09445519745349884, "loss/reg": 12549.384765625, "step": 1137 }, { "epoch": 0.02845, "grad_norm": 1.2958890199661255, "grad_norm_var": 0.037333791847929315, "learning_rate": 0.0001, "loss": 12509.4854, "loss/crossentropy": 3.0362274646759033, "loss/hidden": 0.486328125, "loss/logits": 0.11208359897136688, "loss/reg": 12508.88671875, "step": 1138 }, { "epoch": 0.028475, "grad_norm": 1.0555758476257324, "grad_norm_var": 0.036328687068246525, "learning_rate": 0.0001, "loss": 12469.209, "loss/crossentropy": 2.772933006286621, "loss/hidden": 0.47265625, "loss/logits": 0.0948249101638794, "loss/reg": 12468.6416015625, "step": 1139 }, { "epoch": 0.0285, "grad_norm": 0.9909307956695557, "grad_norm_var": 0.0355654438528526, "learning_rate": 0.0001, "loss": 12429.7412, "loss/crossentropy": 2.6208302974700928, "loss/hidden": 0.478515625, "loss/logits": 0.09450601041316986, "loss/reg": 12429.16796875, "step": 1140 }, { "epoch": 0.028525, "grad_norm": 0.8380599021911621, "grad_norm_var": 0.03642668333360503, "learning_rate": 0.0001, "loss": 12390.4238, "loss/crossentropy": 2.7336008548736572, "loss/hidden": 0.451171875, "loss/logits": 0.08721227943897247, "loss/reg": 12389.8857421875, "step": 1141 }, { "epoch": 0.02855, "grad_norm": 1.08302640914917, "grad_norm_var": 0.03430513245419273, "learning_rate": 0.0001, "loss": 12350.8652, "loss/crossentropy": 2.8139443397521973, "loss/hidden": 0.47265625, "loss/logits": 0.10447725653648376, "loss/reg": 12350.2880859375, "step": 1142 }, { "epoch": 0.028575, "grad_norm": 1.0056302547454834, "grad_norm_var": 0.01926046379658614, "learning_rate": 0.0001, "loss": 12311.4238, "loss/crossentropy": 2.7299814224243164, "loss/hidden": 0.443359375, "loss/logits": 0.09301269799470901, "loss/reg": 12310.8876953125, "step": 1143 }, { "epoch": 0.0286, "grad_norm": 1.0221171379089355, "grad_norm_var": 0.019334461722572863, "learning_rate": 0.0001, "loss": 12272.2256, "loss/crossentropy": 2.738447427749634, "loss/hidden": 0.48046875, "loss/logits": 0.12227018177509308, "loss/reg": 12271.623046875, "step": 1144 }, { "epoch": 0.028625, "grad_norm": 1.2503671646118164, "grad_norm_var": 0.025048594107295645, "learning_rate": 0.0001, "loss": 12233.4121, "loss/crossentropy": 2.5988683700561523, "loss/hidden": 0.474609375, "loss/logits": 0.09158191084861755, "loss/reg": 12232.845703125, "step": 1145 }, { "epoch": 0.02865, "grad_norm": 0.8764640688896179, "grad_norm_var": 0.025346648753542058, "learning_rate": 0.0001, "loss": 12194.0625, "loss/crossentropy": 2.9124915599823, "loss/hidden": 0.44921875, "loss/logits": 0.09028612077236176, "loss/reg": 12193.5234375, "step": 1146 }, { "epoch": 0.028675, "grad_norm": 0.9417552947998047, "grad_norm_var": 0.025269205414616976, "learning_rate": 0.0001, "loss": 12155.0537, "loss/crossentropy": 2.856606960296631, "loss/hidden": 0.451171875, "loss/logits": 0.11057109385728836, "loss/reg": 12154.4921875, "step": 1147 }, { "epoch": 0.0287, "grad_norm": 1.073207974433899, "grad_norm_var": 0.025298827761974948, "learning_rate": 0.0001, "loss": 12116.6191, "loss/crossentropy": 2.9602468013763428, "loss/hidden": 0.4921875, "loss/logits": 0.11190488934516907, "loss/reg": 12116.0146484375, "step": 1148 }, { "epoch": 0.028725, "grad_norm": 0.8884559273719788, "grad_norm_var": 0.02044427749182618, "learning_rate": 0.0001, "loss": 12078.3057, "loss/crossentropy": 2.954441547393799, "loss/hidden": 0.45703125, "loss/logits": 0.0968749150633812, "loss/reg": 12077.751953125, "step": 1149 }, { "epoch": 0.02875, "grad_norm": 1.098034381866455, "grad_norm_var": 0.01915647141942376, "learning_rate": 0.0001, "loss": 12040.9893, "loss/crossentropy": 2.5061094760894775, "loss/hidden": 0.4609375, "loss/logits": 0.11192715167999268, "loss/reg": 12040.416015625, "step": 1150 }, { "epoch": 0.028775, "grad_norm": 0.8926265835762024, "grad_norm_var": 0.01909665417308942, "learning_rate": 0.0001, "loss": 12003.5557, "loss/crossentropy": 2.9123997688293457, "loss/hidden": 0.466796875, "loss/logits": 0.09902822971343994, "loss/reg": 12002.990234375, "step": 1151 }, { "epoch": 0.0288, "grad_norm": 0.8516673445701599, "grad_norm_var": 0.01868116766225493, "learning_rate": 0.0001, "loss": 11965.3525, "loss/crossentropy": 2.524635076522827, "loss/hidden": 0.4609375, "loss/logits": 0.08091796934604645, "loss/reg": 11964.810546875, "step": 1152 }, { "epoch": 0.028825, "grad_norm": 0.9828234314918518, "grad_norm_var": 0.017706164191503015, "learning_rate": 0.0001, "loss": 11927.0225, "loss/crossentropy": 2.964435577392578, "loss/hidden": 0.48046875, "loss/logits": 0.09457586705684662, "loss/reg": 11926.447265625, "step": 1153 }, { "epoch": 0.02885, "grad_norm": 1.2419341802597046, "grad_norm_var": 0.015825419211028195, "learning_rate": 0.0001, "loss": 11888.6523, "loss/crossentropy": 3.0043070316314697, "loss/hidden": 0.474609375, "loss/logits": 0.09112387895584106, "loss/reg": 11888.0869140625, "step": 1154 }, { "epoch": 0.028875, "grad_norm": 1.0305931568145752, "grad_norm_var": 0.015698597353836267, "learning_rate": 0.0001, "loss": 11850.6299, "loss/crossentropy": 3.150991678237915, "loss/hidden": 0.455078125, "loss/logits": 0.0933547168970108, "loss/reg": 11850.0810546875, "step": 1155 }, { "epoch": 0.0289, "grad_norm": 1.0774022340774536, "grad_norm_var": 0.016012585602837352, "learning_rate": 0.0001, "loss": 11812.8965, "loss/crossentropy": 2.512453079223633, "loss/hidden": 0.484375, "loss/logits": 0.11678382754325867, "loss/reg": 11812.294921875, "step": 1156 }, { "epoch": 0.028925, "grad_norm": 1.0668998956680298, "grad_norm_var": 0.014050459894159777, "learning_rate": 0.0001, "loss": 11775.5527, "loss/crossentropy": 2.8800909519195557, "loss/hidden": 0.435546875, "loss/logits": 0.08428835868835449, "loss/reg": 11775.033203125, "step": 1157 }, { "epoch": 0.02895, "grad_norm": 1.095428705215454, "grad_norm_var": 0.014157784643863858, "learning_rate": 0.0001, "loss": 11738.0811, "loss/crossentropy": 2.7751150131225586, "loss/hidden": 0.4609375, "loss/logits": 0.099217489361763, "loss/reg": 11737.5205078125, "step": 1158 }, { "epoch": 0.028975, "grad_norm": 0.9712222218513489, "grad_norm_var": 0.014319325743920726, "learning_rate": 0.0001, "loss": 11700.4678, "loss/crossentropy": 2.9733681678771973, "loss/hidden": 0.447265625, "loss/logits": 0.08700287342071533, "loss/reg": 11699.93359375, "step": 1159 }, { "epoch": 0.029, "grad_norm": 1.1365258693695068, "grad_norm_var": 0.015130617116427854, "learning_rate": 0.0001, "loss": 11663.209, "loss/crossentropy": 2.8745062351226807, "loss/hidden": 0.478515625, "loss/logits": 0.11189639568328857, "loss/reg": 11662.6181640625, "step": 1160 }, { "epoch": 0.029025, "grad_norm": 1.149517297744751, "grad_norm_var": 0.01279922687653959, "learning_rate": 0.0001, "loss": 11626.1377, "loss/crossentropy": 3.0209760665893555, "loss/hidden": 0.462890625, "loss/logits": 0.10469117760658264, "loss/reg": 11625.5703125, "step": 1161 }, { "epoch": 0.02905, "grad_norm": 1.017334222793579, "grad_norm_var": 0.011279464116419632, "learning_rate": 0.0001, "loss": 11589.6016, "loss/crossentropy": 2.7759485244750977, "loss/hidden": 0.453125, "loss/logits": 0.09751708805561066, "loss/reg": 11589.05078125, "step": 1162 }, { "epoch": 0.029075, "grad_norm": 0.8124439120292664, "grad_norm_var": 0.013884204178922666, "learning_rate": 0.0001, "loss": 11553.4912, "loss/crossentropy": 2.4102635383605957, "loss/hidden": 0.439453125, "loss/logits": 0.08739569783210754, "loss/reg": 11552.96484375, "step": 1163 }, { "epoch": 0.0291, "grad_norm": 0.8549541234970093, "grad_norm_var": 0.015433249300702733, "learning_rate": 0.0001, "loss": 11517.8145, "loss/crossentropy": 2.915776014328003, "loss/hidden": 0.43359375, "loss/logits": 0.08613419532775879, "loss/reg": 11517.294921875, "step": 1164 }, { "epoch": 0.029125, "grad_norm": 0.9901089668273926, "grad_norm_var": 0.014425045260586921, "learning_rate": 0.0001, "loss": 11482.0186, "loss/crossentropy": 2.914726495742798, "loss/hidden": 0.470703125, "loss/logits": 0.0901867225766182, "loss/reg": 11481.4580078125, "step": 1165 }, { "epoch": 0.02915, "grad_norm": 1.008752703666687, "grad_norm_var": 0.013956747237325576, "learning_rate": 0.0001, "loss": 11446.793, "loss/crossentropy": 2.808004379272461, "loss/hidden": 0.4609375, "loss/logits": 0.08883035182952881, "loss/reg": 11446.2431640625, "step": 1166 }, { "epoch": 0.029175, "grad_norm": 1.0150833129882812, "grad_norm_var": 0.012956904326744182, "learning_rate": 0.0001, "loss": 11411.3867, "loss/crossentropy": 2.716193437576294, "loss/hidden": 0.466796875, "loss/logits": 0.09643800556659698, "loss/reg": 11410.8232421875, "step": 1167 }, { "epoch": 0.0292, "grad_norm": 1.141662359237671, "grad_norm_var": 0.011746050791282636, "learning_rate": 0.0001, "loss": 11375.5488, "loss/crossentropy": 2.971235990524292, "loss/hidden": 0.482421875, "loss/logits": 0.12297721207141876, "loss/reg": 11374.943359375, "step": 1168 }, { "epoch": 0.029225, "grad_norm": 1.0099797248840332, "grad_norm_var": 0.011595822291827495, "learning_rate": 0.0001, "loss": 11339.9443, "loss/crossentropy": 2.673962354660034, "loss/hidden": 0.48046875, "loss/logits": 0.09458102285861969, "loss/reg": 11339.369140625, "step": 1169 }, { "epoch": 0.02925, "grad_norm": 1.2093685865402222, "grad_norm_var": 0.010779820227608105, "learning_rate": 0.0001, "loss": 11305.1221, "loss/crossentropy": 2.8429079055786133, "loss/hidden": 0.5, "loss/logits": 0.08760304749011993, "loss/reg": 11304.5341796875, "step": 1170 }, { "epoch": 0.029275, "grad_norm": 1.0845022201538086, "grad_norm_var": 0.01091752697247627, "learning_rate": 0.0001, "loss": 11270.4902, "loss/crossentropy": 2.9612159729003906, "loss/hidden": 0.482421875, "loss/logits": 0.11011095345020294, "loss/reg": 11269.8974609375, "step": 1171 }, { "epoch": 0.0293, "grad_norm": 1.1352198123931885, "grad_norm_var": 0.011414219105993812, "learning_rate": 0.0001, "loss": 11235.9561, "loss/crossentropy": 2.685256004333496, "loss/hidden": 0.482421875, "loss/logits": 0.09084700793027878, "loss/reg": 11235.3828125, "step": 1172 }, { "epoch": 0.029325, "grad_norm": 0.9860196113586426, "grad_norm_var": 0.011572749677654256, "learning_rate": 0.0001, "loss": 11201.4805, "loss/crossentropy": 2.589221715927124, "loss/hidden": 0.466796875, "loss/logits": 0.08546493202447891, "loss/reg": 11200.927734375, "step": 1173 }, { "epoch": 0.02935, "grad_norm": 1.1775130033493042, "grad_norm_var": 0.012615471899458204, "learning_rate": 0.0001, "loss": 11167.1963, "loss/crossentropy": 3.0070106983184814, "loss/hidden": 0.466796875, "loss/logits": 0.10018382966518402, "loss/reg": 11166.62890625, "step": 1174 }, { "epoch": 0.029375, "grad_norm": 1.0073573589324951, "grad_norm_var": 0.012347578378169391, "learning_rate": 0.0001, "loss": 11132.5576, "loss/crossentropy": 2.6534533500671387, "loss/hidden": 0.48046875, "loss/logits": 0.08900558203458786, "loss/reg": 11131.98828125, "step": 1175 }, { "epoch": 0.0294, "grad_norm": 1.043900489807129, "grad_norm_var": 0.01176606043919694, "learning_rate": 0.0001, "loss": 11098.457, "loss/crossentropy": 2.8767192363739014, "loss/hidden": 0.4765625, "loss/logits": 0.10700362920761108, "loss/reg": 11097.873046875, "step": 1176 }, { "epoch": 0.029425, "grad_norm": 0.8921750783920288, "grad_norm_var": 0.012155306817358028, "learning_rate": 0.0001, "loss": 11064.6582, "loss/crossentropy": 2.9935896396636963, "loss/hidden": 0.46484375, "loss/logits": 0.09252595901489258, "loss/reg": 11064.1005859375, "step": 1177 }, { "epoch": 0.02945, "grad_norm": 1.0026613473892212, "grad_norm_var": 0.012182093922875467, "learning_rate": 0.0001, "loss": 11031.0293, "loss/crossentropy": 3.0917484760284424, "loss/hidden": 0.48828125, "loss/logits": 0.11424778401851654, "loss/reg": 11030.4267578125, "step": 1178 }, { "epoch": 0.029475, "grad_norm": 1.1867682933807373, "grad_norm_var": 0.010419128530427695, "learning_rate": 0.0001, "loss": 10992.6582, "loss/crossentropy": 3.0319387912750244, "loss/hidden": 0.5, "loss/logits": 0.12374146282672882, "loss/reg": 10992.0341796875, "step": 1179 }, { "epoch": 0.0295, "grad_norm": 1.1099034547805786, "grad_norm_var": 0.007966003871787801, "learning_rate": 0.0001, "loss": 10958.374, "loss/crossentropy": 2.9356632232666016, "loss/hidden": 0.48046875, "loss/logits": 0.09477466344833374, "loss/reg": 10957.798828125, "step": 1180 }, { "epoch": 0.029525, "grad_norm": 1.1255443096160889, "grad_norm_var": 0.0078040821343260324, "learning_rate": 0.0001, "loss": 10924.3008, "loss/crossentropy": 2.931514263153076, "loss/hidden": 0.46484375, "loss/logits": 0.09181191027164459, "loss/reg": 10923.744140625, "step": 1181 }, { "epoch": 0.02955, "grad_norm": 0.9785094857215881, "grad_norm_var": 0.008112359456496524, "learning_rate": 0.0001, "loss": 10890.1377, "loss/crossentropy": 2.845010280609131, "loss/hidden": 0.43359375, "loss/logits": 0.08981429040431976, "loss/reg": 10889.6142578125, "step": 1182 }, { "epoch": 0.029575, "grad_norm": 1.1371616125106812, "grad_norm_var": 0.008163990240574733, "learning_rate": 0.0001, "loss": 10856.3428, "loss/crossentropy": 3.007722854614258, "loss/hidden": 0.46484375, "loss/logits": 0.08930703997612, "loss/reg": 10855.7890625, "step": 1183 }, { "epoch": 0.0296, "grad_norm": 1.1515138149261475, "grad_norm_var": 0.008255299846433894, "learning_rate": 0.0001, "loss": 10822.2334, "loss/crossentropy": 2.758664131164551, "loss/hidden": 0.47265625, "loss/logits": 0.1009654551744461, "loss/reg": 10821.66015625, "step": 1184 }, { "epoch": 0.029625, "grad_norm": 0.8978103399276733, "grad_norm_var": 0.010049723032669282, "learning_rate": 0.0001, "loss": 10788.4775, "loss/crossentropy": 2.641123056411743, "loss/hidden": 0.4296875, "loss/logits": 0.07992805540561676, "loss/reg": 10787.9677734375, "step": 1185 }, { "epoch": 0.02965, "grad_norm": 1.1538819074630737, "grad_norm_var": 0.009213807670944367, "learning_rate": 0.0001, "loss": 10755.3135, "loss/crossentropy": 2.8010590076446533, "loss/hidden": 0.462890625, "loss/logits": 0.07945588231086731, "loss/reg": 10754.771484375, "step": 1186 }, { "epoch": 0.029675, "grad_norm": 0.839072585105896, "grad_norm_var": 0.012402611272060786, "learning_rate": 0.0001, "loss": 10722.417, "loss/crossentropy": 2.5748178958892822, "loss/hidden": 0.470703125, "loss/logits": 0.08163893967866898, "loss/reg": 10721.8642578125, "step": 1187 }, { "epoch": 0.0297, "grad_norm": 0.8767160177230835, "grad_norm_var": 0.013695720424199923, "learning_rate": 0.0001, "loss": 10688.084, "loss/crossentropy": 2.836094856262207, "loss/hidden": 0.431640625, "loss/logits": 0.08492501080036163, "loss/reg": 10687.5673828125, "step": 1188 }, { "epoch": 0.029725, "grad_norm": 1.5030205249786377, "grad_norm_var": 0.026996912799878325, "learning_rate": 0.0001, "loss": 10654.8174, "loss/crossentropy": 2.943218469619751, "loss/hidden": 0.46484375, "loss/logits": 0.09647072851657867, "loss/reg": 10654.255859375, "step": 1189 }, { "epoch": 0.02975, "grad_norm": 1.4945253133773804, "grad_norm_var": 0.03791875498742254, "learning_rate": 0.0001, "loss": 10621.7188, "loss/crossentropy": 2.9707281589508057, "loss/hidden": 0.478515625, "loss/logits": 0.11330445110797882, "loss/reg": 10621.126953125, "step": 1190 }, { "epoch": 0.029775, "grad_norm": 1.161048173904419, "grad_norm_var": 0.03775209897049773, "learning_rate": 0.0001, "loss": 10588.3164, "loss/crossentropy": 2.8618388175964355, "loss/hidden": 0.486328125, "loss/logits": 0.09845025092363358, "loss/reg": 10587.7314453125, "step": 1191 }, { "epoch": 0.0298, "grad_norm": 0.9410629868507385, "grad_norm_var": 0.039143050056103114, "learning_rate": 0.0001, "loss": 10555.1143, "loss/crossentropy": 2.684262990951538, "loss/hidden": 0.439453125, "loss/logits": 0.08633062243461609, "loss/reg": 10554.5888671875, "step": 1192 }, { "epoch": 0.029825, "grad_norm": 0.8235244750976562, "grad_norm_var": 0.04125488764726738, "learning_rate": 0.0001, "loss": 10522.4512, "loss/crossentropy": 2.9447689056396484, "loss/hidden": 0.43359375, "loss/logits": 0.09423172473907471, "loss/reg": 10521.923828125, "step": 1193 }, { "epoch": 0.02985, "grad_norm": 0.8602933287620544, "grad_norm_var": 0.044111624406263754, "learning_rate": 0.0001, "loss": 10490.2285, "loss/crossentropy": 2.9620957374572754, "loss/hidden": 0.45703125, "loss/logits": 0.08669282495975494, "loss/reg": 10489.6845703125, "step": 1194 }, { "epoch": 0.029875, "grad_norm": 0.8690145015716553, "grad_norm_var": 0.04579364713540136, "learning_rate": 0.0001, "loss": 10457.7422, "loss/crossentropy": 2.8131606578826904, "loss/hidden": 0.4375, "loss/logits": 0.09218825399875641, "loss/reg": 10457.212890625, "step": 1195 }, { "epoch": 0.0299, "grad_norm": 0.7750914692878723, "grad_norm_var": 0.050467726933814974, "learning_rate": 0.0001, "loss": 10425.543, "loss/crossentropy": 2.4100165367126465, "loss/hidden": 0.443359375, "loss/logits": 0.0891672819852829, "loss/reg": 10425.0107421875, "step": 1196 }, { "epoch": 0.029925, "grad_norm": 0.6748509407043457, "grad_norm_var": 0.05782635591090936, "learning_rate": 0.0001, "loss": 10393.8721, "loss/crossentropy": 2.8050386905670166, "loss/hidden": 0.427734375, "loss/logits": 0.08326968550682068, "loss/reg": 10393.361328125, "step": 1197 }, { "epoch": 0.02995, "grad_norm": 0.836169958114624, "grad_norm_var": 0.05966311960294477, "learning_rate": 0.0001, "loss": 10362.6973, "loss/crossentropy": 2.8260862827301025, "loss/hidden": 0.43359375, "loss/logits": 0.08343534171581268, "loss/reg": 10362.1806640625, "step": 1198 }, { "epoch": 0.029975, "grad_norm": 0.932468831539154, "grad_norm_var": 0.058529408178915256, "learning_rate": 0.0001, "loss": 10329.6719, "loss/crossentropy": 2.977475881576538, "loss/hidden": 0.474609375, "loss/logits": 0.08931249380111694, "loss/reg": 10329.1083984375, "step": 1199 }, { "epoch": 0.03, "grad_norm": 0.7377482652664185, "grad_norm_var": 0.060146837549021136, "learning_rate": 0.0001, "loss": 10297.377, "loss/crossentropy": 2.5743801593780518, "loss/hidden": 0.4453125, "loss/logits": 0.08330844342708588, "loss/reg": 10296.8486328125, "step": 1200 }, { "epoch": 0.030025, "grad_norm": 1.0377100706100464, "grad_norm_var": 0.06019103863493148, "learning_rate": 0.0001, "loss": 10265.8555, "loss/crossentropy": 3.201678514480591, "loss/hidden": 0.46484375, "loss/logits": 0.09200893342494965, "loss/reg": 10265.298828125, "step": 1201 }, { "epoch": 0.03005, "grad_norm": 1.1051257848739624, "grad_norm_var": 0.05914268438559394, "learning_rate": 0.0001, "loss": 10234.3828, "loss/crossentropy": 2.675462484359741, "loss/hidden": 0.48046875, "loss/logits": 0.095950648188591, "loss/reg": 10233.806640625, "step": 1202 }, { "epoch": 0.030075, "grad_norm": 0.9691970944404602, "grad_norm_var": 0.057986367583310626, "learning_rate": 0.0001, "loss": 10202.7168, "loss/crossentropy": 2.9942128658294678, "loss/hidden": 0.4765625, "loss/logits": 0.09277678281068802, "loss/reg": 10202.1474609375, "step": 1203 }, { "epoch": 0.0301, "grad_norm": 2.018515110015869, "grad_norm_var": 0.12452834192716093, "learning_rate": 0.0001, "loss": 10171.2373, "loss/crossentropy": 2.7582156658172607, "loss/hidden": 0.5390625, "loss/logits": 0.20079275965690613, "loss/reg": 10170.4970703125, "step": 1204 }, { "epoch": 0.030125, "grad_norm": 0.8988450765609741, "grad_norm_var": 0.1105434640884032, "learning_rate": 0.0001, "loss": 10139.9619, "loss/crossentropy": 3.1079561710357666, "loss/hidden": 0.478515625, "loss/logits": 0.09983308613300323, "loss/reg": 10139.3837890625, "step": 1205 }, { "epoch": 0.03015, "grad_norm": 0.8246884346008301, "grad_norm_var": 0.09517385013607554, "learning_rate": 0.0001, "loss": 10108.6289, "loss/crossentropy": 2.796762704849243, "loss/hidden": 0.474609375, "loss/logits": 0.0915188193321228, "loss/reg": 10108.0625, "step": 1206 }, { "epoch": 0.030175, "grad_norm": 0.8127716183662415, "grad_norm_var": 0.09372460720769069, "learning_rate": 0.0001, "loss": 10077.668, "loss/crossentropy": 2.611097574234009, "loss/hidden": 0.43359375, "loss/logits": 0.08079911768436432, "loss/reg": 10077.1533203125, "step": 1207 }, { "epoch": 0.0302, "grad_norm": 0.9361163377761841, "grad_norm_var": 0.0937286127578139, "learning_rate": 0.0001, "loss": 10047.2109, "loss/crossentropy": 3.0536181926727295, "loss/hidden": 0.486328125, "loss/logits": 0.09539620578289032, "loss/reg": 10046.62890625, "step": 1208 }, { "epoch": 0.030225, "grad_norm": 0.7071399688720703, "grad_norm_var": 0.09645261491092669, "learning_rate": 0.0001, "loss": 10016.3311, "loss/crossentropy": 2.8262569904327393, "loss/hidden": 0.4375, "loss/logits": 0.08987779915332794, "loss/reg": 10015.8037109375, "step": 1209 }, { "epoch": 0.03025, "grad_norm": 0.8685421347618103, "grad_norm_var": 0.09637224490467025, "learning_rate": 0.0001, "loss": 9985.8564, "loss/crossentropy": 2.7836618423461914, "loss/hidden": 0.470703125, "loss/logits": 0.08982132375240326, "loss/reg": 9985.2958984375, "step": 1210 }, { "epoch": 0.030275, "grad_norm": 0.916743814945221, "grad_norm_var": 0.09607720069623867, "learning_rate": 0.0001, "loss": 9955.1826, "loss/crossentropy": 2.9358649253845215, "loss/hidden": 0.431640625, "loss/logits": 0.08366554230451584, "loss/reg": 9954.6669921875, "step": 1211 }, { "epoch": 0.0303, "grad_norm": 1.0723576545715332, "grad_norm_var": 0.09503487394484282, "learning_rate": 0.0001, "loss": 9925.3447, "loss/crossentropy": 2.8664326667785645, "loss/hidden": 0.474609375, "loss/logits": 0.09154078364372253, "loss/reg": 9924.7783203125, "step": 1212 }, { "epoch": 0.030325, "grad_norm": 0.9004722237586975, "grad_norm_var": 0.08965903996462453, "learning_rate": 0.0001, "loss": 9895.2139, "loss/crossentropy": 2.6698296070098877, "loss/hidden": 0.46875, "loss/logits": 0.09159491211175919, "loss/reg": 9894.6533203125, "step": 1213 }, { "epoch": 0.03035, "grad_norm": 0.7564451098442078, "grad_norm_var": 0.09151518684718109, "learning_rate": 0.0001, "loss": 9865.2354, "loss/crossentropy": 2.6375725269317627, "loss/hidden": 0.4296875, "loss/logits": 0.08477196842432022, "loss/reg": 9864.720703125, "step": 1214 }, { "epoch": 0.030375, "grad_norm": 1.297078013420105, "grad_norm_var": 0.09807566864007426, "learning_rate": 0.0001, "loss": 9835.8916, "loss/crossentropy": 3.296292781829834, "loss/hidden": 0.51171875, "loss/logits": 0.12085559964179993, "loss/reg": 9835.2587890625, "step": 1215 }, { "epoch": 0.0304, "grad_norm": 0.9563354849815369, "grad_norm_var": 0.09367456011200681, "learning_rate": 0.0001, "loss": 9805.2402, "loss/crossentropy": 2.7964541912078857, "loss/hidden": 0.47265625, "loss/logits": 0.08087512105703354, "loss/reg": 9804.6865234375, "step": 1216 }, { "epoch": 0.030425, "grad_norm": 0.8623160123825073, "grad_norm_var": 0.09482949817968937, "learning_rate": 0.0001, "loss": 9775.4941, "loss/crossentropy": 2.7524194717407227, "loss/hidden": 0.439453125, "loss/logits": 0.08655818551778793, "loss/reg": 9774.9677734375, "step": 1217 }, { "epoch": 0.03045, "grad_norm": 0.9165598154067993, "grad_norm_var": 0.09425582092502614, "learning_rate": 0.0001, "loss": 9746.5801, "loss/crossentropy": 2.9415946006774902, "loss/hidden": 0.453125, "loss/logits": 0.09110292047262192, "loss/reg": 9746.0361328125, "step": 1218 }, { "epoch": 0.030475, "grad_norm": 0.921715497970581, "grad_norm_var": 0.09447862133950806, "learning_rate": 0.0001, "loss": 9717.4014, "loss/crossentropy": 2.724311590194702, "loss/hidden": 0.447265625, "loss/logits": 0.09495562314987183, "loss/reg": 9716.859375, "step": 1219 }, { "epoch": 0.0305, "grad_norm": 1.1677136421203613, "grad_norm_var": 0.02181600391456436, "learning_rate": 0.0001, "loss": 9688.4238, "loss/crossentropy": 2.642845630645752, "loss/hidden": 0.4765625, "loss/logits": 0.10420182347297668, "loss/reg": 9687.8427734375, "step": 1220 }, { "epoch": 0.030525, "grad_norm": 0.9965676069259644, "grad_norm_var": 0.022059169625035166, "learning_rate": 0.0001, "loss": 9659.293, "loss/crossentropy": 2.785884141921997, "loss/hidden": 0.451171875, "loss/logits": 0.07995257526636124, "loss/reg": 9658.76171875, "step": 1221 }, { "epoch": 0.03055, "grad_norm": 1.2723093032836914, "grad_norm_var": 0.028171462793960166, "learning_rate": 0.0001, "loss": 9630.8027, "loss/crossentropy": 2.9275527000427246, "loss/hidden": 0.4453125, "loss/logits": 0.08214107900857925, "loss/reg": 9630.275390625, "step": 1222 }, { "epoch": 0.030575, "grad_norm": 1.1707496643066406, "grad_norm_var": 0.0291499271671719, "learning_rate": 0.0001, "loss": 9602.373, "loss/crossentropy": 2.870009660720825, "loss/hidden": 0.51171875, "loss/logits": 0.09101098030805588, "loss/reg": 9601.7705078125, "step": 1223 }, { "epoch": 0.0306, "grad_norm": 1.0315990447998047, "grad_norm_var": 0.029129891252388622, "learning_rate": 0.0001, "loss": 9572.2627, "loss/crossentropy": 2.8040213584899902, "loss/hidden": 0.466796875, "loss/logits": 0.08623090386390686, "loss/reg": 9571.7099609375, "step": 1224 }, { "epoch": 0.030625, "grad_norm": 1.137269139289856, "grad_norm_var": 0.02456178787075095, "learning_rate": 0.0001, "loss": 9543.2119, "loss/crossentropy": 2.747441053390503, "loss/hidden": 0.45703125, "loss/logits": 0.09086060523986816, "loss/reg": 9542.6640625, "step": 1225 }, { "epoch": 0.03065, "grad_norm": 1.2014390230178833, "grad_norm_var": 0.024974099129706436, "learning_rate": 0.0001, "loss": 9514.4326, "loss/crossentropy": 3.3663926124572754, "loss/hidden": 0.4765625, "loss/logits": 0.09788084030151367, "loss/reg": 9513.8583984375, "step": 1226 }, { "epoch": 0.030675, "grad_norm": 1.1044578552246094, "grad_norm_var": 0.024188962145927903, "learning_rate": 0.0001, "loss": 9485.6719, "loss/crossentropy": 3.089447498321533, "loss/hidden": 0.48828125, "loss/logits": 0.10295175760984421, "loss/reg": 9485.0810546875, "step": 1227 }, { "epoch": 0.0307, "grad_norm": 0.9288240075111389, "grad_norm_var": 0.025007301258384965, "learning_rate": 0.0001, "loss": 9456.8525, "loss/crossentropy": 2.601111888885498, "loss/hidden": 0.50390625, "loss/logits": 0.09696957468986511, "loss/reg": 9456.251953125, "step": 1228 }, { "epoch": 0.030725, "grad_norm": 0.8527458310127258, "grad_norm_var": 0.02603033391088611, "learning_rate": 0.0001, "loss": 9428.3262, "loss/crossentropy": 2.9268016815185547, "loss/hidden": 0.453125, "loss/logits": 0.0778612419962883, "loss/reg": 9427.794921875, "step": 1229 }, { "epoch": 0.03075, "grad_norm": 1.004744291305542, "grad_norm_var": 0.02063239364545984, "learning_rate": 0.0001, "loss": 9400.5186, "loss/crossentropy": 3.0767099857330322, "loss/hidden": 0.48046875, "loss/logits": 0.09042792022228241, "loss/reg": 9399.947265625, "step": 1230 }, { "epoch": 0.030775, "grad_norm": 0.9882547855377197, "grad_norm_var": 0.01647704902306407, "learning_rate": 0.0001, "loss": 9372.2334, "loss/crossentropy": 2.9301977157592773, "loss/hidden": 0.478515625, "loss/logits": 0.09938155114650726, "loss/reg": 9371.6552734375, "step": 1231 }, { "epoch": 0.0308, "grad_norm": 1.0480315685272217, "grad_norm_var": 0.01607625108212716, "learning_rate": 0.0001, "loss": 9344.4482, "loss/crossentropy": 2.779447555541992, "loss/hidden": 0.478515625, "loss/logits": 0.09339262545108795, "loss/reg": 9343.8759765625, "step": 1232 }, { "epoch": 0.030825, "grad_norm": 0.8750410676002502, "grad_norm_var": 0.015788579684884126, "learning_rate": 0.0001, "loss": 9317.4863, "loss/crossentropy": 3.0507824420928955, "loss/hidden": 0.4765625, "loss/logits": 0.0992969423532486, "loss/reg": 9316.91015625, "step": 1233 }, { "epoch": 0.03085, "grad_norm": 0.9422901272773743, "grad_norm_var": 0.015411182960378215, "learning_rate": 0.0001, "loss": 9289.8379, "loss/crossentropy": 2.902963638305664, "loss/hidden": 0.435546875, "loss/logits": 0.08303774148225784, "loss/reg": 9289.3193359375, "step": 1234 }, { "epoch": 0.030875, "grad_norm": 0.9532501697540283, "grad_norm_var": 0.01497500735035633, "learning_rate": 0.0001, "loss": 9260.5039, "loss/crossentropy": 2.6641650199890137, "loss/hidden": 0.443359375, "loss/logits": 0.08857601881027222, "loss/reg": 9259.9716796875, "step": 1235 }, { "epoch": 0.0309, "grad_norm": 0.924162745475769, "grad_norm_var": 0.014606637793103753, "learning_rate": 0.0001, "loss": 9233.3857, "loss/crossentropy": 2.730973958969116, "loss/hidden": 0.486328125, "loss/logits": 0.10253769159317017, "loss/reg": 9232.796875, "step": 1236 }, { "epoch": 0.030925, "grad_norm": 0.9403824806213379, "grad_norm_var": 0.015031792033177697, "learning_rate": 0.0001, "loss": 9205.9912, "loss/crossentropy": 2.6175835132598877, "loss/hidden": 0.453125, "loss/logits": 0.09433528780937195, "loss/reg": 9205.443359375, "step": 1237 }, { "epoch": 0.03095, "grad_norm": 1.058070182800293, "grad_norm_var": 0.01079234914789892, "learning_rate": 0.0001, "loss": 9179.0371, "loss/crossentropy": 2.9107890129089355, "loss/hidden": 0.474609375, "loss/logits": 0.10070379078388214, "loss/reg": 9178.4619140625, "step": 1238 }, { "epoch": 0.030975, "grad_norm": 0.8924942016601562, "grad_norm_var": 0.009670593148200565, "learning_rate": 0.0001, "loss": 9152.2129, "loss/crossentropy": 2.9251017570495605, "loss/hidden": 0.443359375, "loss/logits": 0.08694691210985184, "loss/reg": 9151.6826171875, "step": 1239 }, { "epoch": 0.031, "grad_norm": 1.0834685564041138, "grad_norm_var": 0.01010783131661602, "learning_rate": 0.0001, "loss": 9125.5234, "loss/crossentropy": 3.0762522220611572, "loss/hidden": 0.484375, "loss/logits": 0.09588149189949036, "loss/reg": 9124.943359375, "step": 1240 }, { "epoch": 0.031025, "grad_norm": 1.2146703004837036, "grad_norm_var": 0.011940877159930589, "learning_rate": 0.0001, "loss": 9098.4736, "loss/crossentropy": 2.8192780017852783, "loss/hidden": 0.466796875, "loss/logits": 0.09233398735523224, "loss/reg": 9097.9140625, "step": 1241 }, { "epoch": 0.03105, "grad_norm": 1.1458591222763062, "grad_norm_var": 0.010646862243281655, "learning_rate": 0.0001, "loss": 9071.7295, "loss/crossentropy": 2.8693604469299316, "loss/hidden": 0.48046875, "loss/logits": 0.09497147798538208, "loss/reg": 9071.154296875, "step": 1242 }, { "epoch": 0.031075, "grad_norm": 0.8777756094932556, "grad_norm_var": 0.010619544110101591, "learning_rate": 0.0001, "loss": 9044.9199, "loss/crossentropy": 2.9424221515655518, "loss/hidden": 0.439453125, "loss/logits": 0.09632548689842224, "loss/reg": 9044.3837890625, "step": 1243 }, { "epoch": 0.0311, "grad_norm": 1.0108309984207153, "grad_norm_var": 0.010446079850578134, "learning_rate": 0.0001, "loss": 9015.875, "loss/crossentropy": 2.8137118816375732, "loss/hidden": 0.439453125, "loss/logits": 0.08419637382030487, "loss/reg": 9015.3515625, "step": 1244 }, { "epoch": 0.031125, "grad_norm": 1.1220258474349976, "grad_norm_var": 0.01011275916858628, "learning_rate": 0.0001, "loss": 8988.9355, "loss/crossentropy": 2.8730502128601074, "loss/hidden": 0.49609375, "loss/logits": 0.11299853026866913, "loss/reg": 8988.326171875, "step": 1245 }, { "epoch": 0.03115, "grad_norm": 0.9927026629447937, "grad_norm_var": 0.010122367947183146, "learning_rate": 0.0001, "loss": 8962.1436, "loss/crossentropy": 2.9736275672912598, "loss/hidden": 0.466796875, "loss/logits": 0.0912986621260643, "loss/reg": 8961.5859375, "step": 1246 }, { "epoch": 0.031175, "grad_norm": 1.0541266202926636, "grad_norm_var": 0.010252357725601216, "learning_rate": 0.0001, "loss": 8935.4277, "loss/crossentropy": 2.722059726715088, "loss/hidden": 0.494140625, "loss/logits": 0.1156279444694519, "loss/reg": 8934.818359375, "step": 1247 }, { "epoch": 0.0312, "grad_norm": 0.9716544151306152, "grad_norm_var": 0.010213854612195365, "learning_rate": 0.0001, "loss": 8909.125, "loss/crossentropy": 2.8865716457366943, "loss/hidden": 0.4453125, "loss/logits": 0.09578832238912582, "loss/reg": 8908.583984375, "step": 1248 }, { "epoch": 0.031225, "grad_norm": 0.9475975632667542, "grad_norm_var": 0.009298449025630321, "learning_rate": 0.0001, "loss": 8883.3818, "loss/crossentropy": 3.0513219833374023, "loss/hidden": 0.4765625, "loss/logits": 0.10030034184455872, "loss/reg": 8882.8046875, "step": 1249 }, { "epoch": 0.03125, "grad_norm": 0.8895021677017212, "grad_norm_var": 0.009936580350242237, "learning_rate": 0.0001, "loss": 8857.7197, "loss/crossentropy": 2.9448611736297607, "loss/hidden": 0.484375, "loss/logits": 0.11295387148857117, "loss/reg": 8857.1220703125, "step": 1250 }, { "epoch": 0.031275, "grad_norm": 0.937999427318573, "grad_norm_var": 0.010056165424536268, "learning_rate": 0.0001, "loss": 8831.6973, "loss/crossentropy": 3.092855930328369, "loss/hidden": 0.474609375, "loss/logits": 0.09617440402507782, "loss/reg": 8831.126953125, "step": 1251 }, { "epoch": 0.0313, "grad_norm": 0.9391719698905945, "grad_norm_var": 0.009910557212322724, "learning_rate": 0.0001, "loss": 8805.9775, "loss/crossentropy": 2.961047410964966, "loss/hidden": 0.490234375, "loss/logits": 0.11525814980268478, "loss/reg": 8805.3720703125, "step": 1252 }, { "epoch": 0.031325, "grad_norm": 1.1222453117370605, "grad_norm_var": 0.010413345496494827, "learning_rate": 0.0001, "loss": 8780.6943, "loss/crossentropy": 2.7763471603393555, "loss/hidden": 0.486328125, "loss/logits": 0.11867141723632812, "loss/reg": 8780.0888671875, "step": 1253 }, { "epoch": 0.03135, "grad_norm": 1.1006278991699219, "grad_norm_var": 0.010763776670529154, "learning_rate": 0.0001, "loss": 8755.541, "loss/crossentropy": 3.163494348526001, "loss/hidden": 0.478515625, "loss/logits": 0.08899696916341782, "loss/reg": 8754.9736328125, "step": 1254 }, { "epoch": 0.031375, "grad_norm": 0.8858732581138611, "grad_norm_var": 0.010878126022268416, "learning_rate": 0.0001, "loss": 8730.6084, "loss/crossentropy": 2.7622194290161133, "loss/hidden": 0.455078125, "loss/logits": 0.0905875563621521, "loss/reg": 8730.0625, "step": 1255 }, { "epoch": 0.0314, "grad_norm": 1.098836898803711, "grad_norm_var": 0.011025998642754093, "learning_rate": 0.0001, "loss": 8705.5156, "loss/crossentropy": 3.0155763626098633, "loss/hidden": 0.484375, "loss/logits": 0.11086156964302063, "loss/reg": 8704.919921875, "step": 1256 }, { "epoch": 0.031425, "grad_norm": 1.2236031293869019, "grad_norm_var": 0.011263479458820077, "learning_rate": 0.0001, "loss": 8680.752, "loss/crossentropy": 2.602417230606079, "loss/hidden": 0.48828125, "loss/logits": 0.09238439798355103, "loss/reg": 8680.1708984375, "step": 1257 }, { "epoch": 0.03145, "grad_norm": 0.8990491032600403, "grad_norm_var": 0.010929796606747866, "learning_rate": 0.0001, "loss": 8656.1475, "loss/crossentropy": 2.729997396469116, "loss/hidden": 0.435546875, "loss/logits": 0.09076844155788422, "loss/reg": 8655.62109375, "step": 1258 }, { "epoch": 0.031475, "grad_norm": 0.8509149551391602, "grad_norm_var": 0.01142910663300564, "learning_rate": 0.0001, "loss": 8631.0742, "loss/crossentropy": 2.835317611694336, "loss/hidden": 0.462890625, "loss/logits": 0.07381744682788849, "loss/reg": 8630.537109375, "step": 1259 }, { "epoch": 0.0315, "grad_norm": 1.0317010879516602, "grad_norm_var": 0.011478335594675985, "learning_rate": 0.0001, "loss": 8605.9824, "loss/crossentropy": 2.855336904525757, "loss/hidden": 0.48046875, "loss/logits": 0.10006128251552582, "loss/reg": 8605.40234375, "step": 1260 }, { "epoch": 0.031525, "grad_norm": 1.0000284910202026, "grad_norm_var": 0.01049239253982955, "learning_rate": 0.0001, "loss": 8580.8525, "loss/crossentropy": 2.6447980403900146, "loss/hidden": 0.484375, "loss/logits": 0.08910873532295227, "loss/reg": 8580.279296875, "step": 1261 }, { "epoch": 0.03155, "grad_norm": 0.962989330291748, "grad_norm_var": 0.010563021713161888, "learning_rate": 0.0001, "loss": 8555.9385, "loss/crossentropy": 2.884161949157715, "loss/hidden": 0.46484375, "loss/logits": 0.08687566220760345, "loss/reg": 8555.38671875, "step": 1262 }, { "epoch": 0.031575, "grad_norm": 0.9393940567970276, "grad_norm_var": 0.010477345104453603, "learning_rate": 0.0001, "loss": 8531.585, "loss/crossentropy": 2.914567708969116, "loss/hidden": 0.482421875, "loss/logits": 0.10453340411186218, "loss/reg": 8530.998046875, "step": 1263 }, { "epoch": 0.0316, "grad_norm": 0.8663077354431152, "grad_norm_var": 0.011394579787916293, "learning_rate": 0.0001, "loss": 8506.8096, "loss/crossentropy": 2.7986392974853516, "loss/hidden": 0.46484375, "loss/logits": 0.08872459828853607, "loss/reg": 8506.255859375, "step": 1264 }, { "epoch": 0.031625, "grad_norm": 1.0881963968276978, "grad_norm_var": 0.012004087305830646, "learning_rate": 0.0001, "loss": 8482.6777, "loss/crossentropy": 2.84379506111145, "loss/hidden": 0.482421875, "loss/logits": 0.11023124307394028, "loss/reg": 8482.0849609375, "step": 1265 }, { "epoch": 0.03165, "grad_norm": 0.8993434906005859, "grad_norm_var": 0.01187856150995447, "learning_rate": 0.0001, "loss": 8458.5137, "loss/crossentropy": 2.8666434288024902, "loss/hidden": 0.470703125, "loss/logits": 0.0925627276301384, "loss/reg": 8457.9501953125, "step": 1266 }, { "epoch": 0.031675, "grad_norm": 0.8524366021156311, "grad_norm_var": 0.012933845547619175, "learning_rate": 0.0001, "loss": 8434.5479, "loss/crossentropy": 3.014401912689209, "loss/hidden": 0.42578125, "loss/logits": 0.08417293429374695, "loss/reg": 8434.0380859375, "step": 1267 }, { "epoch": 0.0317, "grad_norm": 1.3166847229003906, "grad_norm_var": 0.019532068339086807, "learning_rate": 0.0001, "loss": 8410.9355, "loss/crossentropy": 3.2959179878234863, "loss/hidden": 0.490234375, "loss/logits": 0.11012903600931168, "loss/reg": 8410.3349609375, "step": 1268 }, { "epoch": 0.031725, "grad_norm": 0.8100549578666687, "grad_norm_var": 0.02089461048390171, "learning_rate": 0.0001, "loss": 8386.4502, "loss/crossentropy": 2.7742624282836914, "loss/hidden": 0.45703125, "loss/logits": 0.09910713881254196, "loss/reg": 8385.89453125, "step": 1269 }, { "epoch": 0.03175, "grad_norm": 0.8693830966949463, "grad_norm_var": 0.02079889777364117, "learning_rate": 0.0001, "loss": 8362.4961, "loss/crossentropy": 2.7391388416290283, "loss/hidden": 0.43359375, "loss/logits": 0.08556564152240753, "loss/reg": 8361.9765625, "step": 1270 }, { "epoch": 0.031775, "grad_norm": 1.057019829750061, "grad_norm_var": 0.02060318325980237, "learning_rate": 0.0001, "loss": 8339.002, "loss/crossentropy": 2.7165987491607666, "loss/hidden": 0.48046875, "loss/logits": 0.09790123999118805, "loss/reg": 8338.423828125, "step": 1271 }, { "epoch": 0.0318, "grad_norm": 0.9491914510726929, "grad_norm_var": 0.019738849063740106, "learning_rate": 0.0001, "loss": 8315.4844, "loss/crossentropy": 2.8127589225769043, "loss/hidden": 0.48828125, "loss/logits": 0.08863645792007446, "loss/reg": 8314.9072265625, "step": 1272 }, { "epoch": 0.031825, "grad_norm": 1.124371886253357, "grad_norm_var": 0.017078527814280875, "learning_rate": 0.0001, "loss": 8292.4492, "loss/crossentropy": 2.9964218139648438, "loss/hidden": 0.486328125, "loss/logits": 0.1079523116350174, "loss/reg": 8291.8544921875, "step": 1273 }, { "epoch": 0.03185, "grad_norm": 0.9936317205429077, "grad_norm_var": 0.016745193518145487, "learning_rate": 0.0001, "loss": 8269.1445, "loss/crossentropy": 2.620728015899658, "loss/hidden": 0.484375, "loss/logits": 0.11555637419223785, "loss/reg": 8268.544921875, "step": 1274 }, { "epoch": 0.031875, "grad_norm": 1.2509433031082153, "grad_norm_var": 0.020089437443413738, "learning_rate": 0.0001, "loss": 8245.5898, "loss/crossentropy": 2.6017544269561768, "loss/hidden": 0.46875, "loss/logits": 0.09242773801088333, "loss/reg": 8245.0283203125, "step": 1275 }, { "epoch": 0.0319, "grad_norm": 1.5470917224884033, "grad_norm_var": 0.03881945897441915, "learning_rate": 0.0001, "loss": 8221.9727, "loss/crossentropy": 2.775578260421753, "loss/hidden": 0.5078125, "loss/logits": 0.10969553142786026, "loss/reg": 8221.35546875, "step": 1276 }, { "epoch": 0.031925, "grad_norm": 0.9281179308891296, "grad_norm_var": 0.03945822977643634, "learning_rate": 0.0001, "loss": 8198.8486, "loss/crossentropy": 2.7830638885498047, "loss/hidden": 0.482421875, "loss/logits": 0.09233975410461426, "loss/reg": 8198.2734375, "step": 1277 }, { "epoch": 0.03195, "grad_norm": 0.7995396256446838, "grad_norm_var": 0.042554514376413964, "learning_rate": 0.0001, "loss": 8175.2583, "loss/crossentropy": 2.6517460346221924, "loss/hidden": 0.43359375, "loss/logits": 0.0899507999420166, "loss/reg": 8174.73486328125, "step": 1278 }, { "epoch": 0.031975, "grad_norm": 0.8659801483154297, "grad_norm_var": 0.04366306924828033, "learning_rate": 0.0001, "loss": 8152.5845, "loss/crossentropy": 2.704268455505371, "loss/hidden": 0.4765625, "loss/logits": 0.09852954745292664, "loss/reg": 8152.00927734375, "step": 1279 }, { "epoch": 0.032, "grad_norm": 0.8435584902763367, "grad_norm_var": 0.044142318153788655, "learning_rate": 0.0001, "loss": 8129.3618, "loss/crossentropy": 2.626718044281006, "loss/hidden": 0.482421875, "loss/logits": 0.08609582483768463, "loss/reg": 8128.79345703125, "step": 1280 }, { "epoch": 0.032025, "grad_norm": 1.0621079206466675, "grad_norm_var": 0.04392058058480234, "learning_rate": 0.0001, "loss": 8106.6753, "loss/crossentropy": 2.4879398345947266, "loss/hidden": 0.51171875, "loss/logits": 0.11241374909877777, "loss/reg": 8106.05126953125, "step": 1281 }, { "epoch": 0.03205, "grad_norm": 1.3785101175308228, "grad_norm_var": 0.05116313883933944, "learning_rate": 0.0001, "loss": 8083.8066, "loss/crossentropy": 2.842101573944092, "loss/hidden": 0.5625, "loss/logits": 0.12760193645954132, "loss/reg": 8083.11669921875, "step": 1282 }, { "epoch": 0.032075, "grad_norm": 0.8804663419723511, "grad_norm_var": 0.05050924825759508, "learning_rate": 0.0001, "loss": 8060.4771, "loss/crossentropy": 2.930004358291626, "loss/hidden": 0.474609375, "loss/logits": 0.10016739368438721, "loss/reg": 8059.90234375, "step": 1283 }, { "epoch": 0.0321, "grad_norm": 0.8530605435371399, "grad_norm_var": 0.04698137364486722, "learning_rate": 0.0001, "loss": 8038.0005, "loss/crossentropy": 2.982766628265381, "loss/hidden": 0.462890625, "loss/logits": 0.09573029726743698, "loss/reg": 8037.44189453125, "step": 1284 }, { "epoch": 0.032125, "grad_norm": 1.142184853553772, "grad_norm_var": 0.04487463069720589, "learning_rate": 0.0001, "loss": 8015.4233, "loss/crossentropy": 2.594120502471924, "loss/hidden": 0.51171875, "loss/logits": 0.0928168073296547, "loss/reg": 8014.81884765625, "step": 1285 }, { "epoch": 0.03215, "grad_norm": 0.9862678050994873, "grad_norm_var": 0.043161885828666806, "learning_rate": 0.0001, "loss": 7992.7622, "loss/crossentropy": 2.7711265087127686, "loss/hidden": 0.470703125, "loss/logits": 0.092730812728405, "loss/reg": 7992.19873046875, "step": 1286 }, { "epoch": 0.032175, "grad_norm": 2.5009279251098633, "grad_norm_var": 0.1764777313681615, "learning_rate": 0.0001, "loss": 7970.334, "loss/crossentropy": 2.942647695541382, "loss/hidden": 0.5546875, "loss/logits": 0.12212435156106949, "loss/reg": 7969.6572265625, "step": 1287 }, { "epoch": 0.0322, "grad_norm": 1.21425461769104, "grad_norm_var": 0.17442147111949607, "learning_rate": 0.0001, "loss": 7946.9565, "loss/crossentropy": 2.6966969966888428, "loss/hidden": 0.494140625, "loss/logits": 0.10218788683414459, "loss/reg": 7946.3603515625, "step": 1288 }, { "epoch": 0.032225, "grad_norm": 1.1258981227874756, "grad_norm_var": 0.1744167700820533, "learning_rate": 0.0001, "loss": 7924.2075, "loss/crossentropy": 2.7866528034210205, "loss/hidden": 0.474609375, "loss/logits": 0.09944634884595871, "loss/reg": 7923.63330078125, "step": 1289 }, { "epoch": 0.03225, "grad_norm": 1.285189151763916, "grad_norm_var": 0.1737176325484464, "learning_rate": 0.0001, "loss": 7901.8428, "loss/crossentropy": 2.994377851486206, "loss/hidden": 0.490234375, "loss/logits": 0.09650782495737076, "loss/reg": 7901.255859375, "step": 1290 }, { "epoch": 0.032275, "grad_norm": 1.1900367736816406, "grad_norm_var": 0.1732637797998457, "learning_rate": 0.0001, "loss": 7879.4111, "loss/crossentropy": 2.7183420658111572, "loss/hidden": 0.486328125, "loss/logits": 0.11161264032125473, "loss/reg": 7878.81298828125, "step": 1291 }, { "epoch": 0.0323, "grad_norm": 1.0578922033309937, "grad_norm_var": 0.16314844204604045, "learning_rate": 0.0001, "loss": 7857.1812, "loss/crossentropy": 2.464585781097412, "loss/hidden": 0.4921875, "loss/logits": 0.07893523573875427, "loss/reg": 7856.60986328125, "step": 1292 }, { "epoch": 0.032325, "grad_norm": 0.8201889395713806, "grad_norm_var": 0.16681224735512307, "learning_rate": 0.0001, "loss": 7835.291, "loss/crossentropy": 2.7440521717071533, "loss/hidden": 0.478515625, "loss/logits": 0.0926779955625534, "loss/reg": 7834.7197265625, "step": 1293 }, { "epoch": 0.03235, "grad_norm": 0.8110074400901794, "grad_norm_var": 0.16632224475366872, "learning_rate": 0.0001, "loss": 7813.4004, "loss/crossentropy": 2.948300361633301, "loss/hidden": 0.482421875, "loss/logits": 0.0882749855518341, "loss/reg": 7812.82958984375, "step": 1294 }, { "epoch": 0.032375, "grad_norm": 0.9979842901229858, "grad_norm_var": 0.16283313552398174, "learning_rate": 0.0001, "loss": 7791.646, "loss/crossentropy": 2.79728102684021, "loss/hidden": 0.48046875, "loss/logits": 0.09832129627466202, "loss/reg": 7791.0673828125, "step": 1295 }, { "epoch": 0.0324, "grad_norm": 0.9378174543380737, "grad_norm_var": 0.15973385564981984, "learning_rate": 0.0001, "loss": 7769.6074, "loss/crossentropy": 2.986835241317749, "loss/hidden": 0.443359375, "loss/logits": 0.08560170233249664, "loss/reg": 7769.07861328125, "step": 1296 }, { "epoch": 0.032425, "grad_norm": 1.0474334955215454, "grad_norm_var": 0.15990018120436067, "learning_rate": 0.0001, "loss": 7748.0244, "loss/crossentropy": 2.919323682785034, "loss/hidden": 0.486328125, "loss/logits": 0.11888301372528076, "loss/reg": 7747.41943359375, "step": 1297 }, { "epoch": 0.03245, "grad_norm": 1.028988242149353, "grad_norm_var": 0.15638857063354974, "learning_rate": 0.0001, "loss": 7726.6182, "loss/crossentropy": 2.753113031387329, "loss/hidden": 0.48828125, "loss/logits": 0.08861382305622101, "loss/reg": 7726.04150390625, "step": 1298 }, { "epoch": 0.032475, "grad_norm": 0.9262399077415466, "grad_norm_var": 0.15507302500825274, "learning_rate": 0.0001, "loss": 7705.2729, "loss/crossentropy": 2.8557252883911133, "loss/hidden": 0.48828125, "loss/logits": 0.08935841172933578, "loss/reg": 7704.6953125, "step": 1299 }, { "epoch": 0.0325, "grad_norm": 1.2030805349349976, "grad_norm_var": 0.15025659494356405, "learning_rate": 0.0001, "loss": 7684.001, "loss/crossentropy": 2.6697678565979004, "loss/hidden": 0.5078125, "loss/logits": 0.11960919201374054, "loss/reg": 7683.37353515625, "step": 1300 }, { "epoch": 0.032525, "grad_norm": 1.3810847997665405, "grad_norm_var": 0.15382280474053756, "learning_rate": 0.0001, "loss": 7662.5615, "loss/crossentropy": 2.75545072555542, "loss/hidden": 0.5078125, "loss/logits": 0.11684276163578033, "loss/reg": 7661.93701171875, "step": 1301 }, { "epoch": 0.03255, "grad_norm": 1.2311424016952515, "grad_norm_var": 0.15199145426370816, "learning_rate": 0.0001, "loss": 7641.6509, "loss/crossentropy": 3.2293875217437744, "loss/hidden": 0.5, "loss/logits": 0.133016437292099, "loss/reg": 7641.01806640625, "step": 1302 }, { "epoch": 0.032575, "grad_norm": 1.601858377456665, "grad_norm_var": 0.04325904037096214, "learning_rate": 0.0001, "loss": 7620.6187, "loss/crossentropy": 3.1844873428344727, "loss/hidden": 0.52734375, "loss/logits": 0.10423330962657928, "loss/reg": 7619.9873046875, "step": 1303 }, { "epoch": 0.0326, "grad_norm": 0.9981883764266968, "grad_norm_var": 0.04335360478470348, "learning_rate": 0.0001, "loss": 7599.3457, "loss/crossentropy": 2.9040017127990723, "loss/hidden": 0.4921875, "loss/logits": 0.10085226595401764, "loss/reg": 7598.75244140625, "step": 1304 }, { "epoch": 0.032625, "grad_norm": 1.127403974533081, "grad_norm_var": 0.04335839381164678, "learning_rate": 0.0001, "loss": 7578.4678, "loss/crossentropy": 3.017641544342041, "loss/hidden": 0.50390625, "loss/logits": 0.1118115782737732, "loss/reg": 7577.85205078125, "step": 1305 }, { "epoch": 0.03265, "grad_norm": 1.0478172302246094, "grad_norm_var": 0.04110889794239984, "learning_rate": 0.0001, "loss": 7557.6021, "loss/crossentropy": 3.1238770484924316, "loss/hidden": 0.48828125, "loss/logits": 0.09554004669189453, "loss/reg": 7557.01806640625, "step": 1306 }, { "epoch": 0.032675, "grad_norm": 1.0185546875, "grad_norm_var": 0.040614017374103625, "learning_rate": 0.0001, "loss": 7536.9092, "loss/crossentropy": 3.1412482261657715, "loss/hidden": 0.4921875, "loss/logits": 0.10232207924127579, "loss/reg": 7536.314453125, "step": 1307 }, { "epoch": 0.0327, "grad_norm": 1.301594614982605, "grad_norm_var": 0.04369555513625844, "learning_rate": 0.0001, "loss": 7515.9883, "loss/crossentropy": 3.0319020748138428, "loss/hidden": 0.515625, "loss/logits": 0.09871071577072144, "loss/reg": 7515.3740234375, "step": 1308 }, { "epoch": 0.032725, "grad_norm": 0.8953453302383423, "grad_norm_var": 0.04131955548779486, "learning_rate": 0.0001, "loss": 7495.7622, "loss/crossentropy": 2.6120543479919434, "loss/hidden": 0.48828125, "loss/logits": 0.09244601428508759, "loss/reg": 7495.181640625, "step": 1309 }, { "epoch": 0.03275, "grad_norm": 0.9810869097709656, "grad_norm_var": 0.0366369472383818, "learning_rate": 0.0001, "loss": 7474.7329, "loss/crossentropy": 3.0907046794891357, "loss/hidden": 0.484375, "loss/logits": 0.11530554294586182, "loss/reg": 7474.13330078125, "step": 1310 }, { "epoch": 0.032775, "grad_norm": 1.078172206878662, "grad_norm_var": 0.03586416117602245, "learning_rate": 0.0001, "loss": 7453.2261, "loss/crossentropy": 3.0059869289398193, "loss/hidden": 0.51953125, "loss/logits": 0.11619419604539871, "loss/reg": 7452.59033203125, "step": 1311 }, { "epoch": 0.0328, "grad_norm": 0.8973978161811829, "grad_norm_var": 0.03690964090499163, "learning_rate": 0.0001, "loss": 7432.4702, "loss/crossentropy": 2.6508216857910156, "loss/hidden": 0.478515625, "loss/logits": 0.08029112219810486, "loss/reg": 7431.91162109375, "step": 1312 }, { "epoch": 0.032825, "grad_norm": 1.2111337184906006, "grad_norm_var": 0.03721152961874593, "learning_rate": 0.0001, "loss": 7412.1855, "loss/crossentropy": 2.8881707191467285, "loss/hidden": 0.48828125, "loss/logits": 0.0881146490573883, "loss/reg": 7411.609375, "step": 1313 }, { "epoch": 0.03285, "grad_norm": 0.8852730393409729, "grad_norm_var": 0.04025726348707579, "learning_rate": 0.0001, "loss": 7392.3027, "loss/crossentropy": 3.0368635654449463, "loss/hidden": 0.4609375, "loss/logits": 0.11413509398698807, "loss/reg": 7391.7275390625, "step": 1314 }, { "epoch": 0.032875, "grad_norm": 0.9257658123970032, "grad_norm_var": 0.04026899375604476, "learning_rate": 0.0001, "loss": 7372.4961, "loss/crossentropy": 2.818535804748535, "loss/hidden": 0.48046875, "loss/logits": 0.09589619934558868, "loss/reg": 7371.919921875, "step": 1315 }, { "epoch": 0.0329, "grad_norm": 0.9861080646514893, "grad_norm_var": 0.040563542561517354, "learning_rate": 0.0001, "loss": 7352.4141, "loss/crossentropy": 2.960479974746704, "loss/hidden": 0.486328125, "loss/logits": 0.09554879367351532, "loss/reg": 7351.83203125, "step": 1316 }, { "epoch": 0.032925, "grad_norm": 1.205779790878296, "grad_norm_var": 0.03586735238293806, "learning_rate": 0.0001, "loss": 7332.7109, "loss/crossentropy": 2.8014562129974365, "loss/hidden": 0.53125, "loss/logits": 0.10308459401130676, "loss/reg": 7332.07666015625, "step": 1317 }, { "epoch": 0.03295, "grad_norm": 1.0991089344024658, "grad_norm_var": 0.03442003994635622, "learning_rate": 0.0001, "loss": 7312.6997, "loss/crossentropy": 2.7398338317871094, "loss/hidden": 0.48828125, "loss/logits": 0.09255098551511765, "loss/reg": 7312.11865234375, "step": 1318 }, { "epoch": 0.032975, "grad_norm": 1.1162376403808594, "grad_norm_var": 0.01529067172337856, "learning_rate": 0.0001, "loss": 7292.959, "loss/crossentropy": 3.011981964111328, "loss/hidden": 0.486328125, "loss/logits": 0.0965576171875, "loss/reg": 7292.3759765625, "step": 1319 }, { "epoch": 0.033, "grad_norm": 1.3372790813446045, "grad_norm_var": 0.02020530026863927, "learning_rate": 0.0001, "loss": 7273.0537, "loss/crossentropy": 2.8459720611572266, "loss/hidden": 0.515625, "loss/logits": 0.11112953722476959, "loss/reg": 7272.4267578125, "step": 1320 }, { "epoch": 0.033025, "grad_norm": 1.3002338409423828, "grad_norm_var": 0.02340355833699679, "learning_rate": 0.0001, "loss": 7252.2334, "loss/crossentropy": 2.893174171447754, "loss/hidden": 0.48828125, "loss/logits": 0.08477090299129486, "loss/reg": 7251.66015625, "step": 1321 }, { "epoch": 0.03305, "grad_norm": 1.2420011758804321, "grad_norm_var": 0.02491587351695799, "learning_rate": 0.0001, "loss": 7232.2285, "loss/crossentropy": 2.4793474674224854, "loss/hidden": 0.515625, "loss/logits": 0.11036118865013123, "loss/reg": 7231.6025390625, "step": 1322 }, { "epoch": 0.033075, "grad_norm": 0.8365013003349304, "grad_norm_var": 0.02878389821853061, "learning_rate": 0.0001, "loss": 7212.4717, "loss/crossentropy": 2.832367181777954, "loss/hidden": 0.478515625, "loss/logits": 0.08904410898685455, "loss/reg": 7211.904296875, "step": 1323 }, { "epoch": 0.0331, "grad_norm": 1.0674488544464111, "grad_norm_var": 0.025329465113824397, "learning_rate": 0.0001, "loss": 7193.1782, "loss/crossentropy": 2.6910011768341064, "loss/hidden": 0.5, "loss/logits": 0.09814359247684479, "loss/reg": 7192.580078125, "step": 1324 }, { "epoch": 0.033125, "grad_norm": 1.0702201128005981, "grad_norm_var": 0.023248765074305178, "learning_rate": 0.0001, "loss": 7173.7251, "loss/crossentropy": 2.985245943069458, "loss/hidden": 0.48046875, "loss/logits": 0.08581198006868362, "loss/reg": 7173.15869140625, "step": 1325 }, { "epoch": 0.03315, "grad_norm": 1.0932273864746094, "grad_norm_var": 0.022593393543815843, "learning_rate": 0.0001, "loss": 7154.1875, "loss/crossentropy": 3.2283401489257812, "loss/hidden": 0.494140625, "loss/logits": 0.11157424747943878, "loss/reg": 7153.58154296875, "step": 1326 }, { "epoch": 0.033175, "grad_norm": 1.1343928575515747, "grad_norm_var": 0.022743559564361013, "learning_rate": 0.0001, "loss": 7135.0659, "loss/crossentropy": 2.826775074005127, "loss/hidden": 0.490234375, "loss/logits": 0.115767702460289, "loss/reg": 7134.4599609375, "step": 1327 }, { "epoch": 0.0332, "grad_norm": 1.005599021911621, "grad_norm_var": 0.020725394890757073, "learning_rate": 0.0001, "loss": 7115.8457, "loss/crossentropy": 2.8775322437286377, "loss/hidden": 0.482421875, "loss/logits": 0.10473422706127167, "loss/reg": 7115.2587890625, "step": 1328 }, { "epoch": 0.033225, "grad_norm": 0.8706186413764954, "grad_norm_var": 0.022689128924057892, "learning_rate": 0.0001, "loss": 7096.7861, "loss/crossentropy": 2.806633710861206, "loss/hidden": 0.49609375, "loss/logits": 0.10028624534606934, "loss/reg": 7096.18994140625, "step": 1329 }, { "epoch": 0.03325, "grad_norm": 0.7180449962615967, "grad_norm_var": 0.028633580337355472, "learning_rate": 0.0001, "loss": 7077.8433, "loss/crossentropy": 2.5447871685028076, "loss/hidden": 0.46875, "loss/logits": 0.08894561231136322, "loss/reg": 7077.28564453125, "step": 1330 }, { "epoch": 0.033275, "grad_norm": 0.8574270009994507, "grad_norm_var": 0.03017624676980996, "learning_rate": 0.0001, "loss": 7059.2168, "loss/crossentropy": 2.8060553073883057, "loss/hidden": 0.478515625, "loss/logits": 0.09654555469751358, "loss/reg": 7058.6416015625, "step": 1331 }, { "epoch": 0.0333, "grad_norm": 0.9025835394859314, "grad_norm_var": 0.0314214120190319, "learning_rate": 0.0001, "loss": 7040.6797, "loss/crossentropy": 2.8328611850738525, "loss/hidden": 0.48828125, "loss/logits": 0.11992849409580231, "loss/reg": 7040.0712890625, "step": 1332 }, { "epoch": 0.033325, "grad_norm": 0.8659266233444214, "grad_norm_var": 0.031741798051395625, "learning_rate": 0.0001, "loss": 7022.3887, "loss/crossentropy": 2.7338974475860596, "loss/hidden": 0.48046875, "loss/logits": 0.09682288765907288, "loss/reg": 7021.8115234375, "step": 1333 }, { "epoch": 0.03335, "grad_norm": 1.019660472869873, "grad_norm_var": 0.03142861984241999, "learning_rate": 0.0001, "loss": 7004.1777, "loss/crossentropy": 3.012676477432251, "loss/hidden": 0.48828125, "loss/logits": 0.09259988367557526, "loss/reg": 7003.5966796875, "step": 1334 }, { "epoch": 0.033375, "grad_norm": 1.0380840301513672, "grad_norm_var": 0.030883988419247627, "learning_rate": 0.0001, "loss": 6985.627, "loss/crossentropy": 2.9568233489990234, "loss/hidden": 0.4765625, "loss/logits": 0.09427786618471146, "loss/reg": 6985.05615234375, "step": 1335 }, { "epoch": 0.0334, "grad_norm": 1.1338788270950317, "grad_norm_var": 0.02493162497171597, "learning_rate": 0.0001, "loss": 6967.625, "loss/crossentropy": 2.920952558517456, "loss/hidden": 0.484375, "loss/logits": 0.0969780683517456, "loss/reg": 6967.04345703125, "step": 1336 }, { "epoch": 0.033425, "grad_norm": 0.9520040154457092, "grad_norm_var": 0.01902283493770819, "learning_rate": 0.0001, "loss": 6949.4731, "loss/crossentropy": 2.965240240097046, "loss/hidden": 0.48046875, "loss/logits": 0.09136784076690674, "loss/reg": 6948.9013671875, "step": 1337 }, { "epoch": 0.03345, "grad_norm": 0.8833340406417847, "grad_norm_var": 0.014914911312502947, "learning_rate": 0.0001, "loss": 6931.3071, "loss/crossentropy": 2.8117728233337402, "loss/hidden": 0.47265625, "loss/logits": 0.07982734590768814, "loss/reg": 6930.7548828125, "step": 1338 }, { "epoch": 0.033475, "grad_norm": 1.3426650762557983, "grad_norm_var": 0.022217578237794657, "learning_rate": 0.0001, "loss": 6913.1899, "loss/crossentropy": 3.039302110671997, "loss/hidden": 0.5390625, "loss/logits": 0.13471244275569916, "loss/reg": 6912.51611328125, "step": 1339 }, { "epoch": 0.0335, "grad_norm": 1.1176189184188843, "grad_norm_var": 0.022844846877402233, "learning_rate": 0.0001, "loss": 6894.9351, "loss/crossentropy": 2.8302769660949707, "loss/hidden": 0.482421875, "loss/logits": 0.09411150962114334, "loss/reg": 6894.3583984375, "step": 1340 }, { "epoch": 0.033525, "grad_norm": 1.0748586654663086, "grad_norm_var": 0.022889416618989467, "learning_rate": 0.0001, "loss": 6876.7759, "loss/crossentropy": 2.611422538757324, "loss/hidden": 0.486328125, "loss/logits": 0.08938396722078323, "loss/reg": 6876.2001953125, "step": 1341 }, { "epoch": 0.03355, "grad_norm": 0.9157936573028564, "grad_norm_var": 0.0226662033596554, "learning_rate": 0.0001, "loss": 6859.0195, "loss/crossentropy": 2.811737298965454, "loss/hidden": 0.482421875, "loss/logits": 0.10107594728469849, "loss/reg": 6858.43603515625, "step": 1342 }, { "epoch": 0.033575, "grad_norm": 0.7973225116729736, "grad_norm_var": 0.02325672885853861, "learning_rate": 0.0001, "loss": 6841.3901, "loss/crossentropy": 2.892944097518921, "loss/hidden": 0.486328125, "loss/logits": 0.09526005387306213, "loss/reg": 6840.80859375, "step": 1343 }, { "epoch": 0.0336, "grad_norm": 0.9367047548294067, "grad_norm_var": 0.02321225915422347, "learning_rate": 0.0001, "loss": 6823.8916, "loss/crossentropy": 2.7473795413970947, "loss/hidden": 0.5, "loss/logits": 0.11404645442962646, "loss/reg": 6823.27734375, "step": 1344 }, { "epoch": 0.033625, "grad_norm": 0.8657424449920654, "grad_norm_var": 0.02327456064868153, "learning_rate": 0.0001, "loss": 6806.124, "loss/crossentropy": 2.865846872329712, "loss/hidden": 0.48828125, "loss/logits": 0.1009342223405838, "loss/reg": 6805.53466796875, "step": 1345 }, { "epoch": 0.03365, "grad_norm": 1.054735779762268, "grad_norm_var": 0.019324760377264265, "learning_rate": 0.0001, "loss": 6788.7681, "loss/crossentropy": 3.1837844848632812, "loss/hidden": 0.51953125, "loss/logits": 0.11502991616725922, "loss/reg": 6788.13330078125, "step": 1346 }, { "epoch": 0.033675, "grad_norm": 0.9969978928565979, "grad_norm_var": 0.018170129173570717, "learning_rate": 0.0001, "loss": 6770.9668, "loss/crossentropy": 2.617208480834961, "loss/hidden": 0.484375, "loss/logits": 0.10487374663352966, "loss/reg": 6770.37744140625, "step": 1347 }, { "epoch": 0.0337, "grad_norm": 0.8903811573982239, "grad_norm_var": 0.01832754930692624, "learning_rate": 0.0001, "loss": 6753.5425, "loss/crossentropy": 2.6439528465270996, "loss/hidden": 0.4921875, "loss/logits": 0.0947711169719696, "loss/reg": 6752.95556640625, "step": 1348 }, { "epoch": 0.033725, "grad_norm": 1.1429519653320312, "grad_norm_var": 0.018435605742367623, "learning_rate": 0.0001, "loss": 6736.2373, "loss/crossentropy": 3.1083426475524902, "loss/hidden": 0.5078125, "loss/logits": 0.10791288316249847, "loss/reg": 6735.62158203125, "step": 1349 }, { "epoch": 0.03375, "grad_norm": 1.1418884992599487, "grad_norm_var": 0.019523988822685096, "learning_rate": 0.0001, "loss": 6718.9263, "loss/crossentropy": 2.8117763996124268, "loss/hidden": 0.50390625, "loss/logits": 0.12460052967071533, "loss/reg": 6718.2978515625, "step": 1350 }, { "epoch": 0.033775, "grad_norm": 0.87659752368927, "grad_norm_var": 0.020717329095012702, "learning_rate": 0.0001, "loss": 6701.9033, "loss/crossentropy": 2.945575475692749, "loss/hidden": 0.4765625, "loss/logits": 0.08381037414073944, "loss/reg": 6701.3427734375, "step": 1351 }, { "epoch": 0.0338, "grad_norm": 1.689099669456482, "grad_norm_var": 0.049323888443802116, "learning_rate": 0.0001, "loss": 6684.8394, "loss/crossentropy": 2.7382237911224365, "loss/hidden": 0.54296875, "loss/logits": 0.1253061294555664, "loss/reg": 6684.1708984375, "step": 1352 }, { "epoch": 0.033825, "grad_norm": 1.2289764881134033, "grad_norm_var": 0.05077952008831321, "learning_rate": 0.0001, "loss": 6667.2241, "loss/crossentropy": 2.894232988357544, "loss/hidden": 0.51171875, "loss/logits": 0.1199125349521637, "loss/reg": 6666.59228515625, "step": 1353 }, { "epoch": 0.03385, "grad_norm": 0.9368696212768555, "grad_norm_var": 0.04969952561096349, "learning_rate": 0.0001, "loss": 6650.0176, "loss/crossentropy": 2.712751626968384, "loss/hidden": 0.484375, "loss/logits": 0.09183096885681152, "loss/reg": 6649.44140625, "step": 1354 }, { "epoch": 0.033875, "grad_norm": 1.061965823173523, "grad_norm_var": 0.04415994410644473, "learning_rate": 0.0001, "loss": 6632.9712, "loss/crossentropy": 3.0443739891052246, "loss/hidden": 0.5, "loss/logits": 0.11184091866016388, "loss/reg": 6632.359375, "step": 1355 }, { "epoch": 0.0339, "grad_norm": 1.4387856721878052, "grad_norm_var": 0.05369363995281968, "learning_rate": 0.0001, "loss": 6615.7505, "loss/crossentropy": 2.9656739234924316, "loss/hidden": 0.53515625, "loss/logits": 0.11242452263832092, "loss/reg": 6615.10302734375, "step": 1356 }, { "epoch": 0.033925, "grad_norm": 0.915932834148407, "grad_norm_var": 0.05507613206998211, "learning_rate": 0.0001, "loss": 6598.4375, "loss/crossentropy": 2.9624454975128174, "loss/hidden": 0.494140625, "loss/logits": 0.10640692710876465, "loss/reg": 6597.8369140625, "step": 1357 }, { "epoch": 0.03395, "grad_norm": 0.7371416091918945, "grad_norm_var": 0.06040284854185741, "learning_rate": 0.0001, "loss": 6581.6763, "loss/crossentropy": 2.574276924133301, "loss/hidden": 0.48828125, "loss/logits": 0.09294277429580688, "loss/reg": 6581.09521484375, "step": 1358 }, { "epoch": 0.033975, "grad_norm": 0.9200770854949951, "grad_norm_var": 0.05729892266983065, "learning_rate": 0.0001, "loss": 6565.1777, "loss/crossentropy": 2.9009039402008057, "loss/hidden": 0.5078125, "loss/logits": 0.09284225851297379, "loss/reg": 6564.5771484375, "step": 1359 }, { "epoch": 0.034, "grad_norm": 0.8759512305259705, "grad_norm_var": 0.058464997552856585, "learning_rate": 0.0001, "loss": 6548.5493, "loss/crossentropy": 2.875610589981079, "loss/hidden": 0.48828125, "loss/logits": 0.09936296939849854, "loss/reg": 6547.9619140625, "step": 1360 }, { "epoch": 0.034025, "grad_norm": 0.8347700238227844, "grad_norm_var": 0.05927918748393553, "learning_rate": 0.0001, "loss": 6532.1455, "loss/crossentropy": 2.864703893661499, "loss/hidden": 0.4765625, "loss/logits": 0.08602680265903473, "loss/reg": 6531.5830078125, "step": 1361 }, { "epoch": 0.03405, "grad_norm": 0.8607924580574036, "grad_norm_var": 0.06141567570151428, "learning_rate": 0.0001, "loss": 6515.8506, "loss/crossentropy": 2.605031728744507, "loss/hidden": 0.49609375, "loss/logits": 0.10415361821651459, "loss/reg": 6515.25048828125, "step": 1362 }, { "epoch": 0.034075, "grad_norm": 0.9597486853599548, "grad_norm_var": 0.0616877756756257, "learning_rate": 0.0001, "loss": 6499.5688, "loss/crossentropy": 2.887112855911255, "loss/hidden": 0.490234375, "loss/logits": 0.09598223119974136, "loss/reg": 6498.982421875, "step": 1363 }, { "epoch": 0.0341, "grad_norm": 2.3581037521362305, "grad_norm_var": 0.16861243853628496, "learning_rate": 0.0001, "loss": 6483.1235, "loss/crossentropy": 2.8899037837982178, "loss/hidden": 0.51171875, "loss/logits": 0.19754047691822052, "loss/reg": 6482.4140625, "step": 1364 }, { "epoch": 0.034125, "grad_norm": 1.3278796672821045, "grad_norm_var": 0.17122382768664993, "learning_rate": 0.0001, "loss": 6465.9492, "loss/crossentropy": 2.6608476638793945, "loss/hidden": 0.5390625, "loss/logits": 0.12474717944860458, "loss/reg": 6465.28564453125, "step": 1365 }, { "epoch": 0.03415, "grad_norm": 0.8733435273170471, "grad_norm_var": 0.17549470413593488, "learning_rate": 0.0001, "loss": 6449.3564, "loss/crossentropy": 2.6548588275909424, "loss/hidden": 0.484375, "loss/logits": 0.09747719019651413, "loss/reg": 6448.7744140625, "step": 1366 }, { "epoch": 0.034175, "grad_norm": 1.191423773765564, "grad_norm_var": 0.17153503360068617, "learning_rate": 0.0001, "loss": 6433.3599, "loss/crossentropy": 2.8686625957489014, "loss/hidden": 0.54296875, "loss/logits": 0.13953447341918945, "loss/reg": 6432.67724609375, "step": 1367 }, { "epoch": 0.0342, "grad_norm": 0.9046757817268372, "grad_norm_var": 0.15237186112470422, "learning_rate": 0.0001, "loss": 6417.104, "loss/crossentropy": 2.729344129562378, "loss/hidden": 0.48828125, "loss/logits": 0.11070854961872101, "loss/reg": 6416.5048828125, "step": 1368 }, { "epoch": 0.034225, "grad_norm": 0.7749925851821899, "grad_norm_var": 0.1567894782132723, "learning_rate": 0.0001, "loss": 6400.7339, "loss/crossentropy": 2.8414149284362793, "loss/hidden": 0.4765625, "loss/logits": 0.08838851749897003, "loss/reg": 6400.1689453125, "step": 1369 }, { "epoch": 0.03425, "grad_norm": 0.8586689233779907, "grad_norm_var": 0.158463654422662, "learning_rate": 0.0001, "loss": 6384.9233, "loss/crossentropy": 2.941861152648926, "loss/hidden": 0.484375, "loss/logits": 0.10005222260951996, "loss/reg": 6384.3388671875, "step": 1370 }, { "epoch": 0.034275, "grad_norm": 0.9387425780296326, "grad_norm_var": 0.15931284187209346, "learning_rate": 0.0001, "loss": 6368.7803, "loss/crossentropy": 2.8324739933013916, "loss/hidden": 0.48828125, "loss/logits": 0.09414509683847427, "loss/reg": 6368.19775390625, "step": 1371 }, { "epoch": 0.0343, "grad_norm": 1.0363467931747437, "grad_norm_var": 0.14847634007850463, "learning_rate": 0.0001, "loss": 6353.0469, "loss/crossentropy": 2.7621524333953857, "loss/hidden": 0.5, "loss/logits": 0.11550111323595047, "loss/reg": 6352.43115234375, "step": 1372 }, { "epoch": 0.034325, "grad_norm": 1.0415022373199463, "grad_norm_var": 0.14766861966588107, "learning_rate": 0.0001, "loss": 6337.2188, "loss/crossentropy": 2.695035457611084, "loss/hidden": 0.53515625, "loss/logits": 0.12410050630569458, "loss/reg": 6336.5595703125, "step": 1373 }, { "epoch": 0.03435, "grad_norm": 0.9406169652938843, "grad_norm_var": 0.1422869851031559, "learning_rate": 0.0001, "loss": 6321.0205, "loss/crossentropy": 2.7569470405578613, "loss/hidden": 0.494140625, "loss/logits": 0.09523436427116394, "loss/reg": 6320.43115234375, "step": 1374 }, { "epoch": 0.034375, "grad_norm": 0.8832371830940247, "grad_norm_var": 0.1429785627751386, "learning_rate": 0.0001, "loss": 6305.4341, "loss/crossentropy": 3.0882391929626465, "loss/hidden": 0.486328125, "loss/logits": 0.11143780499696732, "loss/reg": 6304.83642578125, "step": 1375 }, { "epoch": 0.0344, "grad_norm": 1.429364800453186, "grad_norm_var": 0.14991940796402817, "learning_rate": 0.0001, "loss": 6290.0093, "loss/crossentropy": 3.073742151260376, "loss/hidden": 0.5078125, "loss/logits": 0.1092057153582573, "loss/reg": 6289.39208984375, "step": 1376 }, { "epoch": 0.034425, "grad_norm": 1.6629365682601929, "grad_norm_var": 0.16616085625258492, "learning_rate": 0.0001, "loss": 6274.0537, "loss/crossentropy": 3.041329860687256, "loss/hidden": 0.4921875, "loss/logits": 0.10428580641746521, "loss/reg": 6273.45703125, "step": 1377 }, { "epoch": 0.03445, "grad_norm": 1.0548546314239502, "grad_norm_var": 0.16160972528239753, "learning_rate": 0.0001, "loss": 6258.5503, "loss/crossentropy": 2.8612847328186035, "loss/hidden": 0.498046875, "loss/logits": 0.09525103121995926, "loss/reg": 6257.95703125, "step": 1378 }, { "epoch": 0.034475, "grad_norm": 0.9255495667457581, "grad_norm_var": 0.16250373381180752, "learning_rate": 0.0001, "loss": 6242.9575, "loss/crossentropy": 2.9928011894226074, "loss/hidden": 0.49609375, "loss/logits": 0.11399185657501221, "loss/reg": 6242.34765625, "step": 1379 }, { "epoch": 0.0345, "grad_norm": 0.872288703918457, "grad_norm_var": 0.058697168420043376, "learning_rate": 0.0001, "loss": 6227.2896, "loss/crossentropy": 2.8623993396759033, "loss/hidden": 0.486328125, "loss/logits": 0.09496334940195084, "loss/reg": 6226.70849609375, "step": 1380 }, { "epoch": 0.034525, "grad_norm": 1.1161308288574219, "grad_norm_var": 0.05350661484273282, "learning_rate": 0.0001, "loss": 6211.978, "loss/crossentropy": 3.1282906532287598, "loss/hidden": 0.48828125, "loss/logits": 0.09676229953765869, "loss/reg": 6211.39306640625, "step": 1381 }, { "epoch": 0.03455, "grad_norm": 0.9747671484947205, "grad_norm_var": 0.05201019226660599, "learning_rate": 0.0001, "loss": 6196.6685, "loss/crossentropy": 2.552155017852783, "loss/hidden": 0.53125, "loss/logits": 0.12433020770549774, "loss/reg": 6196.0126953125, "step": 1382 }, { "epoch": 0.034575, "grad_norm": 1.1094785928726196, "grad_norm_var": 0.05075227060493869, "learning_rate": 0.0001, "loss": 6181.2075, "loss/crossentropy": 2.9492762088775635, "loss/hidden": 0.48046875, "loss/logits": 0.09517276287078857, "loss/reg": 6180.6318359375, "step": 1383 }, { "epoch": 0.0346, "grad_norm": 1.1116600036621094, "grad_norm_var": 0.04989508043688596, "learning_rate": 0.0001, "loss": 6165.6851, "loss/crossentropy": 3.004523754119873, "loss/hidden": 0.4921875, "loss/logits": 0.10090765357017517, "loss/reg": 6165.091796875, "step": 1384 }, { "epoch": 0.034625, "grad_norm": 0.8511428833007812, "grad_norm_var": 0.04750895557643346, "learning_rate": 0.0001, "loss": 6150.2388, "loss/crossentropy": 2.8703434467315674, "loss/hidden": 0.486328125, "loss/logits": 0.09198765456676483, "loss/reg": 6149.66064453125, "step": 1385 }, { "epoch": 0.03465, "grad_norm": 1.1000410318374634, "grad_norm_var": 0.0449779782934615, "learning_rate": 0.0001, "loss": 6134.9727, "loss/crossentropy": 2.912747383117676, "loss/hidden": 0.49609375, "loss/logits": 0.09684903919696808, "loss/reg": 6134.3798828125, "step": 1386 }, { "epoch": 0.034675, "grad_norm": 1.5239450931549072, "grad_norm_var": 0.05648812952174939, "learning_rate": 0.0001, "loss": 6119.5884, "loss/crossentropy": 2.942798376083374, "loss/hidden": 0.515625, "loss/logits": 0.1038217693567276, "loss/reg": 6118.96875, "step": 1387 }, { "epoch": 0.0347, "grad_norm": 1.1341832876205444, "grad_norm_var": 0.056228422031379906, "learning_rate": 0.0001, "loss": 6104.0986, "loss/crossentropy": 3.102213144302368, "loss/hidden": 0.52734375, "loss/logits": 0.11665617674589157, "loss/reg": 6103.45458984375, "step": 1388 }, { "epoch": 0.034725, "grad_norm": 1.142909288406372, "grad_norm_var": 0.05596889532093123, "learning_rate": 0.0001, "loss": 6088.6807, "loss/crossentropy": 2.795085906982422, "loss/hidden": 0.494140625, "loss/logits": 0.09639351069927216, "loss/reg": 6088.09033203125, "step": 1389 }, { "epoch": 0.03475, "grad_norm": 1.1043593883514404, "grad_norm_var": 0.05384683860217643, "learning_rate": 0.0001, "loss": 6073.5322, "loss/crossentropy": 2.617596387863159, "loss/hidden": 0.48828125, "loss/logits": 0.09738233685493469, "loss/reg": 6072.94677734375, "step": 1390 }, { "epoch": 0.034775, "grad_norm": 1.5261905193328857, "grad_norm_var": 0.05897490555796399, "learning_rate": 0.0001, "loss": 6058.2266, "loss/crossentropy": 2.8050098419189453, "loss/hidden": 0.52734375, "loss/logits": 0.09992828965187073, "loss/reg": 6057.59912109375, "step": 1391 }, { "epoch": 0.0348, "grad_norm": 1.1945099830627441, "grad_norm_var": 0.054143504860030826, "learning_rate": 0.0001, "loss": 6042.606, "loss/crossentropy": 2.8568167686462402, "loss/hidden": 0.5, "loss/logits": 0.09500862658023834, "loss/reg": 6042.0107421875, "step": 1392 }, { "epoch": 0.034825, "grad_norm": 1.2398542165756226, "grad_norm_var": 0.03641311092145567, "learning_rate": 0.0001, "loss": 6027.4819, "loss/crossentropy": 3.028388261795044, "loss/hidden": 0.490234375, "loss/logits": 0.10643279552459717, "loss/reg": 6026.88525390625, "step": 1393 }, { "epoch": 0.03485, "grad_norm": 0.9621533751487732, "grad_norm_var": 0.03780320526095952, "learning_rate": 0.0001, "loss": 6012.3086, "loss/crossentropy": 2.7710087299346924, "loss/hidden": 0.48828125, "loss/logits": 0.0932699665427208, "loss/reg": 6011.72705078125, "step": 1394 }, { "epoch": 0.034875, "grad_norm": 0.8675263524055481, "grad_norm_var": 0.03950306541272881, "learning_rate": 0.0001, "loss": 5997.2275, "loss/crossentropy": 2.6906023025512695, "loss/hidden": 0.48828125, "loss/logits": 0.08551843464374542, "loss/reg": 5996.65380859375, "step": 1395 }, { "epoch": 0.0349, "grad_norm": 0.8454170823097229, "grad_norm_var": 0.040415817948201746, "learning_rate": 0.0001, "loss": 5982.5356, "loss/crossentropy": 3.0521061420440674, "loss/hidden": 0.494140625, "loss/logits": 0.09865646809339523, "loss/reg": 5981.94287109375, "step": 1396 }, { "epoch": 0.034925, "grad_norm": 0.9366073608398438, "grad_norm_var": 0.04234958763299872, "learning_rate": 0.0001, "loss": 5967.7051, "loss/crossentropy": 3.2539052963256836, "loss/hidden": 0.48828125, "loss/logits": 0.09871365129947662, "loss/reg": 5967.1181640625, "step": 1397 }, { "epoch": 0.03495, "grad_norm": 1.0396111011505127, "grad_norm_var": 0.041516266332779835, "learning_rate": 0.0001, "loss": 5953.0903, "loss/crossentropy": 2.9550793170928955, "loss/hidden": 0.4921875, "loss/logits": 0.10799872875213623, "loss/reg": 5952.490234375, "step": 1398 }, { "epoch": 0.034975, "grad_norm": 1.357835292816162, "grad_norm_var": 0.0454997901936771, "learning_rate": 0.0001, "loss": 5938.5894, "loss/crossentropy": 2.8965654373168945, "loss/hidden": 0.494140625, "loss/logits": 0.09642442315816879, "loss/reg": 5937.9990234375, "step": 1399 }, { "epoch": 0.035, "grad_norm": 0.9682173728942871, "grad_norm_var": 0.04696673725534451, "learning_rate": 0.0001, "loss": 5924.3062, "loss/crossentropy": 2.8100740909576416, "loss/hidden": 0.515625, "loss/logits": 0.10012298077344894, "loss/reg": 5923.6904296875, "step": 1400 }, { "epoch": 0.035025, "grad_norm": 0.8568408489227295, "grad_norm_var": 0.0467704669000735, "learning_rate": 0.0001, "loss": 5910.2954, "loss/crossentropy": 2.8342316150665283, "loss/hidden": 0.482421875, "loss/logits": 0.09393075108528137, "loss/reg": 5909.71923828125, "step": 1401 }, { "epoch": 0.03505, "grad_norm": 0.9594826698303223, "grad_norm_var": 0.048238988806995294, "learning_rate": 0.0001, "loss": 5896.4443, "loss/crossentropy": 2.7887749671936035, "loss/hidden": 0.4921875, "loss/logits": 0.09894032776355743, "loss/reg": 5895.85302734375, "step": 1402 }, { "epoch": 0.035075, "grad_norm": 1.1903307437896729, "grad_norm_var": 0.03650307365173426, "learning_rate": 0.0001, "loss": 5882.7915, "loss/crossentropy": 2.9541616439819336, "loss/hidden": 0.52734375, "loss/logits": 0.10406450927257538, "loss/reg": 5882.16015625, "step": 1403 }, { "epoch": 0.0351, "grad_norm": 0.9279814958572388, "grad_norm_var": 0.037749923854517545, "learning_rate": 0.0001, "loss": 5868.8516, "loss/crossentropy": 3.004645824432373, "loss/hidden": 0.48828125, "loss/logits": 0.09477942436933517, "loss/reg": 5868.2685546875, "step": 1404 }, { "epoch": 0.035125, "grad_norm": 0.9412584900856018, "grad_norm_var": 0.038330778079378014, "learning_rate": 0.0001, "loss": 5855.0098, "loss/crossentropy": 2.9767656326293945, "loss/hidden": 0.490234375, "loss/logits": 0.10276122391223907, "loss/reg": 5854.4169921875, "step": 1405 }, { "epoch": 0.03515, "grad_norm": 0.8473858833312988, "grad_norm_var": 0.0408485346006211, "learning_rate": 0.0001, "loss": 5841.4292, "loss/crossentropy": 2.7796714305877686, "loss/hidden": 0.4921875, "loss/logits": 0.10826782882213593, "loss/reg": 5840.82861328125, "step": 1406 }, { "epoch": 0.035175, "grad_norm": 1.0374072790145874, "grad_norm_var": 0.024181143338040556, "learning_rate": 0.0001, "loss": 5827.9946, "loss/crossentropy": 3.059513807296753, "loss/hidden": 0.49609375, "loss/logits": 0.09310650080442429, "loss/reg": 5827.4052734375, "step": 1407 }, { "epoch": 0.0352, "grad_norm": 1.9238723516464233, "grad_norm_var": 0.07529703372726804, "learning_rate": 0.0001, "loss": 5814.2061, "loss/crossentropy": 2.676701068878174, "loss/hidden": 0.5234375, "loss/logits": 0.09868060797452927, "loss/reg": 5813.583984375, "step": 1408 }, { "epoch": 0.035225, "grad_norm": 0.9219335317611694, "grad_norm_var": 0.07383597514061453, "learning_rate": 0.0001, "loss": 5800.8066, "loss/crossentropy": 3.179941177368164, "loss/hidden": 0.494140625, "loss/logits": 0.10882144421339035, "loss/reg": 5800.20361328125, "step": 1409 }, { "epoch": 0.03525, "grad_norm": 1.7151775360107422, "grad_norm_var": 0.10181254197773316, "learning_rate": 0.0001, "loss": 5787.3242, "loss/crossentropy": 2.928461790084839, "loss/hidden": 0.5234375, "loss/logits": 0.10902885347604752, "loss/reg": 5786.69189453125, "step": 1410 }, { "epoch": 0.035275, "grad_norm": 0.9305047988891602, "grad_norm_var": 0.10024641212645134, "learning_rate": 0.0001, "loss": 5773.6714, "loss/crossentropy": 2.8883421421051025, "loss/hidden": 0.4765625, "loss/logits": 0.08832409977912903, "loss/reg": 5773.1064453125, "step": 1411 }, { "epoch": 0.0353, "grad_norm": 0.9236626029014587, "grad_norm_var": 0.09810356131340967, "learning_rate": 0.0001, "loss": 5760.1519, "loss/crossentropy": 2.5950002670288086, "loss/hidden": 0.494140625, "loss/logits": 0.09704364836215973, "loss/reg": 5759.560546875, "step": 1412 }, { "epoch": 0.035325, "grad_norm": 1.4456361532211304, "grad_norm_var": 0.10372546345283613, "learning_rate": 0.0001, "loss": 5746.7002, "loss/crossentropy": 2.9748454093933105, "loss/hidden": 0.52734375, "loss/logits": 0.1023864597082138, "loss/reg": 5746.0703125, "step": 1413 }, { "epoch": 0.03535, "grad_norm": 1.0488073825836182, "grad_norm_var": 0.10362703347444431, "learning_rate": 0.0001, "loss": 5733.1187, "loss/crossentropy": 2.902679204940796, "loss/hidden": 0.53125, "loss/logits": 0.10343801975250244, "loss/reg": 5732.48388671875, "step": 1414 }, { "epoch": 0.035375, "grad_norm": 0.9133599400520325, "grad_norm_var": 0.1021622460378283, "learning_rate": 0.0001, "loss": 5719.3115, "loss/crossentropy": 2.8861844539642334, "loss/hidden": 0.484375, "loss/logits": 0.09216949343681335, "loss/reg": 5718.73486328125, "step": 1415 }, { "epoch": 0.0354, "grad_norm": 0.8939177393913269, "grad_norm_var": 0.10378298627301626, "learning_rate": 0.0001, "loss": 5706.0918, "loss/crossentropy": 2.7581043243408203, "loss/hidden": 0.50390625, "loss/logits": 0.11326596885919571, "loss/reg": 5705.474609375, "step": 1416 }, { "epoch": 0.035425, "grad_norm": 0.9192634224891663, "grad_norm_var": 0.1020663984992288, "learning_rate": 0.0001, "loss": 5692.8921, "loss/crossentropy": 2.9199843406677246, "loss/hidden": 0.49609375, "loss/logits": 0.09658630192279816, "loss/reg": 5692.29931640625, "step": 1417 }, { "epoch": 0.03545, "grad_norm": 1.1331703662872314, "grad_norm_var": 0.10078458101228187, "learning_rate": 0.0001, "loss": 5679.9668, "loss/crossentropy": 3.0797760486602783, "loss/hidden": 0.498046875, "loss/logits": 0.1090155616402626, "loss/reg": 5679.35986328125, "step": 1418 }, { "epoch": 0.035475, "grad_norm": 1.0098109245300293, "grad_norm_var": 0.10081809220174791, "learning_rate": 0.0001, "loss": 5666.6006, "loss/crossentropy": 2.6272778511047363, "loss/hidden": 0.48828125, "loss/logits": 0.09299825131893158, "loss/reg": 5666.01953125, "step": 1419 }, { "epoch": 0.0355, "grad_norm": 1.2260712385177612, "grad_norm_var": 0.09970081981556332, "learning_rate": 0.0001, "loss": 5653.4111, "loss/crossentropy": 2.8777990341186523, "loss/hidden": 0.5078125, "loss/logits": 0.09825815260410309, "loss/reg": 5652.80517578125, "step": 1420 }, { "epoch": 0.035525, "grad_norm": 1.435912847518921, "grad_norm_var": 0.10357068173817315, "learning_rate": 0.0001, "loss": 5640.1152, "loss/crossentropy": 2.592386245727539, "loss/hidden": 0.5234375, "loss/logits": 0.0932084172964096, "loss/reg": 5639.49853515625, "step": 1421 }, { "epoch": 0.03555, "grad_norm": 1.0495048761367798, "grad_norm_var": 0.09809354771595694, "learning_rate": 0.0001, "loss": 5627.1245, "loss/crossentropy": 2.9958133697509766, "loss/hidden": 0.4921875, "loss/logits": 0.09580527245998383, "loss/reg": 5626.53662109375, "step": 1422 }, { "epoch": 0.035575, "grad_norm": 1.34793221950531, "grad_norm_var": 0.09912718309816147, "learning_rate": 0.0001, "loss": 5614.3306, "loss/crossentropy": 3.0196943283081055, "loss/hidden": 0.5234375, "loss/logits": 0.10388782620429993, "loss/reg": 5613.703125, "step": 1423 }, { "epoch": 0.0356, "grad_norm": 1.0757725238800049, "grad_norm_var": 0.05967166391269097, "learning_rate": 0.0001, "loss": 5601.147, "loss/crossentropy": 2.849278688430786, "loss/hidden": 0.50390625, "loss/logits": 0.08855399489402771, "loss/reg": 5600.5546875, "step": 1424 }, { "epoch": 0.035625, "grad_norm": 0.9359875917434692, "grad_norm_var": 0.059304607452516166, "learning_rate": 0.0001, "loss": 5587.9556, "loss/crossentropy": 2.7247815132141113, "loss/hidden": 0.51953125, "loss/logits": 0.09469491243362427, "loss/reg": 5587.34130859375, "step": 1425 }, { "epoch": 0.03565, "grad_norm": 0.9561020135879517, "grad_norm_var": 0.03561334173187921, "learning_rate": 0.0001, "loss": 5575.0317, "loss/crossentropy": 2.8671956062316895, "loss/hidden": 0.50390625, "loss/logits": 0.13245022296905518, "loss/reg": 5574.3955078125, "step": 1426 }, { "epoch": 0.035675, "grad_norm": 0.8375259637832642, "grad_norm_var": 0.03798018083497675, "learning_rate": 0.0001, "loss": 5562.2354, "loss/crossentropy": 2.699129104614258, "loss/hidden": 0.48828125, "loss/logits": 0.0898795947432518, "loss/reg": 5561.6572265625, "step": 1427 }, { "epoch": 0.0357, "grad_norm": 0.9537240266799927, "grad_norm_var": 0.03744198728680351, "learning_rate": 0.0001, "loss": 5550.0005, "loss/crossentropy": 2.908249855041504, "loss/hidden": 0.484375, "loss/logits": 0.08669602870941162, "loss/reg": 5549.42919921875, "step": 1428 }, { "epoch": 0.035725, "grad_norm": 0.958095908164978, "grad_norm_var": 0.02813351390256747, "learning_rate": 0.0001, "loss": 5537.4263, "loss/crossentropy": 2.706650495529175, "loss/hidden": 0.48828125, "loss/logits": 0.09869828075170517, "loss/reg": 5536.83935546875, "step": 1429 }, { "epoch": 0.03575, "grad_norm": 1.3420166969299316, "grad_norm_var": 0.03371677877437282, "learning_rate": 0.0001, "loss": 5524.8125, "loss/crossentropy": 3.1115193367004395, "loss/hidden": 0.515625, "loss/logits": 0.1058741956949234, "loss/reg": 5524.19091796875, "step": 1430 }, { "epoch": 0.035775, "grad_norm": 0.8543722033500671, "grad_norm_var": 0.0351014260765756, "learning_rate": 0.0001, "loss": 5512.2627, "loss/crossentropy": 2.608214855194092, "loss/hidden": 0.484375, "loss/logits": 0.10194234549999237, "loss/reg": 5511.67626953125, "step": 1431 }, { "epoch": 0.0358, "grad_norm": 0.8250775933265686, "grad_norm_var": 0.03690434825584472, "learning_rate": 0.0001, "loss": 5499.5815, "loss/crossentropy": 2.989871025085449, "loss/hidden": 0.48828125, "loss/logits": 0.09901078790426254, "loss/reg": 5498.994140625, "step": 1432 }, { "epoch": 0.035825, "grad_norm": 0.7822527885437012, "grad_norm_var": 0.040534793707076015, "learning_rate": 0.0001, "loss": 5487.1685, "loss/crossentropy": 2.9314780235290527, "loss/hidden": 0.482421875, "loss/logits": 0.09051553905010223, "loss/reg": 5486.595703125, "step": 1433 }, { "epoch": 0.03585, "grad_norm": 0.8068564534187317, "grad_norm_var": 0.043362734138599854, "learning_rate": 0.0001, "loss": 5474.8149, "loss/crossentropy": 2.6955857276916504, "loss/hidden": 0.51953125, "loss/logits": 0.10059453547000885, "loss/reg": 5474.19482421875, "step": 1434 }, { "epoch": 0.035875, "grad_norm": 1.4666972160339355, "grad_norm_var": 0.05549537286934751, "learning_rate": 0.0001, "loss": 5462.3555, "loss/crossentropy": 2.588399648666382, "loss/hidden": 0.55859375, "loss/logits": 0.09282772243022919, "loss/reg": 5461.7041015625, "step": 1435 }, { "epoch": 0.0359, "grad_norm": 1.117689847946167, "grad_norm_var": 0.0537338345703509, "learning_rate": 0.0001, "loss": 5450.0771, "loss/crossentropy": 2.9459269046783447, "loss/hidden": 0.51171875, "loss/logits": 0.0902022123336792, "loss/reg": 5449.47509765625, "step": 1436 }, { "epoch": 0.035925, "grad_norm": 1.105311632156372, "grad_norm_var": 0.04340371561122809, "learning_rate": 0.0001, "loss": 5437.9082, "loss/crossentropy": 2.984010934829712, "loss/hidden": 0.50390625, "loss/logits": 0.11168159544467926, "loss/reg": 5437.29248046875, "step": 1437 }, { "epoch": 0.03595, "grad_norm": 1.0142920017242432, "grad_norm_var": 0.04337053865574192, "learning_rate": 0.0001, "loss": 5425.6343, "loss/crossentropy": 3.048353910446167, "loss/hidden": 0.4921875, "loss/logits": 0.1168377697467804, "loss/reg": 5425.025390625, "step": 1438 }, { "epoch": 0.035975, "grad_norm": 1.074475646018982, "grad_norm_var": 0.036223558479937745, "learning_rate": 0.0001, "loss": 5413.6694, "loss/crossentropy": 2.865267276763916, "loss/hidden": 0.51171875, "loss/logits": 0.11835680902004242, "loss/reg": 5413.03955078125, "step": 1439 }, { "epoch": 0.036, "grad_norm": 1.2115192413330078, "grad_norm_var": 0.038626513677341955, "learning_rate": 0.0001, "loss": 5401.4404, "loss/crossentropy": 2.965080499649048, "loss/hidden": 0.51953125, "loss/logits": 0.12016730010509491, "loss/reg": 5400.80078125, "step": 1440 }, { "epoch": 0.036025, "grad_norm": 0.9538179636001587, "grad_norm_var": 0.0384582443619114, "learning_rate": 0.0001, "loss": 5389.1182, "loss/crossentropy": 3.0050806999206543, "loss/hidden": 0.490234375, "loss/logits": 0.09753800183534622, "loss/reg": 5388.5302734375, "step": 1441 }, { "epoch": 0.03605, "grad_norm": 0.846208930015564, "grad_norm_var": 0.040094179752653135, "learning_rate": 0.0001, "loss": 5376.7876, "loss/crossentropy": 2.678614854812622, "loss/hidden": 0.490234375, "loss/logits": 0.09301547706127167, "loss/reg": 5376.20458984375, "step": 1442 }, { "epoch": 0.036075, "grad_norm": 0.838445782661438, "grad_norm_var": 0.04007315713849911, "learning_rate": 0.0001, "loss": 5364.6421, "loss/crossentropy": 2.5030295848846436, "loss/hidden": 0.49609375, "loss/logits": 0.0893300473690033, "loss/reg": 5364.056640625, "step": 1443 }, { "epoch": 0.0361, "grad_norm": 1.0943560600280762, "grad_norm_var": 0.04026473399911965, "learning_rate": 0.0001, "loss": 5352.6411, "loss/crossentropy": 2.7688729763031006, "loss/hidden": 0.5234375, "loss/logits": 0.09878958761692047, "loss/reg": 5352.01904296875, "step": 1444 }, { "epoch": 0.036125, "grad_norm": 1.0070030689239502, "grad_norm_var": 0.040022175785339426, "learning_rate": 0.0001, "loss": 5340.7573, "loss/crossentropy": 2.664525270462036, "loss/hidden": 0.5234375, "loss/logits": 0.10403424501419067, "loss/reg": 5340.1298828125, "step": 1445 }, { "epoch": 0.03615, "grad_norm": 0.9361730217933655, "grad_norm_var": 0.03296033870427563, "learning_rate": 0.0001, "loss": 5328.8047, "loss/crossentropy": 3.0316553115844727, "loss/hidden": 0.51953125, "loss/logits": 0.10810248553752899, "loss/reg": 5328.17724609375, "step": 1446 }, { "epoch": 0.036175, "grad_norm": 0.8911219239234924, "grad_norm_var": 0.03235122087723141, "learning_rate": 0.0001, "loss": 5317.0811, "loss/crossentropy": 2.6316604614257812, "loss/hidden": 0.5, "loss/logits": 0.10301603376865387, "loss/reg": 5316.47802734375, "step": 1447 }, { "epoch": 0.0362, "grad_norm": 0.9054492115974426, "grad_norm_var": 0.0308996622053454, "learning_rate": 0.0001, "loss": 5305.5972, "loss/crossentropy": 3.067214012145996, "loss/hidden": 0.5, "loss/logits": 0.10464608669281006, "loss/reg": 5304.99267578125, "step": 1448 }, { "epoch": 0.036225, "grad_norm": 0.7438737750053406, "grad_norm_var": 0.03212250353753062, "learning_rate": 0.0001, "loss": 5294.0088, "loss/crossentropy": 2.698651075363159, "loss/hidden": 0.48828125, "loss/logits": 0.09136485308408737, "loss/reg": 5293.42919921875, "step": 1449 }, { "epoch": 0.03625, "grad_norm": 0.8691712617874146, "grad_norm_var": 0.030753536796646348, "learning_rate": 0.0001, "loss": 5282.9487, "loss/crossentropy": 2.8457107543945312, "loss/hidden": 0.486328125, "loss/logits": 0.10344676673412323, "loss/reg": 5282.35888671875, "step": 1450 }, { "epoch": 0.036275, "grad_norm": 1.5145204067230225, "grad_norm_var": 0.03384220665456894, "learning_rate": 0.0001, "loss": 5271.6914, "loss/crossentropy": 2.5761830806732178, "loss/hidden": 0.5234375, "loss/logits": 0.091462142765522, "loss/reg": 5271.07666015625, "step": 1451 }, { "epoch": 0.0363, "grad_norm": 1.5182558298110962, "grad_norm_var": 0.04974418429261451, "learning_rate": 0.0001, "loss": 5260.1943, "loss/crossentropy": 2.8467907905578613, "loss/hidden": 0.53125, "loss/logits": 0.10642577707767487, "loss/reg": 5259.556640625, "step": 1452 }, { "epoch": 0.036325, "grad_norm": 1.5889036655426025, "grad_norm_var": 0.06903922677874869, "learning_rate": 0.0001, "loss": 5248.917, "loss/crossentropy": 2.746821641921997, "loss/hidden": 0.5234375, "loss/logits": 0.09524772316217422, "loss/reg": 5248.29833984375, "step": 1453 }, { "epoch": 0.03635, "grad_norm": 1.0864880084991455, "grad_norm_var": 0.06889637151009295, "learning_rate": 0.0001, "loss": 5237.8438, "loss/crossentropy": 2.9466421604156494, "loss/hidden": 0.515625, "loss/logits": 0.10183542966842651, "loss/reg": 5237.22607421875, "step": 1454 }, { "epoch": 0.036375, "grad_norm": 0.9537948966026306, "grad_norm_var": 0.0696941507939409, "learning_rate": 0.0001, "loss": 5226.4126, "loss/crossentropy": 2.9261972904205322, "loss/hidden": 0.48828125, "loss/logits": 0.09991321712732315, "loss/reg": 5225.82421875, "step": 1455 }, { "epoch": 0.0364, "grad_norm": 0.8657480478286743, "grad_norm_var": 0.07017845849461607, "learning_rate": 0.0001, "loss": 5214.8638, "loss/crossentropy": 3.091569423675537, "loss/hidden": 0.5078125, "loss/logits": 0.10947701334953308, "loss/reg": 5214.24658203125, "step": 1456 }, { "epoch": 0.036425, "grad_norm": 0.8765159249305725, "grad_norm_var": 0.07142302758301801, "learning_rate": 0.0001, "loss": 5203.5029, "loss/crossentropy": 3.070120334625244, "loss/hidden": 0.490234375, "loss/logits": 0.09942461550235748, "loss/reg": 5202.9130859375, "step": 1457 }, { "epoch": 0.03645, "grad_norm": 0.8967143893241882, "grad_norm_var": 0.07032121058778491, "learning_rate": 0.0001, "loss": 5192.2266, "loss/crossentropy": 2.6533727645874023, "loss/hidden": 0.48828125, "loss/logits": 0.0900174081325531, "loss/reg": 5191.6484375, "step": 1458 }, { "epoch": 0.036475, "grad_norm": 1.0593829154968262, "grad_norm_var": 0.0675330313695351, "learning_rate": 0.0001, "loss": 5180.8418, "loss/crossentropy": 2.7807159423828125, "loss/hidden": 0.5, "loss/logits": 0.09789751470088959, "loss/reg": 5180.244140625, "step": 1459 }, { "epoch": 0.0365, "grad_norm": 3.7894692420959473, "grad_norm_var": 0.5372816782840233, "learning_rate": 0.0001, "loss": 5169.7563, "loss/crossentropy": 2.607316017150879, "loss/hidden": 0.54296875, "loss/logits": 0.11729206144809723, "loss/reg": 5169.09619140625, "step": 1460 }, { "epoch": 0.036525, "grad_norm": 1.3040951490402222, "grad_norm_var": 0.5344039803077333, "learning_rate": 0.0001, "loss": 5157.8638, "loss/crossentropy": 2.8131933212280273, "loss/hidden": 0.50390625, "loss/logits": 0.09452105313539505, "loss/reg": 5157.26513671875, "step": 1461 }, { "epoch": 0.03655, "grad_norm": 8.957572937011719, "grad_norm_var": 4.233578837622493, "learning_rate": 0.0001, "loss": 5146.8818, "loss/crossentropy": 2.7740039825439453, "loss/hidden": 0.5859375, "loss/logits": 0.1216060072183609, "loss/reg": 5146.17431640625, "step": 1462 }, { "epoch": 0.036575, "grad_norm": 1.2484403848648071, "grad_norm_var": 4.201172321158906, "learning_rate": 0.0001, "loss": 5135.6914, "loss/crossentropy": 2.657205104827881, "loss/hidden": 0.494140625, "loss/logits": 0.09523904323577881, "loss/reg": 5135.10205078125, "step": 1463 }, { "epoch": 0.0366, "grad_norm": 0.8138161897659302, "grad_norm_var": 4.212151832844503, "learning_rate": 0.0001, "loss": 5124.5767, "loss/crossentropy": 2.71421480178833, "loss/hidden": 0.48828125, "loss/logits": 0.08790698647499084, "loss/reg": 5124.00048828125, "step": 1464 }, { "epoch": 0.036625, "grad_norm": 0.9401508569717407, "grad_norm_var": 4.188087110269854, "learning_rate": 0.0001, "loss": 5113.332, "loss/crossentropy": 2.528451681137085, "loss/hidden": 0.49609375, "loss/logits": 0.10147030651569366, "loss/reg": 5112.734375, "step": 1465 }, { "epoch": 0.03665, "grad_norm": 0.9211971163749695, "grad_norm_var": 4.182023450957908, "learning_rate": 0.0001, "loss": 5102.2676, "loss/crossentropy": 2.9942121505737305, "loss/hidden": 0.51953125, "loss/logits": 0.0982511043548584, "loss/reg": 5101.64990234375, "step": 1466 }, { "epoch": 0.036675, "grad_norm": 1.070372462272644, "grad_norm_var": 4.209537857366257, "learning_rate": 0.0001, "loss": 5091.1826, "loss/crossentropy": 3.0105254650115967, "loss/hidden": 0.51953125, "loss/logits": 0.10811451077461243, "loss/reg": 5090.55517578125, "step": 1467 }, { "epoch": 0.0367, "grad_norm": 1.1308951377868652, "grad_norm_var": 4.230532903487014, "learning_rate": 0.0001, "loss": 5080.3057, "loss/crossentropy": 2.9224534034729004, "loss/hidden": 0.53125, "loss/logits": 0.1031089499592781, "loss/reg": 5079.67138671875, "step": 1468 }, { "epoch": 0.036725, "grad_norm": 1.8754812479019165, "grad_norm_var": 4.230695854346935, "learning_rate": 0.0001, "loss": 5069.5083, "loss/crossentropy": 2.867027521133423, "loss/hidden": 0.5234375, "loss/logits": 0.10570839047431946, "loss/reg": 5068.87939453125, "step": 1469 }, { "epoch": 0.03675, "grad_norm": 1.141131043434143, "grad_norm_var": 4.22614386003211, "learning_rate": 0.0001, "loss": 5058.5923, "loss/crossentropy": 2.6247942447662354, "loss/hidden": 0.53125, "loss/logits": 0.09891894459724426, "loss/reg": 5057.9619140625, "step": 1470 }, { "epoch": 0.036775, "grad_norm": 1.1037302017211914, "grad_norm_var": 4.211825607138292, "learning_rate": 0.0001, "loss": 5047.3931, "loss/crossentropy": 3.0050926208496094, "loss/hidden": 0.54296875, "loss/logits": 0.12298932671546936, "loss/reg": 5046.72705078125, "step": 1471 }, { "epoch": 0.0368, "grad_norm": 1.3930877447128296, "grad_norm_var": 4.167055799482511, "learning_rate": 0.0001, "loss": 5035.3618, "loss/crossentropy": 2.867047071456909, "loss/hidden": 0.546875, "loss/logits": 0.09695276618003845, "loss/reg": 5034.7177734375, "step": 1472 }, { "epoch": 0.036825, "grad_norm": 0.9566633701324463, "grad_norm_var": 4.157774262940288, "learning_rate": 0.0001, "loss": 5024.1187, "loss/crossentropy": 2.724266767501831, "loss/hidden": 0.5234375, "loss/logits": 0.09489968419075012, "loss/reg": 5023.50048828125, "step": 1473 }, { "epoch": 0.03685, "grad_norm": 1.1047159433364868, "grad_norm_var": 4.135769850467174, "learning_rate": 0.0001, "loss": 5013.0962, "loss/crossentropy": 2.5027365684509277, "loss/hidden": 0.51953125, "loss/logits": 0.09591224044561386, "loss/reg": 5012.48095703125, "step": 1474 }, { "epoch": 0.036875, "grad_norm": 1.4689037799835205, "grad_norm_var": 4.105776990056344, "learning_rate": 0.0001, "loss": 5001.6812, "loss/crossentropy": 2.7909207344055176, "loss/hidden": 0.54296875, "loss/logits": 0.1282854527235031, "loss/reg": 5001.009765625, "step": 1475 }, { "epoch": 0.0369, "grad_norm": 1.0739867687225342, "grad_norm_var": 3.855824374802542, "learning_rate": 0.0001, "loss": 4990.5381, "loss/crossentropy": 2.8297784328460693, "loss/hidden": 0.51171875, "loss/logits": 0.08772643655538559, "loss/reg": 4989.9384765625, "step": 1476 }, { "epoch": 0.036925, "grad_norm": 1.1636221408843994, "grad_norm_var": 3.863658397032923, "learning_rate": 0.0001, "loss": 4979.9316, "loss/crossentropy": 3.1510634422302246, "loss/hidden": 0.54296875, "loss/logits": 0.1621702015399933, "loss/reg": 4979.2265625, "step": 1477 }, { "epoch": 0.03695, "grad_norm": 0.8736411929130554, "grad_norm_var": 0.06906679248819637, "learning_rate": 0.0001, "loss": 4969.0767, "loss/crossentropy": 2.646597146987915, "loss/hidden": 0.5078125, "loss/logits": 0.09061123430728912, "loss/reg": 4968.47802734375, "step": 1478 }, { "epoch": 0.036975, "grad_norm": 1.014277458190918, "grad_norm_var": 0.06918584750399195, "learning_rate": 0.0001, "loss": 4958.5859, "loss/crossentropy": 2.814887762069702, "loss/hidden": 0.50390625, "loss/logits": 0.1120857298374176, "loss/reg": 4957.9697265625, "step": 1479 }, { "epoch": 0.037, "grad_norm": 1.068294882774353, "grad_norm_var": 0.06257783444038457, "learning_rate": 0.0001, "loss": 4947.8711, "loss/crossentropy": 2.810455799102783, "loss/hidden": 0.53515625, "loss/logits": 0.12479346990585327, "loss/reg": 4947.2109375, "step": 1480 }, { "epoch": 0.037025, "grad_norm": 0.9667389392852783, "grad_norm_var": 0.061900209019312945, "learning_rate": 0.0001, "loss": 4937.2061, "loss/crossentropy": 2.9907290935516357, "loss/hidden": 0.5, "loss/logits": 0.09580424427986145, "loss/reg": 4936.6103515625, "step": 1481 }, { "epoch": 0.03705, "grad_norm": 0.806204617023468, "grad_norm_var": 0.06616454229393133, "learning_rate": 0.0001, "loss": 4926.8564, "loss/crossentropy": 2.8602395057678223, "loss/hidden": 0.494140625, "loss/logits": 0.10172467678785324, "loss/reg": 4926.2607421875, "step": 1482 }, { "epoch": 0.037075, "grad_norm": 1.797229290008545, "grad_norm_var": 0.09260782793659808, "learning_rate": 0.0001, "loss": 4916.8042, "loss/crossentropy": 3.3809680938720703, "loss/hidden": 0.5625, "loss/logits": 0.15680980682373047, "loss/reg": 4916.0849609375, "step": 1483 }, { "epoch": 0.0371, "grad_norm": 0.965241014957428, "grad_norm_var": 0.09548839779796195, "learning_rate": 0.0001, "loss": 4906.4863, "loss/crossentropy": 2.7549846172332764, "loss/hidden": 0.5234375, "loss/logits": 0.10236407816410065, "loss/reg": 4905.8603515625, "step": 1484 }, { "epoch": 0.037125, "grad_norm": 0.9277069568634033, "grad_norm_var": 0.06289725839143258, "learning_rate": 0.0001, "loss": 4896.3291, "loss/crossentropy": 2.7426342964172363, "loss/hidden": 0.515625, "loss/logits": 0.10069979727268219, "loss/reg": 4895.712890625, "step": 1485 }, { "epoch": 0.03715, "grad_norm": 1.094347357749939, "grad_norm_var": 0.0628652724996159, "learning_rate": 0.0001, "loss": 4885.9609, "loss/crossentropy": 2.6395647525787354, "loss/hidden": 0.5, "loss/logits": 0.0898917019367218, "loss/reg": 4885.37109375, "step": 1486 }, { "epoch": 0.037175, "grad_norm": 1.3503471612930298, "grad_norm_var": 0.0664225551921736, "learning_rate": 0.0001, "loss": 4875.8916, "loss/crossentropy": 2.944330930709839, "loss/hidden": 0.5390625, "loss/logits": 0.16826876997947693, "loss/reg": 4875.18408203125, "step": 1487 }, { "epoch": 0.0372, "grad_norm": 1.0866377353668213, "grad_norm_var": 0.0614018308856836, "learning_rate": 0.0001, "loss": 4865.8545, "loss/crossentropy": 2.8836276531219482, "loss/hidden": 0.5625, "loss/logits": 0.11985873430967331, "loss/reg": 4865.17236328125, "step": 1488 }, { "epoch": 0.037225, "grad_norm": 1.2594707012176514, "grad_norm_var": 0.06104631059139612, "learning_rate": 0.0001, "loss": 4854.1958, "loss/crossentropy": 3.3286149501800537, "loss/hidden": 0.546875, "loss/logits": 0.1134444922208786, "loss/reg": 4853.53564453125, "step": 1489 }, { "epoch": 0.03725, "grad_norm": 1.0488533973693848, "grad_norm_var": 0.06140237843236241, "learning_rate": 0.0001, "loss": 4843.4141, "loss/crossentropy": 2.967571973800659, "loss/hidden": 0.5234375, "loss/logits": 0.10797536373138428, "loss/reg": 4842.78271484375, "step": 1490 }, { "epoch": 0.037275, "grad_norm": 1.0009063482284546, "grad_norm_var": 0.05349721406061207, "learning_rate": 0.0001, "loss": 4833.2422, "loss/crossentropy": 2.8375730514526367, "loss/hidden": 0.52734375, "loss/logits": 0.10792994499206543, "loss/reg": 4832.60693359375, "step": 1491 }, { "epoch": 0.0373, "grad_norm": 0.8741068840026855, "grad_norm_var": 0.0565167608334434, "learning_rate": 0.0001, "loss": 4823.21, "loss/crossentropy": 2.7234745025634766, "loss/hidden": 0.51171875, "loss/logits": 0.10996709764003754, "loss/reg": 4822.58837890625, "step": 1492 }, { "epoch": 0.037325, "grad_norm": 0.8451005220413208, "grad_norm_var": 0.05935315328759002, "learning_rate": 0.0001, "loss": 4813.2583, "loss/crossentropy": 2.9505717754364014, "loss/hidden": 0.494140625, "loss/logits": 0.10429343581199646, "loss/reg": 4812.65966796875, "step": 1493 }, { "epoch": 0.03735, "grad_norm": 0.8686822652816772, "grad_norm_var": 0.05947869834809385, "learning_rate": 0.0001, "loss": 4803.2378, "loss/crossentropy": 2.901232957839966, "loss/hidden": 0.51953125, "loss/logits": 0.09702814370393753, "loss/reg": 4802.62109375, "step": 1494 }, { "epoch": 0.037375, "grad_norm": 0.7863928079605103, "grad_norm_var": 0.0641405366837444, "learning_rate": 0.0001, "loss": 4793.2183, "loss/crossentropy": 2.769129991531372, "loss/hidden": 0.494140625, "loss/logits": 0.09640781581401825, "loss/reg": 4792.6279296875, "step": 1495 }, { "epoch": 0.0374, "grad_norm": 0.8319947719573975, "grad_norm_var": 0.06694816320778324, "learning_rate": 0.0001, "loss": 4783.624, "loss/crossentropy": 2.732968330383301, "loss/hidden": 0.4921875, "loss/logits": 0.1065090000629425, "loss/reg": 4783.025390625, "step": 1496 }, { "epoch": 0.037425, "grad_norm": 1.0754128694534302, "grad_norm_var": 0.06674251294971753, "learning_rate": 0.0001, "loss": 4774.0024, "loss/crossentropy": 2.5251784324645996, "loss/hidden": 0.5078125, "loss/logits": 0.09990575909614563, "loss/reg": 4773.39453125, "step": 1497 }, { "epoch": 0.03745, "grad_norm": 0.9813482761383057, "grad_norm_var": 0.06323119874060669, "learning_rate": 0.0001, "loss": 4764.0625, "loss/crossentropy": 2.790985584259033, "loss/hidden": 0.52734375, "loss/logits": 0.11107774078845978, "loss/reg": 4763.42431640625, "step": 1498 }, { "epoch": 0.037475, "grad_norm": 0.8058618903160095, "grad_norm_var": 0.025834880792797735, "learning_rate": 0.0001, "loss": 4754.4912, "loss/crossentropy": 2.819187879562378, "loss/hidden": 0.490234375, "loss/logits": 0.10257354378700256, "loss/reg": 4753.8984375, "step": 1499 }, { "epoch": 0.0375, "grad_norm": 0.9984162449836731, "grad_norm_var": 0.02580454190790243, "learning_rate": 0.0001, "loss": 4745.0625, "loss/crossentropy": 2.828315496444702, "loss/hidden": 0.5, "loss/logits": 0.10384377837181091, "loss/reg": 4744.45849609375, "step": 1500 } ], "logging_steps": 1, "max_steps": 40000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.683532251136e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }