diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18018 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.0375, + "eval_steps": 500, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 2.5e-05, + "grad_norm": 0.8990891575813293, + "learning_rate": 1.0000000000000002e-06, + "loss": 352499.0625, + "loss/crossentropy": 2.657993793487549, + "loss/hidden": 0.388671875, + "loss/logits": 0.10807140171527863, + "loss/reg": 352498.5625, + "step": 1 + }, + { + "epoch": 5e-05, + "grad_norm": 1.0061860084533691, + "learning_rate": 2.0000000000000003e-06, + "loss": 352499.0312, + "loss/crossentropy": 2.5393872261047363, + "loss/hidden": 0.375, + "loss/logits": 0.09729088842868805, + "loss/reg": 352498.5625, + "step": 2 + }, + { + "epoch": 7.5e-05, + "grad_norm": 1.1664918661117554, + "learning_rate": 3e-06, + "loss": 352494.0625, + "loss/crossentropy": 2.7846217155456543, + "loss/hidden": 0.41015625, + "loss/logits": 0.12509623169898987, + "loss/reg": 352493.53125, + "step": 3 + }, + { + "epoch": 0.0001, + "grad_norm": 1.007649302482605, + "learning_rate": 4.000000000000001e-06, + "loss": 352486.8125, + "loss/crossentropy": 2.478257656097412, + "loss/hidden": 0.38671875, + "loss/logits": 0.09696321189403534, + "loss/reg": 352486.34375, + "step": 4 + }, + { + "epoch": 0.000125, + "grad_norm": 1.150974988937378, + "learning_rate": 5e-06, + "loss": 352475.5938, + "loss/crossentropy": 2.636676073074341, + "loss/hidden": 0.416015625, + "loss/logits": 0.11413601040840149, + "loss/reg": 352475.0625, + "step": 5 + }, + { + "epoch": 0.00015, + "grad_norm": 1.0984797477722168, + "learning_rate": 6e-06, + "loss": 352463.0, + "loss/crossentropy": 2.4723658561706543, + "loss/hidden": 0.3984375, + "loss/logits": 0.1231672614812851, + "loss/reg": 352462.46875, + "step": 6 + }, + { + "epoch": 0.000175, + "grad_norm": 0.825570285320282, + "learning_rate": 7.000000000000001e-06, + "loss": 352448.3438, + "loss/crossentropy": 2.5369279384613037, + "loss/hidden": 0.40625, + "loss/logits": 0.10759128630161285, + "loss/reg": 352447.84375, + "step": 7 + }, + { + "epoch": 0.0002, + "grad_norm": 1.4584909677505493, + "learning_rate": 8.000000000000001e-06, + "loss": 352431.8125, + "loss/crossentropy": 2.6420071125030518, + "loss/hidden": 0.40234375, + "loss/logits": 0.10714638233184814, + "loss/reg": 352431.3125, + "step": 8 + }, + { + "epoch": 0.000225, + "grad_norm": 0.7993799448013306, + "learning_rate": 9e-06, + "loss": 352412.25, + "loss/crossentropy": 2.300945281982422, + "loss/hidden": 0.37890625, + "loss/logits": 0.0980607345700264, + "loss/reg": 352411.78125, + "step": 9 + }, + { + "epoch": 0.00025, + "grad_norm": 0.8132392168045044, + "learning_rate": 1e-05, + "loss": 352391.4688, + "loss/crossentropy": 2.549692392349243, + "loss/hidden": 0.388671875, + "loss/logits": 0.11239537596702576, + "loss/reg": 352390.96875, + "step": 10 + }, + { + "epoch": 0.000275, + "grad_norm": 0.7587840557098389, + "learning_rate": 1.1000000000000001e-05, + "loss": 352369.1875, + "loss/crossentropy": 2.391806125640869, + "loss/hidden": 0.40234375, + "loss/logits": 0.10249912738800049, + "loss/reg": 352368.6875, + "step": 11 + }, + { + "epoch": 0.0003, + "grad_norm": 0.9837542176246643, + "learning_rate": 1.2e-05, + "loss": 352344.4688, + "loss/crossentropy": 2.783439874649048, + "loss/hidden": 0.3984375, + "loss/logits": 0.11502979695796967, + "loss/reg": 352343.96875, + "step": 12 + }, + { + "epoch": 0.000325, + "grad_norm": 0.7176467776298523, + "learning_rate": 1.3000000000000001e-05, + "loss": 352317.2812, + "loss/crossentropy": 2.546398401260376, + "loss/hidden": 0.376953125, + "loss/logits": 0.10087086260318756, + "loss/reg": 352316.8125, + "step": 13 + }, + { + "epoch": 0.00035, + "grad_norm": 0.6804006695747375, + "learning_rate": 1.4000000000000001e-05, + "loss": 352288.7188, + "loss/crossentropy": 2.2503833770751953, + "loss/hidden": 0.365234375, + "loss/logits": 0.08064158260822296, + "loss/reg": 352288.28125, + "step": 14 + }, + { + "epoch": 0.000375, + "grad_norm": 0.7415475845336914, + "learning_rate": 1.5e-05, + "loss": 352259.4062, + "loss/crossentropy": 2.746664047241211, + "loss/hidden": 0.388671875, + "loss/logits": 0.12547868490219116, + "loss/reg": 352258.90625, + "step": 15 + }, + { + "epoch": 0.0004, + "grad_norm": 0.7663099765777588, + "grad_norm_var": 0.044582702584291416, + "learning_rate": 1.6000000000000003e-05, + "loss": 352227.8125, + "loss/crossentropy": 2.6695168018341064, + "loss/hidden": 0.37109375, + "loss/logits": 0.09486141800880432, + "loss/reg": 352227.34375, + "step": 16 + }, + { + "epoch": 0.000425, + "grad_norm": 0.7551850080490112, + "grad_norm_var": 0.04646286949992676, + "learning_rate": 1.7000000000000003e-05, + "loss": 352193.7812, + "loss/crossentropy": 2.4840545654296875, + "loss/hidden": 0.3671875, + "loss/logits": 0.09224767982959747, + "loss/reg": 352193.3125, + "step": 17 + }, + { + "epoch": 0.00045, + "grad_norm": 0.9829086661338806, + "grad_norm_var": 0.04623120069336625, + "learning_rate": 1.8e-05, + "loss": 352158.9062, + "loss/crossentropy": 2.8658523559570312, + "loss/hidden": 0.396484375, + "loss/logits": 0.10515056550502777, + "loss/reg": 352158.40625, + "step": 18 + }, + { + "epoch": 0.000475, + "grad_norm": 0.7825629711151123, + "grad_norm_var": 0.04278355162789619, + "learning_rate": 1.9e-05, + "loss": 352120.7188, + "loss/crossentropy": 2.770911455154419, + "loss/hidden": 0.357421875, + "loss/logits": 0.08866650611162186, + "loss/reg": 352120.28125, + "step": 19 + }, + { + "epoch": 0.0005, + "grad_norm": 0.7500621676445007, + "grad_norm_var": 0.04306775406588148, + "learning_rate": 2e-05, + "loss": 352080.9688, + "loss/crossentropy": 2.654839277267456, + "loss/hidden": 0.37109375, + "loss/logits": 0.09846656024456024, + "loss/reg": 352080.5, + "step": 20 + }, + { + "epoch": 0.000525, + "grad_norm": 1.1595454216003418, + "grad_norm_var": 0.043383044629798855, + "learning_rate": 2.1e-05, + "loss": 352037.3125, + "loss/crossentropy": 2.4238386154174805, + "loss/hidden": 0.373046875, + "loss/logits": 0.09536640346050262, + "loss/reg": 352036.84375, + "step": 21 + }, + { + "epoch": 0.00055, + "grad_norm": 0.800118625164032, + "grad_norm_var": 0.04024005917018294, + "learning_rate": 2.2000000000000003e-05, + "loss": 351981.875, + "loss/crossentropy": 2.3175930976867676, + "loss/hidden": 0.34375, + "loss/logits": 0.08112605661153793, + "loss/reg": 351981.4375, + "step": 22 + }, + { + "epoch": 0.000575, + "grad_norm": 1.008859395980835, + "grad_norm_var": 0.04147464320553755, + "learning_rate": 2.3000000000000003e-05, + "loss": 351930.5938, + "loss/crossentropy": 2.9110066890716553, + "loss/hidden": 0.396484375, + "loss/logits": 0.10111527144908905, + "loss/reg": 351930.09375, + "step": 23 + }, + { + "epoch": 0.0006, + "grad_norm": 0.6007267832756042, + "grad_norm_var": 0.0204320549079418, + "learning_rate": 2.4e-05, + "loss": 351874.0, + "loss/crossentropy": 2.521523952484131, + "loss/hidden": 0.341796875, + "loss/logits": 0.09783630818128586, + "loss/reg": 351873.5625, + "step": 24 + }, + { + "epoch": 0.000625, + "grad_norm": 0.8861953020095825, + "grad_norm_var": 0.020678149542751458, + "learning_rate": 2.5e-05, + "loss": 351818.5, + "loss/crossentropy": 2.551731586456299, + "loss/hidden": 0.376953125, + "loss/logits": 0.10109081864356995, + "loss/reg": 351818.03125, + "step": 25 + }, + { + "epoch": 0.00065, + "grad_norm": 0.680790364742279, + "grad_norm_var": 0.021968847889962718, + "learning_rate": 2.6000000000000002e-05, + "loss": 351757.6562, + "loss/crossentropy": 2.740166664123535, + "loss/hidden": 0.373046875, + "loss/logits": 0.10807862132787704, + "loss/reg": 351757.1875, + "step": 26 + }, + { + "epoch": 0.000675, + "grad_norm": 0.7792268395423889, + "grad_norm_var": 0.0218391159024706, + "learning_rate": 2.7000000000000002e-05, + "loss": 351695.5312, + "loss/crossentropy": 2.7071239948272705, + "loss/hidden": 0.369140625, + "loss/logits": 0.10798090696334839, + "loss/reg": 351695.0625, + "step": 27 + }, + { + "epoch": 0.0007, + "grad_norm": 0.6987195014953613, + "grad_norm_var": 0.02058860603810307, + "learning_rate": 2.8000000000000003e-05, + "loss": 351627.8438, + "loss/crossentropy": 2.501779317855835, + "loss/hidden": 0.359375, + "loss/logits": 0.09967444092035294, + "loss/reg": 351627.375, + "step": 28 + }, + { + "epoch": 0.000725, + "grad_norm": 0.8250280022621155, + "grad_norm_var": 0.020138411660303213, + "learning_rate": 2.9e-05, + "loss": 351556.5312, + "loss/crossentropy": 2.6017117500305176, + "loss/hidden": 0.353515625, + "loss/logits": 0.09585036337375641, + "loss/reg": 351556.09375, + "step": 29 + }, + { + "epoch": 0.00075, + "grad_norm": 0.7717598080635071, + "grad_norm_var": 0.019128449305659424, + "learning_rate": 3e-05, + "loss": 351480.3125, + "loss/crossentropy": 2.6378984451293945, + "loss/hidden": 0.380859375, + "loss/logits": 0.10243427008390427, + "loss/reg": 351479.84375, + "step": 30 + }, + { + "epoch": 0.000775, + "grad_norm": 0.6966306567192078, + "grad_norm_var": 0.019675560702599747, + "learning_rate": 3.1e-05, + "loss": 351397.9375, + "loss/crossentropy": 2.3649041652679443, + "loss/hidden": 0.357421875, + "loss/logits": 0.09597521275281906, + "loss/reg": 351397.46875, + "step": 31 + }, + { + "epoch": 0.0008, + "grad_norm": 0.664853572845459, + "grad_norm_var": 0.02089692070087943, + "learning_rate": 3.2000000000000005e-05, + "loss": 351315.0312, + "loss/crossentropy": 2.5563435554504395, + "loss/hidden": 0.353515625, + "loss/logits": 0.10334671288728714, + "loss/reg": 351314.5625, + "step": 32 + }, + { + "epoch": 0.000825, + "grad_norm": 0.7842820882797241, + "grad_norm_var": 0.02076550256052436, + "learning_rate": 3.3e-05, + "loss": 351228.1562, + "loss/crossentropy": 2.8158535957336426, + "loss/hidden": 0.34375, + "loss/logits": 0.09285588562488556, + "loss/reg": 351227.71875, + "step": 33 + }, + { + "epoch": 0.00085, + "grad_norm": 0.6544437408447266, + "grad_norm_var": 0.01969585228427011, + "learning_rate": 3.4000000000000007e-05, + "loss": 351137.2812, + "loss/crossentropy": 2.7246992588043213, + "loss/hidden": 0.369140625, + "loss/logits": 0.10069996118545532, + "loss/reg": 351136.8125, + "step": 34 + }, + { + "epoch": 0.000875, + "grad_norm": 0.6708226203918457, + "grad_norm_var": 0.02049744993643774, + "learning_rate": 3.5e-05, + "loss": 351042.1562, + "loss/crossentropy": 2.273859739303589, + "loss/hidden": 0.34375, + "loss/logits": 0.10322189331054688, + "loss/reg": 351041.71875, + "step": 35 + }, + { + "epoch": 0.0009, + "grad_norm": 0.9052221179008484, + "grad_norm_var": 0.021444739470696975, + "learning_rate": 3.6e-05, + "loss": 350939.875, + "loss/crossentropy": 2.6857171058654785, + "loss/hidden": 0.37109375, + "loss/logits": 0.10182373225688934, + "loss/reg": 350939.40625, + "step": 36 + }, + { + "epoch": 0.000925, + "grad_norm": 0.9864970445632935, + "grad_norm_var": 0.014713678080535903, + "learning_rate": 3.7e-05, + "loss": 350830.5, + "loss/crossentropy": 2.2562193870544434, + "loss/hidden": 0.37109375, + "loss/logits": 0.09030164778232574, + "loss/reg": 350830.03125, + "step": 37 + }, + { + "epoch": 0.00095, + "grad_norm": 0.8805357217788696, + "grad_norm_var": 0.015377688589989431, + "learning_rate": 3.8e-05, + "loss": 350718.7812, + "loss/crossentropy": 2.299184560775757, + "loss/hidden": 0.359375, + "loss/logits": 0.10313265025615692, + "loss/reg": 350718.3125, + "step": 38 + }, + { + "epoch": 0.000975, + "grad_norm": 0.786675751209259, + "grad_norm_var": 0.01171021529588548, + "learning_rate": 3.9000000000000006e-05, + "loss": 350606.125, + "loss/crossentropy": 2.5615806579589844, + "loss/hidden": 0.376953125, + "loss/logits": 0.10579188168048859, + "loss/reg": 350605.65625, + "step": 39 + }, + { + "epoch": 0.001, + "grad_norm": 0.6990974545478821, + "grad_norm_var": 0.010133824182919622, + "learning_rate": 4e-05, + "loss": 350486.5, + "loss/crossentropy": 2.393359899520874, + "loss/hidden": 0.326171875, + "loss/logits": 0.08165943622589111, + "loss/reg": 350486.09375, + "step": 40 + }, + { + "epoch": 0.001025, + "grad_norm": 0.7575247287750244, + "grad_norm_var": 0.00922957570391792, + "learning_rate": 4.1e-05, + "loss": 350362.9375, + "loss/crossentropy": 2.7492964267730713, + "loss/hidden": 0.35546875, + "loss/logits": 0.09948307275772095, + "loss/reg": 350362.46875, + "step": 41 + }, + { + "epoch": 0.00105, + "grad_norm": 0.7236489653587341, + "grad_norm_var": 0.008862411546386539, + "learning_rate": 4.2e-05, + "loss": 350231.0625, + "loss/crossentropy": 2.4042203426361084, + "loss/hidden": 0.34765625, + "loss/logits": 0.10723390430212021, + "loss/reg": 350230.59375, + "step": 42 + }, + { + "epoch": 0.001075, + "grad_norm": 0.6667615175247192, + "grad_norm_var": 0.009481747856356634, + "learning_rate": 4.3e-05, + "loss": 350095.2812, + "loss/crossentropy": 2.657647132873535, + "loss/hidden": 0.35546875, + "loss/logits": 0.0933607816696167, + "loss/reg": 350094.84375, + "step": 43 + }, + { + "epoch": 0.0011, + "grad_norm": 0.6898338198661804, + "grad_norm_var": 0.009560210933929624, + "learning_rate": 4.4000000000000006e-05, + "loss": 349956.0, + "loss/crossentropy": 2.505267381668091, + "loss/hidden": 0.33203125, + "loss/logits": 0.09544478356838226, + "loss/reg": 349955.5625, + "step": 44 + }, + { + "epoch": 0.001125, + "grad_norm": 0.7395535111427307, + "grad_norm_var": 0.009278307698633626, + "learning_rate": 4.5e-05, + "loss": 349811.4375, + "loss/crossentropy": 2.551318645477295, + "loss/hidden": 0.3515625, + "loss/logits": 0.1010345071554184, + "loss/reg": 349811.0, + "step": 45 + }, + { + "epoch": 0.00115, + "grad_norm": 0.766717255115509, + "grad_norm_var": 0.009268550582984859, + "learning_rate": 4.600000000000001e-05, + "loss": 349662.0, + "loss/crossentropy": 2.4719483852386475, + "loss/hidden": 0.349609375, + "loss/logits": 0.096840038895607, + "loss/reg": 349661.5625, + "step": 46 + }, + { + "epoch": 0.001175, + "grad_norm": 0.7225788831710815, + "grad_norm_var": 0.00911018026195678, + "learning_rate": 4.7e-05, + "loss": 349505.25, + "loss/crossentropy": 2.4699411392211914, + "loss/hidden": 0.34375, + "loss/logits": 0.09806468337774277, + "loss/reg": 349504.8125, + "step": 47 + }, + { + "epoch": 0.0012, + "grad_norm": 0.8643568158149719, + "grad_norm_var": 0.009168173684230396, + "learning_rate": 4.8e-05, + "loss": 349342.75, + "loss/crossentropy": 2.470043182373047, + "loss/hidden": 0.328125, + "loss/logits": 0.1013718917965889, + "loss/reg": 349342.3125, + "step": 48 + }, + { + "epoch": 0.001225, + "grad_norm": 0.7516047954559326, + "grad_norm_var": 0.009166844350170263, + "learning_rate": 4.9e-05, + "loss": 349173.875, + "loss/crossentropy": 2.684551239013672, + "loss/hidden": 0.3359375, + "loss/logits": 0.10089194774627686, + "loss/reg": 349173.4375, + "step": 49 + }, + { + "epoch": 0.00125, + "grad_norm": 0.7821515202522278, + "grad_norm_var": 0.008276118193317084, + "learning_rate": 5e-05, + "loss": 348998.3438, + "loss/crossentropy": 2.414154052734375, + "loss/hidden": 0.357421875, + "loss/logits": 0.09722993522882462, + "loss/reg": 348997.875, + "step": 50 + }, + { + "epoch": 0.001275, + "grad_norm": 0.6895391941070557, + "grad_norm_var": 0.008039034372423481, + "learning_rate": 5.1000000000000006e-05, + "loss": 348816.4062, + "loss/crossentropy": 2.731753349304199, + "loss/hidden": 0.330078125, + "loss/logits": 0.09635522961616516, + "loss/reg": 348815.96875, + "step": 51 + }, + { + "epoch": 0.0013, + "grad_norm": 0.7811358571052551, + "grad_norm_var": 0.006859586362939026, + "learning_rate": 5.2000000000000004e-05, + "loss": 348628.8438, + "loss/crossentropy": 2.4768221378326416, + "loss/hidden": 0.34375, + "loss/logits": 0.10576320439577103, + "loss/reg": 348628.40625, + "step": 52 + }, + { + "epoch": 0.001325, + "grad_norm": 0.7297627329826355, + "grad_norm_var": 0.003500148277329096, + "learning_rate": 5.300000000000001e-05, + "loss": 348433.0312, + "loss/crossentropy": 2.5304179191589355, + "loss/hidden": 0.3359375, + "loss/logits": 0.0967152938246727, + "loss/reg": 348432.59375, + "step": 53 + }, + { + "epoch": 0.00135, + "grad_norm": 0.7394194006919861, + "grad_norm_var": 0.002325683451179339, + "learning_rate": 5.4000000000000005e-05, + "loss": 348235.5312, + "loss/crossentropy": 2.220980644226074, + "loss/hidden": 0.330078125, + "loss/logits": 0.0968293845653534, + "loss/reg": 348235.09375, + "step": 54 + }, + { + "epoch": 0.001375, + "grad_norm": 0.9957661628723145, + "grad_norm_var": 0.0062716166232480814, + "learning_rate": 5.500000000000001e-05, + "loss": 348031.875, + "loss/crossentropy": 2.7074618339538574, + "loss/hidden": 0.36328125, + "loss/logits": 0.11125057935714722, + "loss/reg": 348031.40625, + "step": 55 + }, + { + "epoch": 0.0014, + "grad_norm": 0.7879313230514526, + "grad_norm_var": 0.006088293627237211, + "learning_rate": 5.6000000000000006e-05, + "loss": 347816.1562, + "loss/crossentropy": 2.515113592147827, + "loss/hidden": 0.33203125, + "loss/logits": 0.07407157868146896, + "loss/reg": 347815.75, + "step": 56 + }, + { + "epoch": 0.001425, + "grad_norm": 0.6987502574920654, + "grad_norm_var": 0.006337448092067281, + "learning_rate": 5.6999999999999996e-05, + "loss": 347603.8125, + "loss/crossentropy": 2.336862325668335, + "loss/hidden": 0.33203125, + "loss/logits": 0.10240699350833893, + "loss/reg": 347603.375, + "step": 57 + }, + { + "epoch": 0.00145, + "grad_norm": 0.7523587942123413, + "grad_norm_var": 0.006257107260669119, + "learning_rate": 5.8e-05, + "loss": 347378.0, + "loss/crossentropy": 2.4298152923583984, + "loss/hidden": 0.34375, + "loss/logits": 0.10168775916099548, + "loss/reg": 347377.5625, + "step": 58 + }, + { + "epoch": 0.001475, + "grad_norm": 0.9260439872741699, + "grad_norm_var": 0.007239314549827706, + "learning_rate": 5.9e-05, + "loss": 347145.7812, + "loss/crossentropy": 2.2897913455963135, + "loss/hidden": 0.353515625, + "loss/logits": 0.1300913393497467, + "loss/reg": 347145.3125, + "step": 59 + }, + { + "epoch": 0.0015, + "grad_norm": 0.9719749689102173, + "grad_norm_var": 0.008969534389899995, + "learning_rate": 6e-05, + "loss": 346908.125, + "loss/crossentropy": 2.8708131313323975, + "loss/hidden": 0.3515625, + "loss/logits": 0.09690976142883301, + "loss/reg": 346907.6875, + "step": 60 + }, + { + "epoch": 0.001525, + "grad_norm": 0.8373459577560425, + "grad_norm_var": 0.008860865848953618, + "learning_rate": 6.1e-05, + "loss": 346664.3125, + "loss/crossentropy": 2.327406644821167, + "loss/hidden": 0.33984375, + "loss/logits": 0.09570501744747162, + "loss/reg": 346663.875, + "step": 61 + }, + { + "epoch": 0.00155, + "grad_norm": 1.3656246662139893, + "grad_norm_var": 0.028634012422057358, + "learning_rate": 6.2e-05, + "loss": 346416.75, + "loss/crossentropy": 2.711550712585449, + "loss/hidden": 0.369140625, + "loss/logits": 0.09897757321596146, + "loss/reg": 346416.28125, + "step": 62 + }, + { + "epoch": 0.001575, + "grad_norm": 0.897807776927948, + "grad_norm_var": 0.027873421941307677, + "learning_rate": 6.3e-05, + "loss": 346158.4688, + "loss/crossentropy": 2.5601446628570557, + "loss/hidden": 0.365234375, + "loss/logits": 0.10523329675197601, + "loss/reg": 346158.0, + "step": 63 + }, + { + "epoch": 0.0016, + "grad_norm": 0.8370752334594727, + "grad_norm_var": 0.027861253679391126, + "learning_rate": 6.400000000000001e-05, + "loss": 345896.7812, + "loss/crossentropy": 2.370269536972046, + "loss/hidden": 0.3515625, + "loss/logits": 0.11418884992599487, + "loss/reg": 345896.3125, + "step": 64 + }, + { + "epoch": 0.001625, + "grad_norm": 0.9750260710716248, + "grad_norm_var": 0.02815364448027721, + "learning_rate": 6.500000000000001e-05, + "loss": 345627.4375, + "loss/crossentropy": 2.6104238033294678, + "loss/hidden": 0.359375, + "loss/logits": 0.10441127419471741, + "loss/reg": 345626.96875, + "step": 65 + }, + { + "epoch": 0.00165, + "grad_norm": 0.8502708077430725, + "grad_norm_var": 0.027732216036353386, + "learning_rate": 6.6e-05, + "loss": 345353.1562, + "loss/crossentropy": 2.0436782836914062, + "loss/hidden": 0.337890625, + "loss/logits": 0.10739766061306, + "loss/reg": 345352.71875, + "step": 66 + }, + { + "epoch": 0.001675, + "grad_norm": 0.9714474081993103, + "grad_norm_var": 0.026113840930633226, + "learning_rate": 6.7e-05, + "loss": 345073.3438, + "loss/crossentropy": 2.534754753112793, + "loss/hidden": 0.36328125, + "loss/logits": 0.09999954700469971, + "loss/reg": 345072.875, + "step": 67 + }, + { + "epoch": 0.0017, + "grad_norm": 0.9867809414863586, + "grad_norm_var": 0.02598149193676691, + "learning_rate": 6.800000000000001e-05, + "loss": 344781.3125, + "loss/crossentropy": 2.693114995956421, + "loss/hidden": 0.357421875, + "loss/logits": 0.10880092531442642, + "loss/reg": 344780.84375, + "step": 68 + }, + { + "epoch": 0.001725, + "grad_norm": 1.0025056600570679, + "grad_norm_var": 0.024614116160431364, + "learning_rate": 6.9e-05, + "loss": 344485.0312, + "loss/crossentropy": 2.3547627925872803, + "loss/hidden": 0.345703125, + "loss/logits": 0.12061820924282074, + "loss/reg": 344484.5625, + "step": 69 + }, + { + "epoch": 0.00175, + "grad_norm": 0.9661253690719604, + "grad_norm_var": 0.02260186596820497, + "learning_rate": 7e-05, + "loss": 344183.5312, + "loss/crossentropy": 2.389979124069214, + "loss/hidden": 0.365234375, + "loss/logits": 0.11400818079710007, + "loss/reg": 344183.0625, + "step": 70 + }, + { + "epoch": 0.001775, + "grad_norm": 0.8837181329727173, + "grad_norm_var": 0.022350632813116045, + "learning_rate": 7.1e-05, + "loss": 343858.7812, + "loss/crossentropy": 2.6821513175964355, + "loss/hidden": 0.35546875, + "loss/logits": 0.09980843961238861, + "loss/reg": 343858.3125, + "step": 71 + }, + { + "epoch": 0.0018, + "grad_norm": 0.943979799747467, + "grad_norm_var": 0.021136676300270333, + "learning_rate": 7.2e-05, + "loss": 343543.0938, + "loss/crossentropy": 2.401181697845459, + "loss/hidden": 0.3359375, + "loss/logits": 0.08882458508014679, + "loss/reg": 343542.65625, + "step": 72 + }, + { + "epoch": 0.001825, + "grad_norm": 0.8156776428222656, + "grad_norm_var": 0.018398747204270666, + "learning_rate": 7.3e-05, + "loss": 343215.3125, + "loss/crossentropy": 2.5160043239593506, + "loss/hidden": 0.333984375, + "loss/logits": 0.10164624452590942, + "loss/reg": 343214.875, + "step": 73 + }, + { + "epoch": 0.00185, + "grad_norm": 0.9953745603561401, + "grad_norm_var": 0.016123705970816124, + "learning_rate": 7.4e-05, + "loss": 342870.4688, + "loss/crossentropy": 2.549926280975342, + "loss/hidden": 0.357421875, + "loss/logits": 0.10379678010940552, + "loss/reg": 342870.0, + "step": 74 + }, + { + "epoch": 0.001875, + "grad_norm": 0.9699572920799255, + "grad_norm_var": 0.016094164977751277, + "learning_rate": 7.500000000000001e-05, + "loss": 342530.2812, + "loss/crossentropy": 2.412536144256592, + "loss/hidden": 0.35546875, + "loss/logits": 0.10453817248344421, + "loss/reg": 342529.8125, + "step": 75 + }, + { + "epoch": 0.0019, + "grad_norm": 1.2801817655563354, + "grad_norm_var": 0.022752608592142905, + "learning_rate": 7.6e-05, + "loss": 342192.3438, + "loss/crossentropy": 2.7425031661987305, + "loss/hidden": 0.37109375, + "loss/logits": 0.14211513102054596, + "loss/reg": 342191.84375, + "step": 76 + }, + { + "epoch": 0.001925, + "grad_norm": 0.8839605450630188, + "grad_norm_var": 0.022041054647295215, + "learning_rate": 7.7e-05, + "loss": 341842.0, + "loss/crossentropy": 2.3904454708099365, + "loss/hidden": 0.357421875, + "loss/logits": 0.10721279680728912, + "loss/reg": 341841.53125, + "step": 77 + }, + { + "epoch": 0.00195, + "grad_norm": 1.2837376594543457, + "grad_norm_var": 0.018212612959343667, + "learning_rate": 7.800000000000001e-05, + "loss": 341472.5625, + "loss/crossentropy": 2.8702011108398438, + "loss/hidden": 0.36328125, + "loss/logits": 0.11912151426076889, + "loss/reg": 341472.09375, + "step": 78 + }, + { + "epoch": 0.001975, + "grad_norm": 1.0258175134658813, + "grad_norm_var": 0.017979390728313744, + "learning_rate": 7.900000000000001e-05, + "loss": 341086.625, + "loss/crossentropy": 2.470363140106201, + "loss/hidden": 0.34765625, + "loss/logits": 0.09680184721946716, + "loss/reg": 341086.1875, + "step": 79 + }, + { + "epoch": 0.002, + "grad_norm": 0.8823785781860352, + "grad_norm_var": 0.017247493455519333, + "learning_rate": 8e-05, + "loss": 340713.0938, + "loss/crossentropy": 2.2525322437286377, + "loss/hidden": 0.333984375, + "loss/logits": 0.10186257213354111, + "loss/reg": 340712.65625, + "step": 80 + }, + { + "epoch": 0.002025, + "grad_norm": 0.8656690120697021, + "grad_norm_var": 0.018101116935594005, + "learning_rate": 8.1e-05, + "loss": 340332.9688, + "loss/crossentropy": 2.413992404937744, + "loss/hidden": 0.3671875, + "loss/logits": 0.12494418025016785, + "loss/reg": 340332.46875, + "step": 81 + }, + { + "epoch": 0.00205, + "grad_norm": 1.462515115737915, + "grad_norm_var": 0.03130815801283344, + "learning_rate": 8.2e-05, + "loss": 339945.3125, + "loss/crossentropy": 2.6302881240844727, + "loss/hidden": 0.353515625, + "loss/logits": 0.12398800253868103, + "loss/reg": 339944.84375, + "step": 82 + }, + { + "epoch": 0.002075, + "grad_norm": 0.816981315612793, + "grad_norm_var": 0.03367041332098678, + "learning_rate": 8.3e-05, + "loss": 339556.2812, + "loss/crossentropy": 2.577521562576294, + "loss/hidden": 0.3359375, + "loss/logits": 0.10806369036436081, + "loss/reg": 339555.84375, + "step": 83 + }, + { + "epoch": 0.0021, + "grad_norm": 1.0367132425308228, + "grad_norm_var": 0.03371103591838046, + "learning_rate": 8.4e-05, + "loss": 339152.4688, + "loss/crossentropy": 2.751410484313965, + "loss/hidden": 0.37109375, + "loss/logits": 0.13619256019592285, + "loss/reg": 339151.96875, + "step": 84 + }, + { + "epoch": 0.002125, + "grad_norm": 0.9986303448677063, + "grad_norm_var": 0.03371440316385298, + "learning_rate": 8.5e-05, + "loss": 338737.25, + "loss/crossentropy": 2.4450559616088867, + "loss/hidden": 0.35546875, + "loss/logits": 0.10456883907318115, + "loss/reg": 338736.78125, + "step": 85 + }, + { + "epoch": 0.00215, + "grad_norm": 1.2683286666870117, + "grad_norm_var": 0.03777680309527393, + "learning_rate": 8.6e-05, + "loss": 338297.5625, + "loss/crossentropy": 2.4777944087982178, + "loss/hidden": 0.380859375, + "loss/logits": 0.11096605658531189, + "loss/reg": 338297.0625, + "step": 86 + }, + { + "epoch": 0.002175, + "grad_norm": 0.8816508054733276, + "grad_norm_var": 0.037816248324434316, + "learning_rate": 8.7e-05, + "loss": 337870.1875, + "loss/crossentropy": 2.3346803188323975, + "loss/hidden": 0.341796875, + "loss/logits": 0.10044071823358536, + "loss/reg": 337869.75, + "step": 87 + }, + { + "epoch": 0.0022, + "grad_norm": 0.974234938621521, + "grad_norm_var": 0.037543709145419144, + "learning_rate": 8.800000000000001e-05, + "loss": 337436.875, + "loss/crossentropy": 2.5214929580688477, + "loss/hidden": 0.349609375, + "loss/logits": 0.10045409202575684, + "loss/reg": 337436.4375, + "step": 88 + }, + { + "epoch": 0.002225, + "grad_norm": 1.0935665369033813, + "grad_norm_var": 0.0345174990707664, + "learning_rate": 8.900000000000001e-05, + "loss": 336995.4688, + "loss/crossentropy": 2.659154176712036, + "loss/hidden": 0.36328125, + "loss/logits": 0.12203093618154526, + "loss/reg": 336994.96875, + "step": 89 + }, + { + "epoch": 0.00225, + "grad_norm": 0.8209539651870728, + "grad_norm_var": 0.03757256211988141, + "learning_rate": 9e-05, + "loss": 336544.0312, + "loss/crossentropy": 2.3388657569885254, + "loss/hidden": 0.33984375, + "loss/logits": 0.0983554944396019, + "loss/reg": 336543.59375, + "step": 90 + }, + { + "epoch": 0.002275, + "grad_norm": 1.0395029783248901, + "grad_norm_var": 0.037280256977323664, + "learning_rate": 9.1e-05, + "loss": 336100.2812, + "loss/crossentropy": 2.104595184326172, + "loss/hidden": 0.35546875, + "loss/logits": 0.10804722458124161, + "loss/reg": 336099.8125, + "step": 91 + }, + { + "epoch": 0.0023, + "grad_norm": 0.9130696654319763, + "grad_norm_var": 0.033869955312990745, + "learning_rate": 9.200000000000001e-05, + "loss": 335647.1562, + "loss/crossentropy": 2.1718626022338867, + "loss/hidden": 0.341796875, + "loss/logits": 0.11622267961502075, + "loss/reg": 335646.6875, + "step": 92 + }, + { + "epoch": 0.002325, + "grad_norm": 0.9693914651870728, + "grad_norm_var": 0.03282797555167226, + "learning_rate": 9.300000000000001e-05, + "loss": 335189.0938, + "loss/crossentropy": 2.4374241828918457, + "loss/hidden": 0.353515625, + "loss/logits": 0.1102052628993988, + "loss/reg": 335188.625, + "step": 93 + }, + { + "epoch": 0.00235, + "grad_norm": 1.4234834909439087, + "grad_norm_var": 0.038947392506656735, + "learning_rate": 9.4e-05, + "loss": 334715.6875, + "loss/crossentropy": 2.4830873012542725, + "loss/hidden": 0.361328125, + "loss/logits": 0.10641457140445709, + "loss/reg": 334715.21875, + "step": 94 + }, + { + "epoch": 0.002375, + "grad_norm": 1.064066767692566, + "grad_norm_var": 0.03901976710384876, + "learning_rate": 9.5e-05, + "loss": 334240.6562, + "loss/crossentropy": 2.2354929447174072, + "loss/hidden": 0.337890625, + "loss/logits": 0.10279420763254166, + "loss/reg": 334240.21875, + "step": 95 + }, + { + "epoch": 0.0024, + "grad_norm": 2.2034084796905518, + "grad_norm_var": 0.12174535248636993, + "learning_rate": 9.6e-05, + "loss": 333756.1875, + "loss/crossentropy": 3.0666353702545166, + "loss/hidden": 0.43359375, + "loss/logits": 0.1290225386619568, + "loss/reg": 333755.625, + "step": 96 + }, + { + "epoch": 0.002425, + "grad_norm": 1.1879992485046387, + "grad_norm_var": 0.11754435314787527, + "learning_rate": 9.7e-05, + "loss": 333257.75, + "loss/crossentropy": 2.5517079830169678, + "loss/hidden": 0.376953125, + "loss/logits": 0.12321121245622635, + "loss/reg": 333257.25, + "step": 97 + }, + { + "epoch": 0.00245, + "grad_norm": 1.6118693351745605, + "grad_norm_var": 0.1254674718569584, + "learning_rate": 9.8e-05, + "loss": 332759.9688, + "loss/crossentropy": 2.5265674591064453, + "loss/hidden": 0.384765625, + "loss/logits": 0.12099769711494446, + "loss/reg": 332759.46875, + "step": 98 + }, + { + "epoch": 0.002475, + "grad_norm": 2.0734384059906006, + "grad_norm_var": 0.16935214632967607, + "learning_rate": 9.900000000000001e-05, + "loss": 332225.625, + "loss/crossentropy": 3.1464080810546875, + "loss/hidden": 0.416015625, + "loss/logits": 0.1275343894958496, + "loss/reg": 332225.09375, + "step": 99 + }, + { + "epoch": 0.0025, + "grad_norm": 1.3410392999649048, + "grad_norm_var": 0.16760112767482122, + "learning_rate": 0.0001, + "loss": 331675.25, + "loss/crossentropy": 2.381397247314453, + "loss/hidden": 0.359375, + "loss/logits": 0.11763107776641846, + "loss/reg": 331674.78125, + "step": 100 + }, + { + "epoch": 0.002525, + "grad_norm": 1.091958999633789, + "grad_norm_var": 0.1651227977177586, + "learning_rate": 0.0001, + "loss": 331125.6875, + "loss/crossentropy": 2.41511869430542, + "loss/hidden": 0.361328125, + "loss/logits": 0.11357417702674866, + "loss/reg": 331125.21875, + "step": 101 + }, + { + "epoch": 0.00255, + "grad_norm": 1.239707112312317, + "grad_norm_var": 0.16509402495576694, + "learning_rate": 0.0001, + "loss": 330589.7188, + "loss/crossentropy": 2.671584367752075, + "loss/hidden": 0.37109375, + "loss/logits": 0.11257486045360565, + "loss/reg": 330589.25, + "step": 102 + }, + { + "epoch": 0.002575, + "grad_norm": 1.2163573503494263, + "grad_norm_var": 0.15585436844416117, + "learning_rate": 0.0001, + "loss": 330047.3438, + "loss/crossentropy": 2.4153730869293213, + "loss/hidden": 0.3671875, + "loss/logits": 0.11950643360614777, + "loss/reg": 330046.84375, + "step": 103 + }, + { + "epoch": 0.0026, + "grad_norm": 1.0298452377319336, + "grad_norm_var": 0.15388056810262113, + "learning_rate": 0.0001, + "loss": 329506.1562, + "loss/crossentropy": 2.2123711109161377, + "loss/hidden": 0.3515625, + "loss/logits": 0.10439261794090271, + "loss/reg": 329505.6875, + "step": 104 + }, + { + "epoch": 0.002625, + "grad_norm": 1.1107213497161865, + "grad_norm_var": 0.15349545218275876, + "learning_rate": 0.0001, + "loss": 328957.9688, + "loss/crossentropy": 2.4851179122924805, + "loss/hidden": 0.3671875, + "loss/logits": 0.10356856882572174, + "loss/reg": 328957.5, + "step": 105 + }, + { + "epoch": 0.00265, + "grad_norm": 1.1000527143478394, + "grad_norm_var": 0.14161442987122588, + "learning_rate": 0.0001, + "loss": 328412.3125, + "loss/crossentropy": 2.6760635375976562, + "loss/hidden": 0.353515625, + "loss/logits": 0.10187040269374847, + "loss/reg": 328411.84375, + "step": 106 + }, + { + "epoch": 0.002675, + "grad_norm": 0.9892934560775757, + "grad_norm_var": 0.14343889146716268, + "learning_rate": 0.0001, + "loss": 327867.8438, + "loss/crossentropy": 2.3127148151397705, + "loss/hidden": 0.357421875, + "loss/logits": 0.12052441388368607, + "loss/reg": 327867.375, + "step": 107 + }, + { + "epoch": 0.0027, + "grad_norm": 1.6295005083084106, + "grad_norm_var": 0.13995610376130016, + "learning_rate": 0.0001, + "loss": 327327.125, + "loss/crossentropy": 2.7045395374298096, + "loss/hidden": 0.40234375, + "loss/logits": 0.1712554693222046, + "loss/reg": 327326.5625, + "step": 108 + }, + { + "epoch": 0.002725, + "grad_norm": 1.1677104234695435, + "grad_norm_var": 0.13287532765518548, + "learning_rate": 0.0001, + "loss": 326764.9688, + "loss/crossentropy": 3.237940549850464, + "loss/hidden": 0.376953125, + "loss/logits": 0.11507049202919006, + "loss/reg": 326764.46875, + "step": 109 + }, + { + "epoch": 0.00275, + "grad_norm": 1.0809372663497925, + "grad_norm_var": 0.13651149296486545, + "learning_rate": 0.0001, + "loss": 326164.9688, + "loss/crossentropy": 2.2961301803588867, + "loss/hidden": 0.3671875, + "loss/logits": 0.12248177826404572, + "loss/reg": 326164.46875, + "step": 110 + }, + { + "epoch": 0.002775, + "grad_norm": 0.9283381104469299, + "grad_norm_var": 0.14231480086240134, + "learning_rate": 0.0001, + "loss": 325608.25, + "loss/crossentropy": 2.353752613067627, + "loss/hidden": 0.333984375, + "loss/logits": 0.09742879867553711, + "loss/reg": 325607.8125, + "step": 111 + }, + { + "epoch": 0.0028, + "grad_norm": 3.4329140186309814, + "grad_norm_var": 0.3828229891507322, + "learning_rate": 0.0001, + "loss": 325060.25, + "loss/crossentropy": 2.472973108291626, + "loss/hidden": 0.3671875, + "loss/logits": 0.10647609084844589, + "loss/reg": 325059.78125, + "step": 112 + }, + { + "epoch": 0.002825, + "grad_norm": 1.4601819515228271, + "grad_norm_var": 0.3801412532983319, + "learning_rate": 0.0001, + "loss": 324513.25, + "loss/crossentropy": 2.605860471725464, + "loss/hidden": 0.37109375, + "loss/logits": 0.09949396550655365, + "loss/reg": 324512.78125, + "step": 113 + }, + { + "epoch": 0.00285, + "grad_norm": 0.9892399907112122, + "grad_norm_var": 0.3873205653246254, + "learning_rate": 0.0001, + "loss": 323942.9375, + "loss/crossentropy": 2.413478374481201, + "loss/hidden": 0.37109375, + "loss/logits": 0.11860661953687668, + "loss/reg": 323942.4375, + "step": 114 + }, + { + "epoch": 0.002875, + "grad_norm": 0.990548849105835, + "grad_norm_var": 0.3586952225239992, + "learning_rate": 0.0001, + "loss": 323375.625, + "loss/crossentropy": 2.3325929641723633, + "loss/hidden": 0.369140625, + "loss/logits": 0.112250916659832, + "loss/reg": 323375.15625, + "step": 115 + }, + { + "epoch": 0.0029, + "grad_norm": 0.9300064444541931, + "grad_norm_var": 0.3669996764077588, + "learning_rate": 0.0001, + "loss": 322812.5938, + "loss/crossentropy": 2.345551013946533, + "loss/hidden": 0.353515625, + "loss/logits": 0.11626957356929779, + "loss/reg": 322812.125, + "step": 116 + }, + { + "epoch": 0.002925, + "grad_norm": 0.8991410732269287, + "grad_norm_var": 0.3740087755224611, + "learning_rate": 0.0001, + "loss": 322255.1875, + "loss/crossentropy": 2.345388889312744, + "loss/hidden": 0.359375, + "loss/logits": 0.12949004769325256, + "loss/reg": 322254.6875, + "step": 117 + }, + { + "epoch": 0.00295, + "grad_norm": 1.031494379043579, + "grad_norm_var": 0.37734152840261964, + "learning_rate": 0.0001, + "loss": 321703.5, + "loss/crossentropy": 2.427797555923462, + "loss/hidden": 0.3671875, + "loss/logits": 0.1204058825969696, + "loss/reg": 321703.0, + "step": 118 + }, + { + "epoch": 0.002975, + "grad_norm": 0.8741577863693237, + "grad_norm_var": 0.38615619634775694, + "learning_rate": 0.0001, + "loss": 321156.4688, + "loss/crossentropy": 2.613999605178833, + "loss/hidden": 0.359375, + "loss/logits": 0.11682362854480743, + "loss/reg": 321156.0, + "step": 119 + }, + { + "epoch": 0.003, + "grad_norm": 0.9654676914215088, + "grad_norm_var": 0.3881140201880987, + "learning_rate": 0.0001, + "loss": 320605.125, + "loss/crossentropy": 2.579970121383667, + "loss/hidden": 0.359375, + "loss/logits": 0.11028732359409332, + "loss/reg": 320604.65625, + "step": 120 + }, + { + "epoch": 0.003025, + "grad_norm": 0.9691014289855957, + "grad_norm_var": 0.39150146698907184, + "learning_rate": 0.0001, + "loss": 320055.25, + "loss/crossentropy": 2.3624041080474854, + "loss/hidden": 0.3671875, + "loss/logits": 0.11543390154838562, + "loss/reg": 320054.78125, + "step": 121 + }, + { + "epoch": 0.00305, + "grad_norm": 1.3653072118759155, + "grad_norm_var": 0.39183782310619764, + "learning_rate": 0.0001, + "loss": 319477.5, + "loss/crossentropy": 2.8515052795410156, + "loss/hidden": 0.375, + "loss/logits": 0.1050575003027916, + "loss/reg": 319477.03125, + "step": 122 + }, + { + "epoch": 0.003075, + "grad_norm": 1.1826074123382568, + "grad_norm_var": 0.3879316075394361, + "learning_rate": 0.0001, + "loss": 318915.0938, + "loss/crossentropy": 2.2997782230377197, + "loss/hidden": 0.37109375, + "loss/logits": 0.1062554344534874, + "loss/reg": 318914.625, + "step": 123 + }, + { + "epoch": 0.0031, + "grad_norm": 1.293823003768921, + "grad_norm_var": 0.3776996752167721, + "learning_rate": 0.0001, + "loss": 318354.0625, + "loss/crossentropy": 2.4876811504364014, + "loss/hidden": 0.376953125, + "loss/logits": 0.12040136754512787, + "loss/reg": 318353.5625, + "step": 124 + }, + { + "epoch": 0.003125, + "grad_norm": 0.9141258001327515, + "grad_norm_var": 0.3835733183646012, + "learning_rate": 0.0001, + "loss": 317789.2812, + "loss/crossentropy": 2.641594409942627, + "loss/hidden": 0.3515625, + "loss/logits": 0.10970079898834229, + "loss/reg": 317788.8125, + "step": 125 + }, + { + "epoch": 0.00315, + "grad_norm": 1.066786289215088, + "grad_norm_var": 0.3838231454325568, + "learning_rate": 0.0001, + "loss": 317230.2812, + "loss/crossentropy": 2.4842031002044678, + "loss/hidden": 0.375, + "loss/logits": 0.09922473877668381, + "loss/reg": 317229.8125, + "step": 126 + }, + { + "epoch": 0.003175, + "grad_norm": 0.8703321814537048, + "grad_norm_var": 0.38617957642686856, + "learning_rate": 0.0001, + "loss": 316661.7188, + "loss/crossentropy": 2.226027011871338, + "loss/hidden": 0.359375, + "loss/logits": 0.10238594561815262, + "loss/reg": 316661.25, + "step": 127 + }, + { + "epoch": 0.0032, + "grad_norm": 0.9229066967964172, + "grad_norm_var": 0.03339106383164576, + "learning_rate": 0.0001, + "loss": 316095.2812, + "loss/crossentropy": 2.399365186691284, + "loss/hidden": 0.361328125, + "loss/logits": 0.10705035924911499, + "loss/reg": 316094.8125, + "step": 128 + }, + { + "epoch": 0.003225, + "grad_norm": 0.8661057949066162, + "grad_norm_var": 0.02258822527587553, + "learning_rate": 0.0001, + "loss": 315522.375, + "loss/crossentropy": 2.181664228439331, + "loss/hidden": 0.35546875, + "loss/logits": 0.10982446372509003, + "loss/reg": 315521.90625, + "step": 129 + }, + { + "epoch": 0.00325, + "grad_norm": 1.0535880327224731, + "grad_norm_var": 0.02268437097398235, + "learning_rate": 0.0001, + "loss": 314946.25, + "loss/crossentropy": 2.577036142349243, + "loss/hidden": 0.37109375, + "loss/logits": 0.11703610420227051, + "loss/reg": 314945.75, + "step": 130 + }, + { + "epoch": 0.003275, + "grad_norm": 1.115898847579956, + "grad_norm_var": 0.02330423350459072, + "learning_rate": 0.0001, + "loss": 314301.0938, + "loss/crossentropy": 2.495171308517456, + "loss/hidden": 0.361328125, + "loss/logits": 0.0935671329498291, + "loss/reg": 314300.625, + "step": 131 + }, + { + "epoch": 0.0033, + "grad_norm": 1.5032321214675903, + "grad_norm_var": 0.036958687026436544, + "learning_rate": 0.0001, + "loss": 313705.0938, + "loss/crossentropy": 2.527507781982422, + "loss/hidden": 0.361328125, + "loss/logits": 0.10602883994579315, + "loss/reg": 313704.625, + "step": 132 + }, + { + "epoch": 0.003325, + "grad_norm": 1.007105827331543, + "grad_norm_var": 0.035430911148712066, + "learning_rate": 0.0001, + "loss": 313114.1875, + "loss/crossentropy": 2.235227584838867, + "loss/hidden": 0.365234375, + "loss/logits": 0.1109972670674324, + "loss/reg": 313113.71875, + "step": 133 + }, + { + "epoch": 0.00335, + "grad_norm": 1.1278802156448364, + "grad_norm_var": 0.03561144449842336, + "learning_rate": 0.0001, + "loss": 312513.6875, + "loss/crossentropy": 2.4092252254486084, + "loss/hidden": 0.3671875, + "loss/logits": 0.11512389779090881, + "loss/reg": 312513.21875, + "step": 134 + }, + { + "epoch": 0.003375, + "grad_norm": 0.9926447868347168, + "grad_norm_var": 0.033416226700900546, + "learning_rate": 0.0001, + "loss": 311928.1562, + "loss/crossentropy": 2.5909008979797363, + "loss/hidden": 0.37109375, + "loss/logits": 0.1105237603187561, + "loss/reg": 311927.6875, + "step": 135 + }, + { + "epoch": 0.0034, + "grad_norm": 1.1020681858062744, + "grad_norm_var": 0.0325682488842796, + "learning_rate": 0.0001, + "loss": 311347.2188, + "loss/crossentropy": 2.4427216053009033, + "loss/hidden": 0.349609375, + "loss/logits": 0.10573369264602661, + "loss/reg": 311346.75, + "step": 136 + }, + { + "epoch": 0.003425, + "grad_norm": 1.4713138341903687, + "grad_norm_var": 0.040598215379358274, + "learning_rate": 0.0001, + "loss": 310768.625, + "loss/crossentropy": 2.1684117317199707, + "loss/hidden": 0.36328125, + "loss/logits": 0.13670936226844788, + "loss/reg": 310768.125, + "step": 137 + }, + { + "epoch": 0.00345, + "grad_norm": 1.02042818069458, + "grad_norm_var": 0.03656716460911245, + "learning_rate": 0.0001, + "loss": 310182.9375, + "loss/crossentropy": 2.5576694011688232, + "loss/hidden": 0.353515625, + "loss/logits": 0.1007159948348999, + "loss/reg": 310182.46875, + "step": 138 + }, + { + "epoch": 0.003475, + "grad_norm": 1.0042083263397217, + "grad_norm_var": 0.036458819511781294, + "learning_rate": 0.0001, + "loss": 309594.6562, + "loss/crossentropy": 2.290005683898926, + "loss/hidden": 0.376953125, + "loss/logits": 0.12406025826931, + "loss/reg": 309594.15625, + "step": 139 + }, + { + "epoch": 0.0035, + "grad_norm": 1.167313814163208, + "grad_norm_var": 0.03390765521071438, + "learning_rate": 0.0001, + "loss": 309012.4062, + "loss/crossentropy": 3.0499072074890137, + "loss/hidden": 0.3671875, + "loss/logits": 0.10719737410545349, + "loss/reg": 309011.9375, + "step": 140 + }, + { + "epoch": 0.003525, + "grad_norm": 1.158155918121338, + "grad_norm_var": 0.03238307654376082, + "learning_rate": 0.0001, + "loss": 308423.6562, + "loss/crossentropy": 2.15006422996521, + "loss/hidden": 0.3671875, + "loss/logits": 0.11542148888111115, + "loss/reg": 308423.1875, + "step": 141 + }, + { + "epoch": 0.00355, + "grad_norm": 1.092081069946289, + "grad_norm_var": 0.03234267285948652, + "learning_rate": 0.0001, + "loss": 307830.7188, + "loss/crossentropy": 2.273993730545044, + "loss/hidden": 0.390625, + "loss/logits": 0.13467177748680115, + "loss/reg": 307830.1875, + "step": 142 + }, + { + "epoch": 0.003575, + "grad_norm": 1.0921188592910767, + "grad_norm_var": 0.028855911181609867, + "learning_rate": 0.0001, + "loss": 307240.7812, + "loss/crossentropy": 2.25028395652771, + "loss/hidden": 0.36328125, + "loss/logits": 0.1119830310344696, + "loss/reg": 307240.3125, + "step": 143 + }, + { + "epoch": 0.0036, + "grad_norm": 1.1455613374710083, + "grad_norm_var": 0.026516861915888274, + "learning_rate": 0.0001, + "loss": 306582.3125, + "loss/crossentropy": 2.4379734992980957, + "loss/hidden": 0.37890625, + "loss/logits": 0.1230136901140213, + "loss/reg": 306581.8125, + "step": 144 + }, + { + "epoch": 0.003625, + "grad_norm": 1.1363896131515503, + "grad_norm_var": 0.021933560562692318, + "learning_rate": 0.0001, + "loss": 305979.9688, + "loss/crossentropy": 2.93501353263855, + "loss/hidden": 0.37890625, + "loss/logits": 0.11584587395191193, + "loss/reg": 305979.46875, + "step": 145 + }, + { + "epoch": 0.00365, + "grad_norm": 1.3462218046188354, + "grad_norm_var": 0.024036067985930682, + "learning_rate": 0.0001, + "loss": 305383.7188, + "loss/crossentropy": 2.9245474338531494, + "loss/hidden": 0.365234375, + "loss/logits": 0.0957048088312149, + "loss/reg": 305383.25, + "step": 146 + }, + { + "epoch": 0.003675, + "grad_norm": 1.6365784406661987, + "grad_norm_var": 0.03825433404631227, + "learning_rate": 0.0001, + "loss": 304787.3125, + "loss/crossentropy": 2.446303367614746, + "loss/hidden": 0.396484375, + "loss/logits": 0.13205978274345398, + "loss/reg": 304786.78125, + "step": 147 + }, + { + "epoch": 0.0037, + "grad_norm": 1.2740546464920044, + "grad_norm_var": 0.03189546019524201, + "learning_rate": 0.0001, + "loss": 304129.375, + "loss/crossentropy": 2.645003318786621, + "loss/hidden": 0.39453125, + "loss/logits": 0.11766268312931061, + "loss/reg": 304128.875, + "step": 148 + }, + { + "epoch": 0.003725, + "grad_norm": 1.23747718334198, + "grad_norm_var": 0.030105001627378593, + "learning_rate": 0.0001, + "loss": 303520.3125, + "loss/crossentropy": 2.148782253265381, + "loss/hidden": 0.380859375, + "loss/logits": 0.11950172483921051, + "loss/reg": 303519.8125, + "step": 149 + }, + { + "epoch": 0.00375, + "grad_norm": 1.1281896829605103, + "grad_norm_var": 0.03010253596778695, + "learning_rate": 0.0001, + "loss": 302918.0625, + "loss/crossentropy": 2.431253433227539, + "loss/hidden": 0.361328125, + "loss/logits": 0.10273518413305283, + "loss/reg": 302917.59375, + "step": 150 + }, + { + "epoch": 0.003775, + "grad_norm": 1.233536720275879, + "grad_norm_var": 0.02746115673277192, + "learning_rate": 0.0001, + "loss": 302312.5, + "loss/crossentropy": 2.5889945030212402, + "loss/hidden": 0.3828125, + "loss/logits": 0.13727445900440216, + "loss/reg": 302311.96875, + "step": 151 + }, + { + "epoch": 0.0038, + "grad_norm": 0.9557918310165405, + "grad_norm_var": 0.030764173494984705, + "learning_rate": 0.0001, + "loss": 301711.2812, + "loss/crossentropy": 2.881540298461914, + "loss/hidden": 0.375, + "loss/logits": 0.09907993674278259, + "loss/reg": 301710.8125, + "step": 152 + }, + { + "epoch": 0.003825, + "grad_norm": 1.6195590496063232, + "grad_norm_var": 0.03762476341575312, + "learning_rate": 0.0001, + "loss": 301111.375, + "loss/crossentropy": 2.677419424057007, + "loss/hidden": 0.41796875, + "loss/logits": 0.1280079483985901, + "loss/reg": 301110.84375, + "step": 153 + }, + { + "epoch": 0.00385, + "grad_norm": 1.2070472240447998, + "grad_norm_var": 0.035259098369091586, + "learning_rate": 0.0001, + "loss": 300519.5938, + "loss/crossentropy": 2.0665385723114014, + "loss/hidden": 0.388671875, + "loss/logits": 0.12192954123020172, + "loss/reg": 300519.09375, + "step": 154 + }, + { + "epoch": 0.003875, + "grad_norm": 1.0260303020477295, + "grad_norm_var": 0.03467658118889426, + "learning_rate": 0.0001, + "loss": 299925.3125, + "loss/crossentropy": 2.2250208854675293, + "loss/hidden": 0.373046875, + "loss/logits": 0.09717711806297302, + "loss/reg": 299924.84375, + "step": 155 + }, + { + "epoch": 0.0039, + "grad_norm": 3.2926387786865234, + "grad_norm_var": 0.30319103627804694, + "learning_rate": 0.0001, + "loss": 299323.6562, + "loss/crossentropy": 2.505962610244751, + "loss/hidden": 0.376953125, + "loss/logits": 0.09269723296165466, + "loss/reg": 299323.1875, + "step": 156 + }, + { + "epoch": 0.003925, + "grad_norm": 1.0687026977539062, + "grad_norm_var": 0.30596545543291087, + "learning_rate": 0.0001, + "loss": 298658.0625, + "loss/crossentropy": 2.4630236625671387, + "loss/hidden": 0.37109375, + "loss/logits": 0.10382035374641418, + "loss/reg": 298657.59375, + "step": 157 + }, + { + "epoch": 0.00395, + "grad_norm": 2.530480146408081, + "grad_norm_var": 0.38710687727071186, + "learning_rate": 0.0001, + "loss": 298049.2188, + "loss/crossentropy": 2.357905149459839, + "loss/hidden": 0.380859375, + "loss/logits": 0.10747890919446945, + "loss/reg": 298048.71875, + "step": 158 + }, + { + "epoch": 0.003975, + "grad_norm": 1.1068466901779175, + "grad_norm_var": 0.3864507502154694, + "learning_rate": 0.0001, + "loss": 297454.375, + "loss/crossentropy": 2.54504132270813, + "loss/hidden": 0.390625, + "loss/logits": 0.1359184831380844, + "loss/reg": 297453.84375, + "step": 159 + }, + { + "epoch": 0.004, + "grad_norm": 1.300596833229065, + "grad_norm_var": 0.3819891406164392, + "learning_rate": 0.0001, + "loss": 296827.9062, + "loss/crossentropy": 2.3191466331481934, + "loss/hidden": 0.369140625, + "loss/logits": 0.10781531035900116, + "loss/reg": 296827.4375, + "step": 160 + }, + { + "epoch": 0.004025, + "grad_norm": 1.139121413230896, + "grad_norm_var": 0.38187765086207476, + "learning_rate": 0.0001, + "loss": 296222.1875, + "loss/crossentropy": 2.337054491043091, + "loss/hidden": 0.359375, + "loss/logits": 0.09755780547857285, + "loss/reg": 296221.71875, + "step": 161 + }, + { + "epoch": 0.00405, + "grad_norm": 0.9172402620315552, + "grad_norm_var": 0.398967868486981, + "learning_rate": 0.0001, + "loss": 295604.0938, + "loss/crossentropy": 2.536017417907715, + "loss/hidden": 0.36328125, + "loss/logits": 0.10009163618087769, + "loss/reg": 295603.625, + "step": 162 + }, + { + "epoch": 0.004075, + "grad_norm": 0.87556391954422, + "grad_norm_var": 0.4128959927795121, + "learning_rate": 0.0001, + "loss": 294997.2188, + "loss/crossentropy": 2.4224531650543213, + "loss/hidden": 0.35546875, + "loss/logits": 0.09989573061466217, + "loss/reg": 294996.75, + "step": 163 + }, + { + "epoch": 0.0041, + "grad_norm": 1.1213515996932983, + "grad_norm_var": 0.416297804221395, + "learning_rate": 0.0001, + "loss": 294385.9688, + "loss/crossentropy": 2.5139639377593994, + "loss/hidden": 0.353515625, + "loss/logits": 0.12164813280105591, + "loss/reg": 294385.5, + "step": 164 + }, + { + "epoch": 0.004125, + "grad_norm": 0.9138302803039551, + "grad_norm_var": 0.4281321991307845, + "learning_rate": 0.0001, + "loss": 293774.9375, + "loss/crossentropy": 2.6180176734924316, + "loss/hidden": 0.37109375, + "loss/logits": 0.11297552287578583, + "loss/reg": 293774.46875, + "step": 165 + }, + { + "epoch": 0.00415, + "grad_norm": 0.8265255093574524, + "grad_norm_var": 0.442330460385934, + "learning_rate": 0.0001, + "loss": 293165.0625, + "loss/crossentropy": 2.337454080581665, + "loss/hidden": 0.357421875, + "loss/logits": 0.11443285644054413, + "loss/reg": 293164.59375, + "step": 166 + }, + { + "epoch": 0.004175, + "grad_norm": 1.0448765754699707, + "grad_norm_var": 0.4467533252113644, + "learning_rate": 0.0001, + "loss": 292557.4375, + "loss/crossentropy": 2.405266523361206, + "loss/hidden": 0.353515625, + "loss/logits": 0.09610580652952194, + "loss/reg": 292557.0, + "step": 167 + }, + { + "epoch": 0.0042, + "grad_norm": 1.2320325374603271, + "grad_norm_var": 0.43850816507715845, + "learning_rate": 0.0001, + "loss": 291948.2188, + "loss/crossentropy": 2.1647391319274902, + "loss/hidden": 0.373046875, + "loss/logits": 0.09476009011268616, + "loss/reg": 291947.75, + "step": 168 + }, + { + "epoch": 0.004225, + "grad_norm": 1.198795199394226, + "grad_norm_var": 0.4331266932673277, + "learning_rate": 0.0001, + "loss": 291335.2188, + "loss/crossentropy": 2.5491573810577393, + "loss/hidden": 0.36328125, + "loss/logits": 0.10695995390415192, + "loss/reg": 291334.75, + "step": 169 + }, + { + "epoch": 0.00425, + "grad_norm": 1.2981737852096558, + "grad_norm_var": 0.4325150250770269, + "learning_rate": 0.0001, + "loss": 290728.0, + "loss/crossentropy": 2.8310611248016357, + "loss/hidden": 0.376953125, + "loss/logits": 0.1097162589430809, + "loss/reg": 290727.5, + "step": 170 + }, + { + "epoch": 0.004275, + "grad_norm": 0.8877711296081543, + "grad_norm_var": 0.4388671877783653, + "learning_rate": 0.0001, + "loss": 290118.9375, + "loss/crossentropy": 2.337736129760742, + "loss/hidden": 0.361328125, + "loss/logits": 0.10747654736042023, + "loss/reg": 290118.46875, + "step": 171 + }, + { + "epoch": 0.0043, + "grad_norm": 1.618420124053955, + "grad_norm_var": 0.16860597836987498, + "learning_rate": 0.0001, + "loss": 289504.875, + "loss/crossentropy": 2.511380195617676, + "loss/hidden": 0.376953125, + "loss/logits": 0.11618395149707794, + "loss/reg": 289504.375, + "step": 172 + }, + { + "epoch": 0.004325, + "grad_norm": 1.0306227207183838, + "grad_norm_var": 0.16932527275910872, + "learning_rate": 0.0001, + "loss": 288894.0, + "loss/crossentropy": 2.384922742843628, + "loss/hidden": 0.3515625, + "loss/logits": 0.11106235533952713, + "loss/reg": 288893.53125, + "step": 173 + }, + { + "epoch": 0.00435, + "grad_norm": 1.1711888313293457, + "grad_norm_var": 0.041883230518856604, + "learning_rate": 0.0001, + "loss": 288282.2812, + "loss/crossentropy": 2.558091878890991, + "loss/hidden": 0.365234375, + "loss/logits": 0.11737295985221863, + "loss/reg": 288281.8125, + "step": 174 + }, + { + "epoch": 0.004375, + "grad_norm": 1.187247395515442, + "grad_norm_var": 0.04230506281728168, + "learning_rate": 0.0001, + "loss": 287670.5, + "loss/crossentropy": 2.7218947410583496, + "loss/hidden": 0.375, + "loss/logits": 0.12176042795181274, + "loss/reg": 287670.0, + "step": 175 + }, + { + "epoch": 0.0044, + "grad_norm": 0.8527270555496216, + "grad_norm_var": 0.04347262990668099, + "learning_rate": 0.0001, + "loss": 287040.0, + "loss/crossentropy": 2.3621020317077637, + "loss/hidden": 0.3359375, + "loss/logits": 0.09604518115520477, + "loss/reg": 287039.5625, + "step": 176 + }, + { + "epoch": 0.004425, + "grad_norm": 1.0103626251220703, + "grad_norm_var": 0.0435318982609776, + "learning_rate": 0.0001, + "loss": 286424.9062, + "loss/crossentropy": 2.7167232036590576, + "loss/hidden": 0.34375, + "loss/logits": 0.09909898042678833, + "loss/reg": 286424.46875, + "step": 177 + }, + { + "epoch": 0.00445, + "grad_norm": 1.8277630805969238, + "grad_norm_var": 0.07629581584707286, + "learning_rate": 0.0001, + "loss": 285803.625, + "loss/crossentropy": 2.4057064056396484, + "loss/hidden": 0.423828125, + "loss/logits": 0.12081070989370346, + "loss/reg": 285803.09375, + "step": 178 + }, + { + "epoch": 0.004475, + "grad_norm": 1.0141959190368652, + "grad_norm_var": 0.07277399737205728, + "learning_rate": 0.0001, + "loss": 285182.7188, + "loss/crossentropy": 2.4887070655822754, + "loss/hidden": 0.369140625, + "loss/logits": 0.10165692120790482, + "loss/reg": 285182.25, + "step": 179 + }, + { + "epoch": 0.0045, + "grad_norm": 1.0775957107543945, + "grad_norm_var": 0.07300095477065095, + "learning_rate": 0.0001, + "loss": 284551.3125, + "loss/crossentropy": 2.5259671211242676, + "loss/hidden": 0.38671875, + "loss/logits": 0.12639477849006653, + "loss/reg": 284550.8125, + "step": 180 + }, + { + "epoch": 0.004525, + "grad_norm": 0.741888701915741, + "grad_norm_var": 0.07996517019108704, + "learning_rate": 0.0001, + "loss": 283933.2812, + "loss/crossentropy": 2.320598840713501, + "loss/hidden": 0.33984375, + "loss/logits": 0.09457631409168243, + "loss/reg": 283932.84375, + "step": 181 + }, + { + "epoch": 0.00455, + "grad_norm": 1.2044017314910889, + "grad_norm_var": 0.07378781633814714, + "learning_rate": 0.0001, + "loss": 283313.875, + "loss/crossentropy": 2.6596872806549072, + "loss/hidden": 0.365234375, + "loss/logits": 0.11112990975379944, + "loss/reg": 283313.40625, + "step": 182 + }, + { + "epoch": 0.004575, + "grad_norm": 1.0499355792999268, + "grad_norm_var": 0.07371858828104802, + "learning_rate": 0.0001, + "loss": 282699.0312, + "loss/crossentropy": 2.4527785778045654, + "loss/hidden": 0.365234375, + "loss/logits": 0.10764066874980927, + "loss/reg": 282698.5625, + "step": 183 + }, + { + "epoch": 0.0046, + "grad_norm": 0.9653274416923523, + "grad_norm_var": 0.07525412007987213, + "learning_rate": 0.0001, + "loss": 282087.6562, + "loss/crossentropy": 2.3212976455688477, + "loss/hidden": 0.37109375, + "loss/logits": 0.1268739551305771, + "loss/reg": 282087.15625, + "step": 184 + }, + { + "epoch": 0.004625, + "grad_norm": 0.8533391952514648, + "grad_norm_var": 0.079706512148127, + "learning_rate": 0.0001, + "loss": 281480.6875, + "loss/crossentropy": 2.3971142768859863, + "loss/hidden": 0.345703125, + "loss/logits": 0.09729812294244766, + "loss/reg": 281480.25, + "step": 185 + }, + { + "epoch": 0.00465, + "grad_norm": 0.9148571491241455, + "grad_norm_var": 0.0793712865271918, + "learning_rate": 0.0001, + "loss": 280867.7188, + "loss/crossentropy": 2.2173333168029785, + "loss/hidden": 0.333984375, + "loss/logits": 0.08855113387107849, + "loss/reg": 280867.28125, + "step": 186 + }, + { + "epoch": 0.004675, + "grad_norm": 0.8931951522827148, + "grad_norm_var": 0.07922833522774221, + "learning_rate": 0.0001, + "loss": 280253.6875, + "loss/crossentropy": 2.5096046924591064, + "loss/hidden": 0.337890625, + "loss/logits": 0.10861176252365112, + "loss/reg": 280253.25, + "step": 187 + }, + { + "epoch": 0.0047, + "grad_norm": 0.9277564883232117, + "grad_norm_var": 0.06022543723158867, + "learning_rate": 0.0001, + "loss": 279635.9062, + "loss/crossentropy": 2.5655405521392822, + "loss/hidden": 0.359375, + "loss/logits": 0.10575452446937561, + "loss/reg": 279635.4375, + "step": 188 + }, + { + "epoch": 0.004725, + "grad_norm": 0.8951659202575684, + "grad_norm_var": 0.06163460251059214, + "learning_rate": 0.0001, + "loss": 279024.125, + "loss/crossentropy": 2.498629093170166, + "loss/hidden": 0.35546875, + "loss/logits": 0.09899453818798065, + "loss/reg": 279023.65625, + "step": 189 + }, + { + "epoch": 0.00475, + "grad_norm": 0.9445829391479492, + "grad_norm_var": 0.06078005444577621, + "learning_rate": 0.0001, + "loss": 278415.8125, + "loss/crossentropy": 2.5467264652252197, + "loss/hidden": 0.353515625, + "loss/logits": 0.09790721535682678, + "loss/reg": 278415.375, + "step": 190 + }, + { + "epoch": 0.004775, + "grad_norm": 1.0030640363693237, + "grad_norm_var": 0.058854968365466796, + "learning_rate": 0.0001, + "loss": 277809.5312, + "loss/crossentropy": 2.3620338439941406, + "loss/hidden": 0.35546875, + "loss/logits": 0.08987599611282349, + "loss/reg": 277809.09375, + "step": 191 + }, + { + "epoch": 0.0048, + "grad_norm": 1.0268861055374146, + "grad_norm_var": 0.05707516016780034, + "learning_rate": 0.0001, + "loss": 277195.1562, + "loss/crossentropy": 2.4163973331451416, + "loss/hidden": 0.3359375, + "loss/logits": 0.10799846798181534, + "loss/reg": 277194.71875, + "step": 192 + }, + { + "epoch": 0.004825, + "grad_norm": 1.0668983459472656, + "grad_norm_var": 0.05718799704836983, + "learning_rate": 0.0001, + "loss": 276591.3125, + "loss/crossentropy": 2.436892509460449, + "loss/hidden": 0.361328125, + "loss/logits": 0.11163410544395447, + "loss/reg": 276590.84375, + "step": 193 + }, + { + "epoch": 0.00485, + "grad_norm": 0.8416240811347961, + "grad_norm_var": 0.012472236631485067, + "learning_rate": 0.0001, + "loss": 275983.6562, + "loss/crossentropy": 2.4544501304626465, + "loss/hidden": 0.357421875, + "loss/logits": 0.09674258530139923, + "loss/reg": 275983.1875, + "step": 194 + }, + { + "epoch": 0.004875, + "grad_norm": 0.9170304536819458, + "grad_norm_var": 0.012409338821028607, + "learning_rate": 0.0001, + "loss": 275381.8438, + "loss/crossentropy": 2.3440940380096436, + "loss/hidden": 0.345703125, + "loss/logits": 0.10958852618932724, + "loss/reg": 275381.375, + "step": 195 + }, + { + "epoch": 0.0049, + "grad_norm": 0.9952634572982788, + "grad_norm_var": 0.011517069584062688, + "learning_rate": 0.0001, + "loss": 274752.875, + "loss/crossentropy": 2.655498504638672, + "loss/hidden": 0.357421875, + "loss/logits": 0.1187039464712143, + "loss/reg": 274752.40625, + "step": 196 + }, + { + "epoch": 0.004925, + "grad_norm": 0.8331409692764282, + "grad_norm_var": 0.009474079189952281, + "learning_rate": 0.0001, + "loss": 274145.5938, + "loss/crossentropy": 2.363325357437134, + "loss/hidden": 0.34375, + "loss/logits": 0.1032324731349945, + "loss/reg": 274145.15625, + "step": 197 + }, + { + "epoch": 0.00495, + "grad_norm": 1.0865329504013062, + "grad_norm_var": 0.0064743748957604525, + "learning_rate": 0.0001, + "loss": 273545.7812, + "loss/crossentropy": 2.2604920864105225, + "loss/hidden": 0.357421875, + "loss/logits": 0.1005285307765007, + "loss/reg": 273545.3125, + "step": 198 + }, + { + "epoch": 0.004975, + "grad_norm": 0.9372029900550842, + "grad_norm_var": 0.00578024785383775, + "learning_rate": 0.0001, + "loss": 272943.7188, + "loss/crossentropy": 2.70699143409729, + "loss/hidden": 0.357421875, + "loss/logits": 0.10466171056032181, + "loss/reg": 272943.25, + "step": 199 + }, + { + "epoch": 0.005, + "grad_norm": 1.295333981513977, + "grad_norm_var": 0.013531057663448332, + "learning_rate": 0.0001, + "loss": 272339.8125, + "loss/crossentropy": 2.5356268882751465, + "loss/hidden": 0.34375, + "loss/logits": 0.09528132528066635, + "loss/reg": 272339.375, + "step": 200 + }, + { + "epoch": 0.005025, + "grad_norm": 0.8280879855155945, + "grad_norm_var": 0.013945141975632147, + "learning_rate": 0.0001, + "loss": 271735.5625, + "loss/crossentropy": 2.7488901615142822, + "loss/hidden": 0.341796875, + "loss/logits": 0.10489709675312042, + "loss/reg": 271735.125, + "step": 201 + }, + { + "epoch": 0.00505, + "grad_norm": 1.2747125625610352, + "grad_norm_var": 0.01973283747601891, + "learning_rate": 0.0001, + "loss": 271125.9062, + "loss/crossentropy": 2.9518277645111084, + "loss/hidden": 0.375, + "loss/logits": 0.124430812895298, + "loss/reg": 271125.40625, + "step": 202 + }, + { + "epoch": 0.005075, + "grad_norm": 1.2745908498764038, + "grad_norm_var": 0.024135134657858496, + "learning_rate": 0.0001, + "loss": 270510.0938, + "loss/crossentropy": 2.5481607913970947, + "loss/hidden": 0.36328125, + "loss/logits": 0.1074289083480835, + "loss/reg": 270509.625, + "step": 203 + }, + { + "epoch": 0.0051, + "grad_norm": 0.9992681741714478, + "grad_norm_var": 0.023677797963842655, + "learning_rate": 0.0001, + "loss": 269877.4688, + "loss/crossentropy": 2.6645333766937256, + "loss/hidden": 0.357421875, + "loss/logits": 0.1071658581495285, + "loss/reg": 269877.0, + "step": 204 + }, + { + "epoch": 0.005125, + "grad_norm": 1.423200011253357, + "grad_norm_var": 0.03275789245082638, + "learning_rate": 0.0001, + "loss": 269255.5625, + "loss/crossentropy": 2.802064895629883, + "loss/hidden": 0.36328125, + "loss/logits": 0.08967677503824234, + "loss/reg": 269255.125, + "step": 205 + }, + { + "epoch": 0.00515, + "grad_norm": 1.1624194383621216, + "grad_norm_var": 0.03275731314503016, + "learning_rate": 0.0001, + "loss": 268633.5938, + "loss/crossentropy": 2.4441721439361572, + "loss/hidden": 0.38671875, + "loss/logits": 0.09957188367843628, + "loss/reg": 268633.09375, + "step": 206 + }, + { + "epoch": 0.005175, + "grad_norm": 0.9844810962677002, + "grad_norm_var": 0.032920781672822114, + "learning_rate": 0.0001, + "loss": 268009.5, + "loss/crossentropy": 2.4509806632995605, + "loss/hidden": 0.357421875, + "loss/logits": 0.09912946820259094, + "loss/reg": 268009.03125, + "step": 207 + }, + { + "epoch": 0.0052, + "grad_norm": 1.0119577646255493, + "grad_norm_var": 0.03299896367337376, + "learning_rate": 0.0001, + "loss": 267392.7188, + "loss/crossentropy": 2.3468456268310547, + "loss/hidden": 0.361328125, + "loss/logits": 0.10302117466926575, + "loss/reg": 267392.25, + "step": 208 + }, + { + "epoch": 0.005225, + "grad_norm": 0.9958096146583557, + "grad_norm_var": 0.033232689832916586, + "learning_rate": 0.0001, + "loss": 266773.6562, + "loss/crossentropy": 2.5680363178253174, + "loss/hidden": 0.345703125, + "loss/logits": 0.10389181971549988, + "loss/reg": 266773.21875, + "step": 209 + }, + { + "epoch": 0.00525, + "grad_norm": 1.0967997312545776, + "grad_norm_var": 0.030083708105304044, + "learning_rate": 0.0001, + "loss": 266151.75, + "loss/crossentropy": 2.4047651290893555, + "loss/hidden": 0.373046875, + "loss/logits": 0.1094539687037468, + "loss/reg": 266151.28125, + "step": 210 + }, + { + "epoch": 0.005275, + "grad_norm": 0.8511112928390503, + "grad_norm_var": 0.03169748523245202, + "learning_rate": 0.0001, + "loss": 265518.3438, + "loss/crossentropy": 2.393298387527466, + "loss/hidden": 0.330078125, + "loss/logits": 0.1038215309381485, + "loss/reg": 265517.90625, + "step": 211 + }, + { + "epoch": 0.0053, + "grad_norm": 1.0173771381378174, + "grad_norm_var": 0.031520604359032575, + "learning_rate": 0.0001, + "loss": 264902.6875, + "loss/crossentropy": 2.589514970779419, + "loss/hidden": 0.37109375, + "loss/logits": 0.10674653202295303, + "loss/reg": 264902.21875, + "step": 212 + }, + { + "epoch": 0.005325, + "grad_norm": 1.0270274877548218, + "grad_norm_var": 0.027824445234146098, + "learning_rate": 0.0001, + "loss": 264265.8125, + "loss/crossentropy": 2.455270290374756, + "loss/hidden": 0.3515625, + "loss/logits": 0.10016101598739624, + "loss/reg": 264265.375, + "step": 213 + }, + { + "epoch": 0.00535, + "grad_norm": 0.952225387096405, + "grad_norm_var": 0.02881909636722586, + "learning_rate": 0.0001, + "loss": 263649.0625, + "loss/crossentropy": 2.4784069061279297, + "loss/hidden": 0.341796875, + "loss/logits": 0.0942714512348175, + "loss/reg": 263648.625, + "step": 214 + }, + { + "epoch": 0.005375, + "grad_norm": 1.1497489213943481, + "grad_norm_var": 0.02785863109532749, + "learning_rate": 0.0001, + "loss": 263029.5312, + "loss/crossentropy": 2.5415937900543213, + "loss/hidden": 0.365234375, + "loss/logits": 0.10113269090652466, + "loss/reg": 263029.0625, + "step": 215 + }, + { + "epoch": 0.0054, + "grad_norm": 1.118820309638977, + "grad_norm_var": 0.024832392848142825, + "learning_rate": 0.0001, + "loss": 262410.8125, + "loss/crossentropy": 2.605389356613159, + "loss/hidden": 0.359375, + "loss/logits": 0.08895421028137207, + "loss/reg": 262410.375, + "step": 216 + }, + { + "epoch": 0.005425, + "grad_norm": 1.3159630298614502, + "grad_norm_var": 0.02377871624893686, + "learning_rate": 0.0001, + "loss": 261791.7344, + "loss/crossentropy": 2.860142230987549, + "loss/hidden": 0.373046875, + "loss/logits": 0.07822292298078537, + "loss/reg": 261791.28125, + "step": 217 + }, + { + "epoch": 0.00545, + "grad_norm": 1.0709484815597534, + "grad_norm_var": 0.02172128046532335, + "learning_rate": 0.0001, + "loss": 261152.7188, + "loss/crossentropy": 2.735309600830078, + "loss/hidden": 0.357421875, + "loss/logits": 0.09882272779941559, + "loss/reg": 261152.265625, + "step": 218 + }, + { + "epoch": 0.005475, + "grad_norm": 0.9251193404197693, + "grad_norm_var": 0.020787409451776744, + "learning_rate": 0.0001, + "loss": 260535.3281, + "loss/crossentropy": 2.57698130607605, + "loss/hidden": 0.349609375, + "loss/logits": 0.10102309286594391, + "loss/reg": 260534.875, + "step": 219 + }, + { + "epoch": 0.0055, + "grad_norm": 1.2007627487182617, + "grad_norm_var": 0.021454394906030537, + "learning_rate": 0.0001, + "loss": 259921.6406, + "loss/crossentropy": 2.6678407192230225, + "loss/hidden": 0.373046875, + "loss/logits": 0.12056195735931396, + "loss/reg": 259921.140625, + "step": 220 + }, + { + "epoch": 0.005525, + "grad_norm": 0.8163579702377319, + "grad_norm_var": 0.016821595485903732, + "learning_rate": 0.0001, + "loss": 259304.5312, + "loss/crossentropy": 2.159184217453003, + "loss/hidden": 0.3359375, + "loss/logits": 0.08772530406713486, + "loss/reg": 259304.109375, + "step": 221 + }, + { + "epoch": 0.00555, + "grad_norm": 1.004481315612793, + "grad_norm_var": 0.015877592297527447, + "learning_rate": 0.0001, + "loss": 258690.1094, + "loss/crossentropy": 2.620434045791626, + "loss/hidden": 0.35546875, + "loss/logits": 0.11029927432537079, + "loss/reg": 258689.640625, + "step": 222 + }, + { + "epoch": 0.005575, + "grad_norm": 1.2820781469345093, + "grad_norm_var": 0.01946037259012514, + "learning_rate": 0.0001, + "loss": 258077.8125, + "loss/crossentropy": 2.3812220096588135, + "loss/hidden": 0.3671875, + "loss/logits": 0.11590490490198135, + "loss/reg": 258077.328125, + "step": 223 + }, + { + "epoch": 0.0056, + "grad_norm": 1.1068248748779297, + "grad_norm_var": 0.019512738349682583, + "learning_rate": 0.0001, + "loss": 257462.875, + "loss/crossentropy": 2.499555826187134, + "loss/hidden": 0.34375, + "loss/logits": 0.10359837114810944, + "loss/reg": 257462.421875, + "step": 224 + }, + { + "epoch": 0.005625, + "grad_norm": 1.0697591304779053, + "grad_norm_var": 0.019239198728426683, + "learning_rate": 0.0001, + "loss": 256847.6719, + "loss/crossentropy": 2.564305305480957, + "loss/hidden": 0.357421875, + "loss/logits": 0.10096286982297897, + "loss/reg": 256847.21875, + "step": 225 + }, + { + "epoch": 0.00565, + "grad_norm": 1.013181209564209, + "grad_norm_var": 0.019297556456657598, + "learning_rate": 0.0001, + "loss": 256187.0781, + "loss/crossentropy": 2.682983636856079, + "loss/hidden": 0.373046875, + "loss/logits": 0.10669070482254028, + "loss/reg": 256186.59375, + "step": 226 + }, + { + "epoch": 0.005675, + "grad_norm": 1.149176836013794, + "grad_norm_var": 0.016643495102077507, + "learning_rate": 0.0001, + "loss": 255567.5156, + "loss/crossentropy": 2.8012313842773438, + "loss/hidden": 0.37109375, + "loss/logits": 0.1071043312549591, + "loss/reg": 255567.03125, + "step": 227 + }, + { + "epoch": 0.0057, + "grad_norm": 0.8703486323356628, + "grad_norm_var": 0.019148532368986424, + "learning_rate": 0.0001, + "loss": 254945.2188, + "loss/crossentropy": 2.1328365802764893, + "loss/hidden": 0.341796875, + "loss/logits": 0.09531014412641525, + "loss/reg": 254944.78125, + "step": 228 + }, + { + "epoch": 0.005725, + "grad_norm": 0.9841340780258179, + "grad_norm_var": 0.019492424773756397, + "learning_rate": 0.0001, + "loss": 254324.0156, + "loss/crossentropy": 2.6227455139160156, + "loss/hidden": 0.359375, + "loss/logits": 0.10204833000898361, + "loss/reg": 254323.546875, + "step": 229 + }, + { + "epoch": 0.00575, + "grad_norm": 0.8693890571594238, + "grad_norm_var": 0.02115991781307726, + "learning_rate": 0.0001, + "loss": 253708.0469, + "loss/crossentropy": 2.373120069503784, + "loss/hidden": 0.333984375, + "loss/logits": 0.1063462570309639, + "loss/reg": 253707.609375, + "step": 230 + }, + { + "epoch": 0.005775, + "grad_norm": 0.8079307675361633, + "grad_norm_var": 0.024335258901194465, + "learning_rate": 0.0001, + "loss": 253093.1562, + "loss/crossentropy": 2.293689250946045, + "loss/hidden": 0.345703125, + "loss/logits": 0.10375718027353287, + "loss/reg": 253092.703125, + "step": 231 + }, + { + "epoch": 0.0058, + "grad_norm": 0.8402238488197327, + "grad_norm_var": 0.026177766881752735, + "learning_rate": 0.0001, + "loss": 252477.5938, + "loss/crossentropy": 2.4137558937072754, + "loss/hidden": 0.3203125, + "loss/logits": 0.09599176049232483, + "loss/reg": 252477.171875, + "step": 232 + }, + { + "epoch": 0.005825, + "grad_norm": 1.118818759918213, + "grad_norm_var": 0.020838201426947003, + "learning_rate": 0.0001, + "loss": 251863.7812, + "loss/crossentropy": 2.377997398376465, + "loss/hidden": 0.359375, + "loss/logits": 0.1122322753071785, + "loss/reg": 251863.3125, + "step": 233 + }, + { + "epoch": 0.00585, + "grad_norm": 1.1597261428833008, + "grad_norm_var": 0.02207478026816852, + "learning_rate": 0.0001, + "loss": 251232.2969, + "loss/crossentropy": 2.248708724975586, + "loss/hidden": 0.34375, + "loss/logits": 0.10326389968395233, + "loss/reg": 251231.84375, + "step": 234 + }, + { + "epoch": 0.005875, + "grad_norm": 0.8862115144729614, + "grad_norm_var": 0.022628637096132904, + "learning_rate": 0.0001, + "loss": 250618.3281, + "loss/crossentropy": 2.364410877227783, + "loss/hidden": 0.357421875, + "loss/logits": 0.09520143270492554, + "loss/reg": 250617.875, + "step": 235 + }, + { + "epoch": 0.0059, + "grad_norm": 2.3220930099487305, + "grad_norm_var": 0.12955472788699715, + "learning_rate": 0.0001, + "loss": 250009.4219, + "loss/crossentropy": 2.3695480823516846, + "loss/hidden": 0.4140625, + "loss/logits": 0.20651775598526, + "loss/reg": 250008.796875, + "step": 236 + }, + { + "epoch": 0.005925, + "grad_norm": 0.8119528293609619, + "grad_norm_var": 0.1297115526028732, + "learning_rate": 0.0001, + "loss": 249394.3438, + "loss/crossentropy": 2.531947374343872, + "loss/hidden": 0.328125, + "loss/logits": 0.10816951841115952, + "loss/reg": 249393.90625, + "step": 237 + }, + { + "epoch": 0.00595, + "grad_norm": 0.8456658124923706, + "grad_norm_var": 0.1329087010824144, + "learning_rate": 0.0001, + "loss": 248792.0156, + "loss/crossentropy": 2.1139674186706543, + "loss/hidden": 0.341796875, + "loss/logits": 0.09102342277765274, + "loss/reg": 248791.578125, + "step": 238 + }, + { + "epoch": 0.005975, + "grad_norm": 1.0733762979507446, + "grad_norm_var": 0.12975996092384426, + "learning_rate": 0.0001, + "loss": 248187.4844, + "loss/crossentropy": 2.523265838623047, + "loss/hidden": 0.345703125, + "loss/logits": 0.10305313020944595, + "loss/reg": 248187.03125, + "step": 239 + }, + { + "epoch": 0.006, + "grad_norm": 1.1087757349014282, + "grad_norm_var": 0.129772885642123, + "learning_rate": 0.0001, + "loss": 247579.5469, + "loss/crossentropy": 2.562262535095215, + "loss/hidden": 0.35546875, + "loss/logits": 0.10083188116550446, + "loss/reg": 247579.09375, + "step": 240 + }, + { + "epoch": 0.006025, + "grad_norm": 1.3459455966949463, + "grad_norm_var": 0.13496698854201672, + "learning_rate": 0.0001, + "loss": 246977.7344, + "loss/crossentropy": 2.211017370223999, + "loss/hidden": 0.349609375, + "loss/logits": 0.09050680696964264, + "loss/reg": 246977.296875, + "step": 241 + }, + { + "epoch": 0.00605, + "grad_norm": 1.1334515810012817, + "grad_norm_var": 0.13487275370998758, + "learning_rate": 0.0001, + "loss": 246389.9375, + "loss/crossentropy": 2.424520492553711, + "loss/hidden": 0.34375, + "loss/logits": 0.09681625664234161, + "loss/reg": 246389.5, + "step": 242 + }, + { + "epoch": 0.006075, + "grad_norm": 0.8859056234359741, + "grad_norm_var": 0.13688003263884654, + "learning_rate": 0.0001, + "loss": 245796.7031, + "loss/crossentropy": 2.399360179901123, + "loss/hidden": 0.341796875, + "loss/logits": 0.10117021203041077, + "loss/reg": 245796.265625, + "step": 243 + }, + { + "epoch": 0.0061, + "grad_norm": 1.0999157428741455, + "grad_norm_var": 0.13416995876895163, + "learning_rate": 0.0001, + "loss": 245189.2188, + "loss/crossentropy": 2.5314061641693115, + "loss/hidden": 0.3671875, + "loss/logits": 0.09776099026203156, + "loss/reg": 245188.75, + "step": 244 + }, + { + "epoch": 0.006125, + "grad_norm": 0.9259365797042847, + "grad_norm_var": 0.13513208581838518, + "learning_rate": 0.0001, + "loss": 244560.7344, + "loss/crossentropy": 2.1860015392303467, + "loss/hidden": 0.345703125, + "loss/logits": 0.09684273600578308, + "loss/reg": 244560.296875, + "step": 245 + }, + { + "epoch": 0.00615, + "grad_norm": 1.2072253227233887, + "grad_norm_var": 0.13290427526062487, + "learning_rate": 0.0001, + "loss": 243912.7031, + "loss/crossentropy": 2.7621207237243652, + "loss/hidden": 0.359375, + "loss/logits": 0.10011032223701477, + "loss/reg": 243912.25, + "step": 246 + }, + { + "epoch": 0.006175, + "grad_norm": 0.9330417513847351, + "grad_norm_var": 0.12903841831431506, + "learning_rate": 0.0001, + "loss": 243308.4375, + "loss/crossentropy": 2.5903942584991455, + "loss/hidden": 0.341796875, + "loss/logits": 0.08663488924503326, + "loss/reg": 243308.015625, + "step": 247 + }, + { + "epoch": 0.0062, + "grad_norm": 0.9449116587638855, + "grad_norm_var": 0.12601160933316582, + "learning_rate": 0.0001, + "loss": 242708.8438, + "loss/crossentropy": 2.523956537246704, + "loss/hidden": 0.34375, + "loss/logits": 0.1024409681558609, + "loss/reg": 242708.390625, + "step": 248 + }, + { + "epoch": 0.006225, + "grad_norm": 0.9360359907150269, + "grad_norm_var": 0.12795021034255089, + "learning_rate": 0.0001, + "loss": 242110.125, + "loss/crossentropy": 2.697937488555908, + "loss/hidden": 0.34375, + "loss/logits": 0.10301453620195389, + "loss/reg": 242109.671875, + "step": 249 + }, + { + "epoch": 0.00625, + "grad_norm": 1.4438672065734863, + "grad_norm_var": 0.13521121045655154, + "learning_rate": 0.0001, + "loss": 241507.6875, + "loss/crossentropy": 2.2453622817993164, + "loss/hidden": 0.369140625, + "loss/logits": 0.15388593077659607, + "loss/reg": 241507.171875, + "step": 250 + }, + { + "epoch": 0.006275, + "grad_norm": 1.023462176322937, + "grad_norm_var": 0.13212816249865156, + "learning_rate": 0.0001, + "loss": 240912.5156, + "loss/crossentropy": 2.5658745765686035, + "loss/hidden": 0.33203125, + "loss/logits": 0.0870145931839943, + "loss/reg": 240912.09375, + "step": 251 + }, + { + "epoch": 0.0063, + "grad_norm": 1.0394799709320068, + "grad_norm_var": 0.03066997116386473, + "learning_rate": 0.0001, + "loss": 240319.1562, + "loss/crossentropy": 2.660517930984497, + "loss/hidden": 0.369140625, + "loss/logits": 0.09973147511482239, + "loss/reg": 240318.6875, + "step": 252 + }, + { + "epoch": 0.006325, + "grad_norm": 0.8985840082168579, + "grad_norm_var": 0.028419025472375382, + "learning_rate": 0.0001, + "loss": 239699.5781, + "loss/crossentropy": 2.5660433769226074, + "loss/hidden": 0.34375, + "loss/logits": 0.10593757778406143, + "loss/reg": 239699.125, + "step": 253 + }, + { + "epoch": 0.00635, + "grad_norm": 1.0081883668899536, + "grad_norm_var": 0.025580286139437903, + "learning_rate": 0.0001, + "loss": 239101.4531, + "loss/crossentropy": 2.7612223625183105, + "loss/hidden": 0.357421875, + "loss/logits": 0.09220311045646667, + "loss/reg": 239101.0, + "step": 254 + }, + { + "epoch": 0.006375, + "grad_norm": 0.9036446809768677, + "grad_norm_var": 0.027146159272535185, + "learning_rate": 0.0001, + "loss": 238506.4844, + "loss/crossentropy": 2.500701665878296, + "loss/hidden": 0.341796875, + "loss/logits": 0.0896071344614029, + "loss/reg": 238506.046875, + "step": 255 + }, + { + "epoch": 0.0064, + "grad_norm": 1.1582329273223877, + "grad_norm_var": 0.027670804759285694, + "learning_rate": 0.0001, + "loss": 237910.5781, + "loss/crossentropy": 2.4937164783477783, + "loss/hidden": 0.328125, + "loss/logits": 0.08801822364330292, + "loss/reg": 237910.15625, + "step": 256 + }, + { + "epoch": 0.006425, + "grad_norm": 0.9264727830886841, + "grad_norm_var": 0.02242301018755981, + "learning_rate": 0.0001, + "loss": 237308.7812, + "loss/crossentropy": 2.568125009536743, + "loss/hidden": 0.31640625, + "loss/logits": 0.07117190212011337, + "loss/reg": 237308.390625, + "step": 257 + }, + { + "epoch": 0.00645, + "grad_norm": 0.9694778323173523, + "grad_norm_var": 0.021825782526880135, + "learning_rate": 0.0001, + "loss": 236710.5625, + "loss/crossentropy": 2.389742612838745, + "loss/hidden": 0.35546875, + "loss/logits": 0.10670766979455948, + "loss/reg": 236710.09375, + "step": 258 + }, + { + "epoch": 0.006475, + "grad_norm": 0.9396619200706482, + "grad_norm_var": 0.02105226506656847, + "learning_rate": 0.0001, + "loss": 236112.0, + "loss/crossentropy": 2.7042641639709473, + "loss/hidden": 0.34375, + "loss/logits": 0.09515395760536194, + "loss/reg": 236111.5625, + "step": 259 + }, + { + "epoch": 0.0065, + "grad_norm": 1.0471324920654297, + "grad_norm_var": 0.020680741980486004, + "learning_rate": 0.0001, + "loss": 235514.3906, + "loss/crossentropy": 2.3326146602630615, + "loss/hidden": 0.353515625, + "loss/logits": 0.11733515560626984, + "loss/reg": 235513.921875, + "step": 260 + }, + { + "epoch": 0.006525, + "grad_norm": 0.9425455331802368, + "grad_norm_var": 0.020491703959659387, + "learning_rate": 0.0001, + "loss": 234854.2656, + "loss/crossentropy": 2.465421438217163, + "loss/hidden": 0.341796875, + "loss/logits": 0.0917583629488945, + "loss/reg": 234853.828125, + "step": 261 + }, + { + "epoch": 0.00655, + "grad_norm": 1.0369884967803955, + "grad_norm_var": 0.018056091150610066, + "learning_rate": 0.0001, + "loss": 234247.1875, + "loss/crossentropy": 2.373871326446533, + "loss/hidden": 0.328125, + "loss/logits": 0.09279663860797882, + "loss/reg": 234246.765625, + "step": 262 + }, + { + "epoch": 0.006575, + "grad_norm": 0.9980863928794861, + "grad_norm_var": 0.017657570783222276, + "learning_rate": 0.0001, + "loss": 233649.6875, + "loss/crossentropy": 2.6594350337982178, + "loss/hidden": 0.357421875, + "loss/logits": 0.09325754642486572, + "loss/reg": 233649.234375, + "step": 263 + }, + { + "epoch": 0.0066, + "grad_norm": 0.9231501221656799, + "grad_norm_var": 0.017886320334470678, + "learning_rate": 0.0001, + "loss": 233047.1406, + "loss/crossentropy": 2.5805892944335938, + "loss/hidden": 0.333984375, + "loss/logits": 0.09794628620147705, + "loss/reg": 233046.703125, + "step": 264 + }, + { + "epoch": 0.006625, + "grad_norm": 1.0291575193405151, + "grad_norm_var": 0.017482775408658568, + "learning_rate": 0.0001, + "loss": 232452.5938, + "loss/crossentropy": 2.751997232437134, + "loss/hidden": 0.33984375, + "loss/logits": 0.09924400597810745, + "loss/reg": 232452.15625, + "step": 265 + }, + { + "epoch": 0.00665, + "grad_norm": 0.9338252544403076, + "grad_norm_var": 0.0047809106518514, + "learning_rate": 0.0001, + "loss": 231850.6875, + "loss/crossentropy": 2.4896867275238037, + "loss/hidden": 0.36328125, + "loss/logits": 0.10598400980234146, + "loss/reg": 231850.21875, + "step": 266 + }, + { + "epoch": 0.006675, + "grad_norm": 0.9800401926040649, + "grad_norm_var": 0.004682617947830892, + "learning_rate": 0.0001, + "loss": 231259.6719, + "loss/crossentropy": 2.419142723083496, + "loss/hidden": 0.35546875, + "loss/logits": 0.10172034800052643, + "loss/reg": 231259.21875, + "step": 267 + }, + { + "epoch": 0.0067, + "grad_norm": 1.3181782960891724, + "grad_norm_var": 0.011620460850276437, + "learning_rate": 0.0001, + "loss": 230605.5781, + "loss/crossentropy": 2.649991273880005, + "loss/hidden": 0.361328125, + "loss/logits": 0.102925606071949, + "loss/reg": 230605.109375, + "step": 268 + }, + { + "epoch": 0.006725, + "grad_norm": 1.3538788557052612, + "grad_norm_var": 0.018369025736734025, + "learning_rate": 0.0001, + "loss": 230004.8594, + "loss/crossentropy": 2.4643285274505615, + "loss/hidden": 0.34375, + "loss/logits": 0.10253345966339111, + "loss/reg": 230004.40625, + "step": 269 + }, + { + "epoch": 0.00675, + "grad_norm": 1.4021903276443481, + "grad_norm_var": 0.026962756524618155, + "learning_rate": 0.0001, + "loss": 229408.0, + "loss/crossentropy": 2.2580270767211914, + "loss/hidden": 0.34375, + "loss/logits": 0.08600744605064392, + "loss/reg": 229407.5625, + "step": 270 + }, + { + "epoch": 0.006775, + "grad_norm": 0.983797013759613, + "grad_norm_var": 0.025758329972640683, + "learning_rate": 0.0001, + "loss": 228810.3281, + "loss/crossentropy": 2.394120693206787, + "loss/hidden": 0.337890625, + "loss/logits": 0.10623958706855774, + "loss/reg": 228809.890625, + "step": 271 + }, + { + "epoch": 0.0068, + "grad_norm": 0.9135763049125671, + "grad_norm_var": 0.02625990416445673, + "learning_rate": 0.0001, + "loss": 228215.3438, + "loss/crossentropy": 2.6530239582061768, + "loss/hidden": 0.37109375, + "loss/logits": 0.10234525799751282, + "loss/reg": 228214.875, + "step": 272 + }, + { + "epoch": 0.006825, + "grad_norm": 0.9926636815071106, + "grad_norm_var": 0.025499722066653265, + "learning_rate": 0.0001, + "loss": 227619.7812, + "loss/crossentropy": 2.3669707775115967, + "loss/hidden": 0.34375, + "loss/logits": 0.09970035403966904, + "loss/reg": 227619.34375, + "step": 273 + }, + { + "epoch": 0.00685, + "grad_norm": 1.1078017950057983, + "grad_norm_var": 0.025251574412599117, + "learning_rate": 0.0001, + "loss": 227035.0469, + "loss/crossentropy": 2.6442811489105225, + "loss/hidden": 0.34375, + "loss/logits": 0.11636050045490265, + "loss/reg": 227034.59375, + "step": 274 + }, + { + "epoch": 0.006875, + "grad_norm": 1.0784547328948975, + "grad_norm_var": 0.02429490171527709, + "learning_rate": 0.0001, + "loss": 226429.125, + "loss/crossentropy": 2.456956386566162, + "loss/hidden": 0.349609375, + "loss/logits": 0.09312474727630615, + "loss/reg": 226428.6875, + "step": 275 + }, + { + "epoch": 0.0069, + "grad_norm": 0.9759635329246521, + "grad_norm_var": 0.02478188381783776, + "learning_rate": 0.0001, + "loss": 225820.25, + "loss/crossentropy": 2.6639130115509033, + "loss/hidden": 0.34375, + "loss/logits": 0.09759189188480377, + "loss/reg": 225819.8125, + "step": 276 + }, + { + "epoch": 0.006925, + "grad_norm": 1.0306111574172974, + "grad_norm_var": 0.023879888291922433, + "learning_rate": 0.0001, + "loss": 225229.0, + "loss/crossentropy": 2.3712949752807617, + "loss/hidden": 0.34375, + "loss/logits": 0.09938957542181015, + "loss/reg": 225228.5625, + "step": 277 + }, + { + "epoch": 0.00695, + "grad_norm": 0.8460413813591003, + "grad_norm_var": 0.026901071203976896, + "learning_rate": 0.0001, + "loss": 224635.5938, + "loss/crossentropy": 2.413285255432129, + "loss/hidden": 0.359375, + "loss/logits": 0.10232223570346832, + "loss/reg": 224635.125, + "step": 278 + }, + { + "epoch": 0.006975, + "grad_norm": 0.9385935664176941, + "grad_norm_var": 0.027567505122787835, + "learning_rate": 0.0001, + "loss": 224046.9062, + "loss/crossentropy": 2.5851972103118896, + "loss/hidden": 0.345703125, + "loss/logits": 0.10651391744613647, + "loss/reg": 224046.453125, + "step": 279 + }, + { + "epoch": 0.007, + "grad_norm": 1.1753954887390137, + "grad_norm_var": 0.027261276031140782, + "learning_rate": 0.0001, + "loss": 223464.125, + "loss/crossentropy": 2.621198892593384, + "loss/hidden": 0.37109375, + "loss/logits": 0.1016470193862915, + "loss/reg": 223463.65625, + "step": 280 + }, + { + "epoch": 0.007025, + "grad_norm": 0.7150670289993286, + "grad_norm_var": 0.03498090695509711, + "learning_rate": 0.0001, + "loss": 222877.25, + "loss/crossentropy": 2.3241894245147705, + "loss/hidden": 0.330078125, + "loss/logits": 0.08896858990192413, + "loss/reg": 222876.828125, + "step": 281 + }, + { + "epoch": 0.00705, + "grad_norm": 0.7701132893562317, + "grad_norm_var": 0.039118337071005614, + "learning_rate": 0.0001, + "loss": 222293.3594, + "loss/crossentropy": 2.402475595474243, + "loss/hidden": 0.333984375, + "loss/logits": 0.08732319623231888, + "loss/reg": 222292.9375, + "step": 282 + }, + { + "epoch": 0.007075, + "grad_norm": 0.8436958193778992, + "grad_norm_var": 0.04130473994550788, + "learning_rate": 0.0001, + "loss": 221710.1719, + "loss/crossentropy": 2.3136937618255615, + "loss/hidden": 0.341796875, + "loss/logits": 0.08763205260038376, + "loss/reg": 221709.75, + "step": 283 + }, + { + "epoch": 0.0071, + "grad_norm": 1.1946781873703003, + "grad_norm_var": 0.03747769835346162, + "learning_rate": 0.0001, + "loss": 221121.3125, + "loss/crossentropy": 2.662444829940796, + "loss/hidden": 0.34375, + "loss/logits": 0.09524821490049362, + "loss/reg": 221120.875, + "step": 284 + }, + { + "epoch": 0.007125, + "grad_norm": 1.2086204290390015, + "grad_norm_var": 0.032333006419608526, + "learning_rate": 0.0001, + "loss": 220536.6875, + "loss/crossentropy": 2.615415334701538, + "loss/hidden": 0.35546875, + "loss/logits": 0.08893512934446335, + "loss/reg": 220536.25, + "step": 285 + }, + { + "epoch": 0.00715, + "grad_norm": 0.9618238210678101, + "grad_norm_var": 0.02148886117231328, + "learning_rate": 0.0001, + "loss": 219940.6406, + "loss/crossentropy": 2.6531989574432373, + "loss/hidden": 0.353515625, + "loss/logits": 0.10557064414024353, + "loss/reg": 219940.1875, + "step": 286 + }, + { + "epoch": 0.007175, + "grad_norm": 1.1346673965454102, + "grad_norm_var": 0.022916324895915455, + "learning_rate": 0.0001, + "loss": 219349.0469, + "loss/crossentropy": 2.2792797088623047, + "loss/hidden": 0.365234375, + "loss/logits": 0.0966588705778122, + "loss/reg": 219348.578125, + "step": 287 + }, + { + "epoch": 0.0072, + "grad_norm": 1.011553406715393, + "grad_norm_var": 0.022478923702978902, + "learning_rate": 0.0001, + "loss": 218766.7188, + "loss/crossentropy": 2.4701690673828125, + "loss/hidden": 0.35546875, + "loss/logits": 0.10134139657020569, + "loss/reg": 218766.265625, + "step": 288 + }, + { + "epoch": 0.007225, + "grad_norm": 1.0193150043487549, + "grad_norm_var": 0.02250041335545263, + "learning_rate": 0.0001, + "loss": 218185.0312, + "loss/crossentropy": 2.465268135070801, + "loss/hidden": 0.333984375, + "loss/logits": 0.09241659939289093, + "loss/reg": 218184.609375, + "step": 289 + }, + { + "epoch": 0.00725, + "grad_norm": 1.0223941802978516, + "grad_norm_var": 0.021737527223071717, + "learning_rate": 0.0001, + "loss": 217599.0938, + "loss/crossentropy": 2.7111759185791016, + "loss/hidden": 0.330078125, + "loss/logits": 0.07478834688663483, + "loss/reg": 217598.6875, + "step": 290 + }, + { + "epoch": 0.007275, + "grad_norm": 0.9252673983573914, + "grad_norm_var": 0.02150853476019776, + "learning_rate": 0.0001, + "loss": 217004.875, + "loss/crossentropy": 2.551020860671997, + "loss/hidden": 0.3515625, + "loss/logits": 0.09227164834737778, + "loss/reg": 217004.4375, + "step": 291 + }, + { + "epoch": 0.0073, + "grad_norm": 1.1825777292251587, + "grad_norm_var": 0.02390391973484823, + "learning_rate": 0.0001, + "loss": 216435.7344, + "loss/crossentropy": 2.4061431884765625, + "loss/hidden": 0.337890625, + "loss/logits": 0.09683596342802048, + "loss/reg": 216435.296875, + "step": 292 + }, + { + "epoch": 0.007325, + "grad_norm": 1.2532509565353394, + "grad_norm_var": 0.027946987478518166, + "learning_rate": 0.0001, + "loss": 215860.2344, + "loss/crossentropy": 2.4278337955474854, + "loss/hidden": 0.392578125, + "loss/logits": 0.11733637750148773, + "loss/reg": 215859.71875, + "step": 293 + }, + { + "epoch": 0.00735, + "grad_norm": 1.0421700477600098, + "grad_norm_var": 0.02599317334285066, + "learning_rate": 0.0001, + "loss": 215285.9531, + "loss/crossentropy": 2.326347827911377, + "loss/hidden": 0.333984375, + "loss/logits": 0.09628565609455109, + "loss/reg": 215285.515625, + "step": 294 + }, + { + "epoch": 0.007375, + "grad_norm": 0.8057188391685486, + "grad_norm_var": 0.028626581178707273, + "learning_rate": 0.0001, + "loss": 214712.9844, + "loss/crossentropy": 2.355774402618408, + "loss/hidden": 0.34375, + "loss/logits": 0.1015491932630539, + "loss/reg": 214712.546875, + "step": 295 + }, + { + "epoch": 0.0074, + "grad_norm": 0.9605076909065247, + "grad_norm_var": 0.026964136984593035, + "learning_rate": 0.0001, + "loss": 214139.4062, + "loss/crossentropy": 2.6580722332000732, + "loss/hidden": 0.3671875, + "loss/logits": 0.10701446235179901, + "loss/reg": 214138.9375, + "step": 296 + }, + { + "epoch": 0.007425, + "grad_norm": 1.0147887468338013, + "grad_norm_var": 0.02106352582271733, + "learning_rate": 0.0001, + "loss": 213561.5625, + "loss/crossentropy": 2.488769054412842, + "loss/hidden": 0.337890625, + "loss/logits": 0.09455054253339767, + "loss/reg": 213561.125, + "step": 297 + }, + { + "epoch": 0.00745, + "grad_norm": 0.9428306818008423, + "grad_norm_var": 0.017128520051063786, + "learning_rate": 0.0001, + "loss": 212931.4375, + "loss/crossentropy": 2.7574522495269775, + "loss/hidden": 0.333984375, + "loss/logits": 0.08713274449110031, + "loss/reg": 212931.015625, + "step": 298 + }, + { + "epoch": 0.007475, + "grad_norm": 0.9792951345443726, + "grad_norm_var": 0.014859793940526804, + "learning_rate": 0.0001, + "loss": 212353.3281, + "loss/crossentropy": 2.5566937923431396, + "loss/hidden": 0.359375, + "loss/logits": 0.08882761001586914, + "loss/reg": 212352.875, + "step": 299 + }, + { + "epoch": 0.0075, + "grad_norm": 0.896796464920044, + "grad_norm_var": 0.01431050394401887, + "learning_rate": 0.0001, + "loss": 211783.0781, + "loss/crossentropy": 2.5626933574676514, + "loss/hidden": 0.330078125, + "loss/logits": 0.09205923229455948, + "loss/reg": 211782.65625, + "step": 300 + }, + { + "epoch": 0.007525, + "grad_norm": 0.7680262923240662, + "grad_norm_var": 0.015515187773533511, + "learning_rate": 0.0001, + "loss": 211217.1875, + "loss/crossentropy": 2.431285858154297, + "loss/hidden": 0.33203125, + "loss/logits": 0.0967300534248352, + "loss/reg": 211216.765625, + "step": 301 + }, + { + "epoch": 0.00755, + "grad_norm": 0.9614930152893066, + "grad_norm_var": 0.015516660641321244, + "learning_rate": 0.0001, + "loss": 210649.875, + "loss/crossentropy": 2.1730730533599854, + "loss/hidden": 0.33984375, + "loss/logits": 0.09347091615200043, + "loss/reg": 210649.4375, + "step": 302 + }, + { + "epoch": 0.007575, + "grad_norm": 1.0022411346435547, + "grad_norm_var": 0.014147341758013413, + "learning_rate": 0.0001, + "loss": 210079.5938, + "loss/crossentropy": 2.476670503616333, + "loss/hidden": 0.376953125, + "loss/logits": 0.11385416984558105, + "loss/reg": 210079.109375, + "step": 303 + }, + { + "epoch": 0.0076, + "grad_norm": 0.8967432975769043, + "grad_norm_var": 0.014591702837682874, + "learning_rate": 0.0001, + "loss": 209458.6562, + "loss/crossentropy": 2.4919116497039795, + "loss/hidden": 0.33984375, + "loss/logits": 0.10090088099241257, + "loss/reg": 209458.21875, + "step": 304 + }, + { + "epoch": 0.007625, + "grad_norm": 1.0569437742233276, + "grad_norm_var": 0.014879512513798866, + "learning_rate": 0.0001, + "loss": 208890.5938, + "loss/crossentropy": 2.506409168243408, + "loss/hidden": 0.34375, + "loss/logits": 0.0949743390083313, + "loss/reg": 208890.15625, + "step": 305 + }, + { + "epoch": 0.00765, + "grad_norm": 0.9816184639930725, + "grad_norm_var": 0.014763490776195794, + "learning_rate": 0.0001, + "loss": 208320.8594, + "loss/crossentropy": 2.578554153442383, + "loss/hidden": 0.337890625, + "loss/logits": 0.10027292370796204, + "loss/reg": 208320.421875, + "step": 306 + }, + { + "epoch": 0.007675, + "grad_norm": 0.8720800876617432, + "grad_norm_var": 0.015324127636003245, + "learning_rate": 0.0001, + "loss": 207765.6406, + "loss/crossentropy": 2.460149049758911, + "loss/hidden": 0.3203125, + "loss/logits": 0.07679739594459534, + "loss/reg": 207765.25, + "step": 307 + }, + { + "epoch": 0.0077, + "grad_norm": 1.0239520072937012, + "grad_norm_var": 0.012529052004580344, + "learning_rate": 0.0001, + "loss": 207164.1094, + "loss/crossentropy": 2.575766086578369, + "loss/hidden": 0.349609375, + "loss/logits": 0.0994313433766365, + "loss/reg": 207163.65625, + "step": 308 + }, + { + "epoch": 0.007725, + "grad_norm": 0.8378457427024841, + "grad_norm_var": 0.007412576724656938, + "learning_rate": 0.0001, + "loss": 206549.0469, + "loss/crossentropy": 2.478156566619873, + "loss/hidden": 0.33984375, + "loss/logits": 0.08868305385112762, + "loss/reg": 206548.625, + "step": 309 + }, + { + "epoch": 0.00775, + "grad_norm": 0.9334253668785095, + "grad_norm_var": 0.006673036559606219, + "learning_rate": 0.0001, + "loss": 205981.6719, + "loss/crossentropy": 2.2035024166107178, + "loss/hidden": 0.326171875, + "loss/logits": 0.09419551491737366, + "loss/reg": 205981.25, + "step": 310 + }, + { + "epoch": 0.007775, + "grad_norm": 0.9910398721694946, + "grad_norm_var": 0.0056647394639551, + "learning_rate": 0.0001, + "loss": 205414.7656, + "loss/crossentropy": 2.25825834274292, + "loss/hidden": 0.375, + "loss/logits": 0.1014915332198143, + "loss/reg": 205414.296875, + "step": 311 + }, + { + "epoch": 0.0078, + "grad_norm": 1.8309675455093384, + "grad_norm_var": 0.05482355491819148, + "learning_rate": 0.0001, + "loss": 204837.3125, + "loss/crossentropy": 2.665398120880127, + "loss/hidden": 0.388671875, + "loss/logits": 0.11154994368553162, + "loss/reg": 204836.8125, + "step": 312 + }, + { + "epoch": 0.007825, + "grad_norm": 1.2233872413635254, + "grad_norm_var": 0.05797169042160041, + "learning_rate": 0.0001, + "loss": 204219.4844, + "loss/crossentropy": 2.435774326324463, + "loss/hidden": 0.36328125, + "loss/logits": 0.09434737265110016, + "loss/reg": 204219.03125, + "step": 313 + }, + { + "epoch": 0.00785, + "grad_norm": 1.1289325952529907, + "grad_norm_var": 0.05840960239376898, + "learning_rate": 0.0001, + "loss": 203649.0625, + "loss/crossentropy": 2.789903163909912, + "loss/hidden": 0.35546875, + "loss/logits": 0.09921029210090637, + "loss/reg": 203648.609375, + "step": 314 + }, + { + "epoch": 0.007875, + "grad_norm": 1.1229325532913208, + "grad_norm_var": 0.05884196868796773, + "learning_rate": 0.0001, + "loss": 203077.5625, + "loss/crossentropy": 2.476925849914551, + "loss/hidden": 0.359375, + "loss/logits": 0.09086183458566666, + "loss/reg": 203077.109375, + "step": 315 + }, + { + "epoch": 0.0079, + "grad_norm": 1.3443208932876587, + "grad_norm_var": 0.06323050957540814, + "learning_rate": 0.0001, + "loss": 202519.1094, + "loss/crossentropy": 2.7508223056793213, + "loss/hidden": 0.37109375, + "loss/logits": 0.09828442335128784, + "loss/reg": 202518.640625, + "step": 316 + }, + { + "epoch": 0.007925, + "grad_norm": 0.928200900554657, + "grad_norm_var": 0.05857714402731625, + "learning_rate": 0.0001, + "loss": 201938.7031, + "loss/crossentropy": 2.7406234741210938, + "loss/hidden": 0.36328125, + "loss/logits": 0.09745270013809204, + "loss/reg": 201938.25, + "step": 317 + }, + { + "epoch": 0.00795, + "grad_norm": 0.7533476948738098, + "grad_norm_var": 0.06432425513706654, + "learning_rate": 0.0001, + "loss": 201374.375, + "loss/crossentropy": 2.1193270683288574, + "loss/hidden": 0.32421875, + "loss/logits": 0.09022863209247589, + "loss/reg": 201373.953125, + "step": 318 + }, + { + "epoch": 0.007975, + "grad_norm": 1.1448767185211182, + "grad_norm_var": 0.06453541031427007, + "learning_rate": 0.0001, + "loss": 200807.0156, + "loss/crossentropy": 2.310652494430542, + "loss/hidden": 0.34375, + "loss/logits": 0.09617799520492554, + "loss/reg": 200806.578125, + "step": 319 + }, + { + "epoch": 0.008, + "grad_norm": 0.9061183333396912, + "grad_norm_var": 0.06432819005303253, + "learning_rate": 0.0001, + "loss": 200243.875, + "loss/crossentropy": 2.229034900665283, + "loss/hidden": 0.33203125, + "loss/logits": 0.08925358951091766, + "loss/reg": 200243.453125, + "step": 320 + }, + { + "epoch": 0.008025, + "grad_norm": 0.8542099595069885, + "grad_norm_var": 0.06718233247158024, + "learning_rate": 0.0001, + "loss": 199683.2188, + "loss/crossentropy": 2.2956204414367676, + "loss/hidden": 0.32421875, + "loss/logits": 0.07856299728155136, + "loss/reg": 199682.8125, + "step": 321 + }, + { + "epoch": 0.00805, + "grad_norm": 1.0101763010025024, + "grad_norm_var": 0.06695454165222821, + "learning_rate": 0.0001, + "loss": 199094.9219, + "loss/crossentropy": 2.689578056335449, + "loss/hidden": 0.3359375, + "loss/logits": 0.09941205382347107, + "loss/reg": 199094.484375, + "step": 322 + }, + { + "epoch": 0.008075, + "grad_norm": 1.1850814819335938, + "grad_norm_var": 0.06537643569359085, + "learning_rate": 0.0001, + "loss": 198532.7031, + "loss/crossentropy": 2.731456756591797, + "loss/hidden": 0.357421875, + "loss/logits": 0.09926258027553558, + "loss/reg": 198532.25, + "step": 323 + }, + { + "epoch": 0.0081, + "grad_norm": 1.215268611907959, + "grad_norm_var": 0.06633188906198194, + "learning_rate": 0.0001, + "loss": 197967.2344, + "loss/crossentropy": 3.0295090675354004, + "loss/hidden": 0.35546875, + "loss/logits": 0.10664241015911102, + "loss/reg": 197966.765625, + "step": 324 + }, + { + "epoch": 0.008125, + "grad_norm": 1.026171326637268, + "grad_norm_var": 0.062263808409530684, + "learning_rate": 0.0001, + "loss": 197410.1719, + "loss/crossentropy": 2.5564486980438232, + "loss/hidden": 0.3671875, + "loss/logits": 0.10696598887443542, + "loss/reg": 197409.703125, + "step": 325 + }, + { + "epoch": 0.00815, + "grad_norm": 0.9427042603492737, + "grad_norm_var": 0.062063225036427014, + "learning_rate": 0.0001, + "loss": 196858.6875, + "loss/crossentropy": 2.5432636737823486, + "loss/hidden": 0.3359375, + "loss/logits": 0.09693843126296997, + "loss/reg": 196858.25, + "step": 326 + }, + { + "epoch": 0.008175, + "grad_norm": 1.0309820175170898, + "grad_norm_var": 0.06158008077064516, + "learning_rate": 0.0001, + "loss": 196308.8594, + "loss/crossentropy": 2.2705764770507812, + "loss/hidden": 0.34375, + "loss/logits": 0.0911949872970581, + "loss/reg": 196308.421875, + "step": 327 + }, + { + "epoch": 0.0082, + "grad_norm": 0.8754981756210327, + "grad_norm_var": 0.025895035422288125, + "learning_rate": 0.0001, + "loss": 195759.7188, + "loss/crossentropy": 2.734731674194336, + "loss/hidden": 0.34375, + "loss/logits": 0.09578876197338104, + "loss/reg": 195759.28125, + "step": 328 + }, + { + "epoch": 0.008225, + "grad_norm": 0.8853642344474792, + "grad_norm_var": 0.024918108806193592, + "learning_rate": 0.0001, + "loss": 195213.2344, + "loss/crossentropy": 2.5435054302215576, + "loss/hidden": 0.3671875, + "loss/logits": 0.10717779397964478, + "loss/reg": 195212.765625, + "step": 329 + }, + { + "epoch": 0.00825, + "grad_norm": 0.7243690490722656, + "grad_norm_var": 0.029386820407246182, + "learning_rate": 0.0001, + "loss": 194668.5938, + "loss/crossentropy": 2.6787264347076416, + "loss/hidden": 0.330078125, + "loss/logits": 0.0956207662820816, + "loss/reg": 194668.171875, + "step": 330 + }, + { + "epoch": 0.008275, + "grad_norm": 0.8909747004508972, + "grad_norm_var": 0.028850196653393077, + "learning_rate": 0.0001, + "loss": 194133.2031, + "loss/crossentropy": 2.426013231277466, + "loss/hidden": 0.345703125, + "loss/logits": 0.09088262915611267, + "loss/reg": 194132.765625, + "step": 331 + }, + { + "epoch": 0.0083, + "grad_norm": 0.932327926158905, + "grad_norm_var": 0.019575128143828616, + "learning_rate": 0.0001, + "loss": 193583.8594, + "loss/crossentropy": 2.385460138320923, + "loss/hidden": 0.33203125, + "loss/logits": 0.08535876125097275, + "loss/reg": 193583.4375, + "step": 332 + }, + { + "epoch": 0.008325, + "grad_norm": 1.1217122077941895, + "grad_norm_var": 0.021182687079661245, + "learning_rate": 0.0001, + "loss": 193025.6562, + "loss/crossentropy": 2.676365613937378, + "loss/hidden": 0.369140625, + "loss/logits": 0.10710591077804565, + "loss/reg": 193025.1875, + "step": 333 + }, + { + "epoch": 0.00835, + "grad_norm": 0.92121422290802, + "grad_norm_var": 0.018123849895345677, + "learning_rate": 0.0001, + "loss": 192477.2656, + "loss/crossentropy": 2.4655494689941406, + "loss/hidden": 0.357421875, + "loss/logits": 0.09714263677597046, + "loss/reg": 192476.8125, + "step": 334 + }, + { + "epoch": 0.008375, + "grad_norm": 1.0069211721420288, + "grad_norm_var": 0.01626569057576172, + "learning_rate": 0.0001, + "loss": 191934.8438, + "loss/crossentropy": 2.6041626930236816, + "loss/hidden": 0.357421875, + "loss/logits": 0.08919402956962585, + "loss/reg": 191934.390625, + "step": 335 + }, + { + "epoch": 0.0084, + "grad_norm": 0.9137717485427856, + "grad_norm_var": 0.01620358310926281, + "learning_rate": 0.0001, + "loss": 191393.7969, + "loss/crossentropy": 2.503648042678833, + "loss/hidden": 0.3515625, + "loss/logits": 0.09979408979415894, + "loss/reg": 191393.34375, + "step": 336 + }, + { + "epoch": 0.008425, + "grad_norm": 1.6518378257751465, + "grad_norm_var": 0.043541088119168056, + "learning_rate": 0.0001, + "loss": 190852.2188, + "loss/crossentropy": 2.46239972114563, + "loss/hidden": 0.412109375, + "loss/logits": 0.13722866773605347, + "loss/reg": 190851.671875, + "step": 337 + }, + { + "epoch": 0.00845, + "grad_norm": 1.1564044952392578, + "grad_norm_var": 0.04466845511115949, + "learning_rate": 0.0001, + "loss": 190309.7969, + "loss/crossentropy": 2.662630796432495, + "loss/hidden": 0.37109375, + "loss/logits": 0.10392837226390839, + "loss/reg": 190309.328125, + "step": 338 + }, + { + "epoch": 0.008475, + "grad_norm": 0.9882319569587708, + "grad_norm_var": 0.04302094135640619, + "learning_rate": 0.0001, + "loss": 189746.9375, + "loss/crossentropy": 2.6329383850097656, + "loss/hidden": 0.3671875, + "loss/logits": 0.0980134829878807, + "loss/reg": 189746.46875, + "step": 339 + }, + { + "epoch": 0.0085, + "grad_norm": 0.9910109043121338, + "grad_norm_var": 0.04025769198177695, + "learning_rate": 0.0001, + "loss": 189200.5781, + "loss/crossentropy": 2.8005034923553467, + "loss/hidden": 0.36328125, + "loss/logits": 0.10183151066303253, + "loss/reg": 189200.109375, + "step": 340 + }, + { + "epoch": 0.008525, + "grad_norm": 0.9753293395042419, + "grad_norm_var": 0.040267042628729106, + "learning_rate": 0.0001, + "loss": 188662.3906, + "loss/crossentropy": 2.3686320781707764, + "loss/hidden": 0.328125, + "loss/logits": 0.08583434671163559, + "loss/reg": 188661.984375, + "step": 341 + }, + { + "epoch": 0.00855, + "grad_norm": 0.961402952671051, + "grad_norm_var": 0.04014469928526877, + "learning_rate": 0.0001, + "loss": 188133.3438, + "loss/crossentropy": 2.5603463649749756, + "loss/hidden": 0.3359375, + "loss/logits": 0.08635769784450531, + "loss/reg": 188132.921875, + "step": 342 + }, + { + "epoch": 0.008575, + "grad_norm": 1.043996810913086, + "grad_norm_var": 0.040206082506468434, + "learning_rate": 0.0001, + "loss": 187566.3281, + "loss/crossentropy": 2.4850785732269287, + "loss/hidden": 0.326171875, + "loss/logits": 0.0760430246591568, + "loss/reg": 187565.921875, + "step": 343 + }, + { + "epoch": 0.0086, + "grad_norm": 0.9029485583305359, + "grad_norm_var": 0.03978826045033466, + "learning_rate": 0.0001, + "loss": 187032.0938, + "loss/crossentropy": 2.2597060203552246, + "loss/hidden": 0.34375, + "loss/logits": 0.10078242421150208, + "loss/reg": 187031.65625, + "step": 344 + }, + { + "epoch": 0.008625, + "grad_norm": 1.0176180601119995, + "grad_norm_var": 0.03878524009466933, + "learning_rate": 0.0001, + "loss": 186498.3906, + "loss/crossentropy": 2.4756250381469727, + "loss/hidden": 0.359375, + "loss/logits": 0.09760268777608871, + "loss/reg": 186497.9375, + "step": 345 + }, + { + "epoch": 0.00865, + "grad_norm": 0.9344642162322998, + "grad_norm_var": 0.03347254015602245, + "learning_rate": 0.0001, + "loss": 185958.5469, + "loss/crossentropy": 2.703340768814087, + "loss/hidden": 0.349609375, + "loss/logits": 0.08978617191314697, + "loss/reg": 185958.109375, + "step": 346 + }, + { + "epoch": 0.008675, + "grad_norm": 0.8480978012084961, + "grad_norm_var": 0.03435728661884322, + "learning_rate": 0.0001, + "loss": 185421.1875, + "loss/crossentropy": 2.5134730339050293, + "loss/hidden": 0.333984375, + "loss/logits": 0.08980560302734375, + "loss/reg": 185420.765625, + "step": 347 + }, + { + "epoch": 0.0087, + "grad_norm": 0.8679680228233337, + "grad_norm_var": 0.03539387937272805, + "learning_rate": 0.0001, + "loss": 184879.4688, + "loss/crossentropy": 2.4469387531280518, + "loss/hidden": 0.349609375, + "loss/logits": 0.0879959762096405, + "loss/reg": 184879.03125, + "step": 348 + }, + { + "epoch": 0.008725, + "grad_norm": 1.590740442276001, + "grad_norm_var": 0.05557060158014301, + "learning_rate": 0.0001, + "loss": 184311.4062, + "loss/crossentropy": 2.9886884689331055, + "loss/hidden": 0.36328125, + "loss/logits": 0.10285791009664536, + "loss/reg": 184310.9375, + "step": 349 + }, + { + "epoch": 0.00875, + "grad_norm": 1.0205501317977905, + "grad_norm_var": 0.05450480041651103, + "learning_rate": 0.0001, + "loss": 183769.2344, + "loss/crossentropy": 2.7456204891204834, + "loss/hidden": 0.37109375, + "loss/logits": 0.10173928737640381, + "loss/reg": 183768.765625, + "step": 350 + }, + { + "epoch": 0.008775, + "grad_norm": 0.8315656781196594, + "grad_norm_var": 0.05753804406173935, + "learning_rate": 0.0001, + "loss": 183234.9531, + "loss/crossentropy": 2.419807195663452, + "loss/hidden": 0.337890625, + "loss/logits": 0.09012450277805328, + "loss/reg": 183234.53125, + "step": 351 + }, + { + "epoch": 0.0088, + "grad_norm": 1.123649001121521, + "grad_norm_var": 0.056660911801374364, + "learning_rate": 0.0001, + "loss": 182707.2656, + "loss/crossentropy": 2.7368407249450684, + "loss/hidden": 0.3515625, + "loss/logits": 0.09511645883321762, + "loss/reg": 182706.8125, + "step": 352 + }, + { + "epoch": 0.008825, + "grad_norm": 0.9307337999343872, + "grad_norm_var": 0.03193119714682216, + "learning_rate": 0.0001, + "loss": 182174.3906, + "loss/crossentropy": 2.529085397720337, + "loss/hidden": 0.3359375, + "loss/logits": 0.07927797734737396, + "loss/reg": 182173.96875, + "step": 353 + }, + { + "epoch": 0.00885, + "grad_norm": 0.8824712634086609, + "grad_norm_var": 0.031330232119417885, + "learning_rate": 0.0001, + "loss": 181640.3438, + "loss/crossentropy": 2.314749240875244, + "loss/hidden": 0.330078125, + "loss/logits": 0.09226064383983612, + "loss/reg": 181639.921875, + "step": 354 + }, + { + "epoch": 0.008875, + "grad_norm": 0.8409653902053833, + "grad_norm_var": 0.032807275137196494, + "learning_rate": 0.0001, + "loss": 181106.4531, + "loss/crossentropy": 2.33561635017395, + "loss/hidden": 0.34375, + "loss/logits": 0.09517761319875717, + "loss/reg": 181106.015625, + "step": 355 + }, + { + "epoch": 0.0089, + "grad_norm": 0.9306114315986633, + "grad_norm_var": 0.03298864164169767, + "learning_rate": 0.0001, + "loss": 180575.0625, + "loss/crossentropy": 2.4940664768218994, + "loss/hidden": 0.3359375, + "loss/logits": 0.09492896497249603, + "loss/reg": 180574.625, + "step": 356 + }, + { + "epoch": 0.008925, + "grad_norm": 0.7635557055473328, + "grad_norm_var": 0.035964318367633044, + "learning_rate": 0.0001, + "loss": 180047.5469, + "loss/crossentropy": 2.4428975582122803, + "loss/hidden": 0.3359375, + "loss/logits": 0.09701186418533325, + "loss/reg": 180047.109375, + "step": 357 + }, + { + "epoch": 0.00895, + "grad_norm": 1.1170213222503662, + "grad_norm_var": 0.037336672350648095, + "learning_rate": 0.0001, + "loss": 179509.2656, + "loss/crossentropy": 2.3574512004852295, + "loss/hidden": 0.33203125, + "loss/logits": 0.11497388780117035, + "loss/reg": 179508.8125, + "step": 358 + }, + { + "epoch": 0.008975, + "grad_norm": 0.7869230508804321, + "grad_norm_var": 0.03920273191698739, + "learning_rate": 0.0001, + "loss": 178985.1719, + "loss/crossentropy": 2.291933059692383, + "loss/hidden": 0.328125, + "loss/logits": 0.08891676366329193, + "loss/reg": 178984.75, + "step": 359 + }, + { + "epoch": 0.009, + "grad_norm": 0.9132789969444275, + "grad_norm_var": 0.039128246987934705, + "learning_rate": 0.0001, + "loss": 178473.2969, + "loss/crossentropy": 2.655181884765625, + "loss/hidden": 0.359375, + "loss/logits": 0.11087474972009659, + "loss/reg": 178472.828125, + "step": 360 + }, + { + "epoch": 0.009025, + "grad_norm": 0.8164347410202026, + "grad_norm_var": 0.04017976554790762, + "learning_rate": 0.0001, + "loss": 177953.1406, + "loss/crossentropy": 2.362248659133911, + "loss/hidden": 0.3359375, + "loss/logits": 0.10053600370883942, + "loss/reg": 177952.703125, + "step": 361 + }, + { + "epoch": 0.00905, + "grad_norm": 1.0977365970611572, + "grad_norm_var": 0.041508992287856414, + "learning_rate": 0.0001, + "loss": 177436.2656, + "loss/crossentropy": 2.665569305419922, + "loss/hidden": 0.357421875, + "loss/logits": 0.09883129596710205, + "loss/reg": 177435.8125, + "step": 362 + }, + { + "epoch": 0.009075, + "grad_norm": 1.1380103826522827, + "grad_norm_var": 0.04243092831887196, + "learning_rate": 0.0001, + "loss": 176886.3906, + "loss/crossentropy": 2.4346649646759033, + "loss/hidden": 0.361328125, + "loss/logits": 0.09544631838798523, + "loss/reg": 176885.9375, + "step": 363 + }, + { + "epoch": 0.0091, + "grad_norm": 1.7445480823516846, + "grad_norm_var": 0.07756442404939948, + "learning_rate": 0.0001, + "loss": 176361.0625, + "loss/crossentropy": 2.4675376415252686, + "loss/hidden": 0.36328125, + "loss/logits": 0.10034389793872833, + "loss/reg": 176360.59375, + "step": 364 + }, + { + "epoch": 0.009125, + "grad_norm": 0.9684613347053528, + "grad_norm_var": 0.055494477448874056, + "learning_rate": 0.0001, + "loss": 175839.0781, + "loss/crossentropy": 2.854846239089966, + "loss/hidden": 0.37109375, + "loss/logits": 0.12111840397119522, + "loss/reg": 175838.578125, + "step": 365 + }, + { + "epoch": 0.00915, + "grad_norm": 0.9467345476150513, + "grad_norm_var": 0.055575263489238046, + "learning_rate": 0.0001, + "loss": 175320.0, + "loss/crossentropy": 2.522322654724121, + "loss/hidden": 0.333984375, + "loss/logits": 0.08145372569561005, + "loss/reg": 175319.578125, + "step": 366 + }, + { + "epoch": 0.009175, + "grad_norm": 1.475239634513855, + "grad_norm_var": 0.06791183377343075, + "learning_rate": 0.0001, + "loss": 174809.4688, + "loss/crossentropy": 2.6860809326171875, + "loss/hidden": 0.375, + "loss/logits": 0.10367526113986969, + "loss/reg": 174808.984375, + "step": 367 + }, + { + "epoch": 0.0092, + "grad_norm": 0.9959954619407654, + "grad_norm_var": 0.067332491810451, + "learning_rate": 0.0001, + "loss": 174250.1875, + "loss/crossentropy": 2.5077319145202637, + "loss/hidden": 0.341796875, + "loss/logits": 0.09400855004787445, + "loss/reg": 174249.75, + "step": 368 + }, + { + "epoch": 0.009225, + "grad_norm": 0.776086151599884, + "grad_norm_var": 0.07070489089481526, + "learning_rate": 0.0001, + "loss": 173736.8906, + "loss/crossentropy": 2.2909703254699707, + "loss/hidden": 0.330078125, + "loss/logits": 0.08630555123090744, + "loss/reg": 173736.46875, + "step": 369 + }, + { + "epoch": 0.00925, + "grad_norm": 0.9044707417488098, + "grad_norm_var": 0.07035481726732086, + "learning_rate": 0.0001, + "loss": 173222.7188, + "loss/crossentropy": 2.2393484115600586, + "loss/hidden": 0.34375, + "loss/logits": 0.09456537663936615, + "loss/reg": 173222.28125, + "step": 370 + }, + { + "epoch": 0.009275, + "grad_norm": 0.9271029233932495, + "grad_norm_var": 0.0688369333629584, + "learning_rate": 0.0001, + "loss": 172693.1875, + "loss/crossentropy": 2.1928179264068604, + "loss/hidden": 0.357421875, + "loss/logits": 0.09735684096813202, + "loss/reg": 172692.734375, + "step": 371 + }, + { + "epoch": 0.0093, + "grad_norm": 1.1868821382522583, + "grad_norm_var": 0.06992523411455233, + "learning_rate": 0.0001, + "loss": 172176.3438, + "loss/crossentropy": 2.3856608867645264, + "loss/hidden": 0.3359375, + "loss/logits": 0.08821407705545425, + "loss/reg": 172175.921875, + "step": 372 + }, + { + "epoch": 0.009325, + "grad_norm": 0.9966070652008057, + "grad_norm_var": 0.06488801284089368, + "learning_rate": 0.0001, + "loss": 171664.9531, + "loss/crossentropy": 2.5923867225646973, + "loss/hidden": 0.322265625, + "loss/logits": 0.08066350221633911, + "loss/reg": 171664.546875, + "step": 373 + }, + { + "epoch": 0.00935, + "grad_norm": 1.5168896913528442, + "grad_norm_var": 0.07848294009414543, + "learning_rate": 0.0001, + "loss": 171161.125, + "loss/crossentropy": 2.563632011413574, + "loss/hidden": 0.37109375, + "loss/logits": 0.09065365791320801, + "loss/reg": 171160.65625, + "step": 374 + }, + { + "epoch": 0.009375, + "grad_norm": 1.0048420429229736, + "grad_norm_var": 0.07309627264863429, + "learning_rate": 0.0001, + "loss": 170612.1406, + "loss/crossentropy": 2.9609603881835938, + "loss/hidden": 0.3515625, + "loss/logits": 0.07979125529527664, + "loss/reg": 170611.703125, + "step": 375 + }, + { + "epoch": 0.0094, + "grad_norm": 1.1617481708526611, + "grad_norm_var": 0.07116372542663024, + "learning_rate": 0.0001, + "loss": 170073.0312, + "loss/crossentropy": 2.7246451377868652, + "loss/hidden": 0.357421875, + "loss/logits": 0.08837760984897614, + "loss/reg": 170072.578125, + "step": 376 + }, + { + "epoch": 0.009425, + "grad_norm": 2.0842902660369873, + "grad_norm_var": 0.12308328174345645, + "learning_rate": 0.0001, + "loss": 169564.4531, + "loss/crossentropy": 3.3340728282928467, + "loss/hidden": 0.392578125, + "loss/logits": 0.09503665566444397, + "loss/reg": 169563.96875, + "step": 377 + }, + { + "epoch": 0.00945, + "grad_norm": 1.0924917459487915, + "grad_norm_var": 0.123144523949518, + "learning_rate": 0.0001, + "loss": 169061.9688, + "loss/crossentropy": 2.3122220039367676, + "loss/hidden": 0.365234375, + "loss/logits": 0.09052115678787231, + "loss/reg": 169061.515625, + "step": 378 + }, + { + "epoch": 0.009475, + "grad_norm": 1.092575192451477, + "grad_norm_var": 0.12354321705638256, + "learning_rate": 0.0001, + "loss": 168547.3281, + "loss/crossentropy": 2.7103536128997803, + "loss/hidden": 0.373046875, + "loss/logits": 0.08894005417823792, + "loss/reg": 168546.859375, + "step": 379 + }, + { + "epoch": 0.0095, + "grad_norm": 0.9879716634750366, + "grad_norm_var": 0.10233713040041952, + "learning_rate": 0.0001, + "loss": 168007.6094, + "loss/crossentropy": 2.4375011920928955, + "loss/hidden": 0.333984375, + "loss/logits": 0.0795358344912529, + "loss/reg": 168007.203125, + "step": 380 + }, + { + "epoch": 0.009525, + "grad_norm": 1.2313728332519531, + "grad_norm_var": 0.10091046048275355, + "learning_rate": 0.0001, + "loss": 167493.8594, + "loss/crossentropy": 2.495237350463867, + "loss/hidden": 0.3515625, + "loss/logits": 0.09075236320495605, + "loss/reg": 167493.421875, + "step": 381 + }, + { + "epoch": 0.00955, + "grad_norm": 1.0618592500686646, + "grad_norm_var": 0.09863663992902129, + "learning_rate": 0.0001, + "loss": 166984.2812, + "loss/crossentropy": 2.63665771484375, + "loss/hidden": 0.361328125, + "loss/logits": 0.0936121791601181, + "loss/reg": 166983.828125, + "step": 382 + }, + { + "epoch": 0.009575, + "grad_norm": 0.8288941979408264, + "grad_norm_var": 0.09723720292607822, + "learning_rate": 0.0001, + "loss": 166477.375, + "loss/crossentropy": 2.3199474811553955, + "loss/hidden": 0.341796875, + "loss/logits": 0.09022307395935059, + "loss/reg": 166476.9375, + "step": 383 + }, + { + "epoch": 0.0096, + "grad_norm": 0.9028894305229187, + "grad_norm_var": 0.09926415799220058, + "learning_rate": 0.0001, + "loss": 165976.7656, + "loss/crossentropy": 2.2981746196746826, + "loss/hidden": 0.357421875, + "loss/logits": 0.09859772026538849, + "loss/reg": 165976.3125, + "step": 384 + }, + { + "epoch": 0.009625, + "grad_norm": 0.8596852421760559, + "grad_norm_var": 0.09598108104973425, + "learning_rate": 0.0001, + "loss": 165479.5469, + "loss/crossentropy": 2.7811458110809326, + "loss/hidden": 0.39453125, + "loss/logits": 0.11124817281961441, + "loss/reg": 165479.046875, + "step": 385 + }, + { + "epoch": 0.00965, + "grad_norm": 0.9104531407356262, + "grad_norm_var": 0.09581535998637468, + "learning_rate": 0.0001, + "loss": 164984.2969, + "loss/crossentropy": 2.1067349910736084, + "loss/hidden": 0.35546875, + "loss/logits": 0.09003035724163055, + "loss/reg": 164983.84375, + "step": 386 + }, + { + "epoch": 0.009675, + "grad_norm": 0.8352228999137878, + "grad_norm_var": 0.09864986513955144, + "learning_rate": 0.0001, + "loss": 164489.4844, + "loss/crossentropy": 2.380166530609131, + "loss/hidden": 0.337890625, + "loss/logits": 0.09342695772647858, + "loss/reg": 164489.046875, + "step": 387 + }, + { + "epoch": 0.0097, + "grad_norm": 0.7623141407966614, + "grad_norm_var": 0.10554492279767776, + "learning_rate": 0.0001, + "loss": 164000.4062, + "loss/crossentropy": 2.1462862491607666, + "loss/hidden": 0.330078125, + "loss/logits": 0.08137300610542297, + "loss/reg": 164000.0, + "step": 388 + }, + { + "epoch": 0.009725, + "grad_norm": 0.8828011155128479, + "grad_norm_var": 0.10766734548416322, + "learning_rate": 0.0001, + "loss": 163505.2031, + "loss/crossentropy": 2.5420355796813965, + "loss/hidden": 0.333984375, + "loss/logits": 0.08073568344116211, + "loss/reg": 163504.78125, + "step": 389 + }, + { + "epoch": 0.00975, + "grad_norm": 1.0450934171676636, + "grad_norm_var": 0.09384582994403522, + "learning_rate": 0.0001, + "loss": 163012.25, + "loss/crossentropy": 2.5532455444335938, + "loss/hidden": 0.35546875, + "loss/logits": 0.0926065444946289, + "loss/reg": 163011.796875, + "step": 390 + }, + { + "epoch": 0.009775, + "grad_norm": 0.9712976813316345, + "grad_norm_var": 0.09410261616215033, + "learning_rate": 0.0001, + "loss": 162519.625, + "loss/crossentropy": 2.5090253353118896, + "loss/hidden": 0.37109375, + "loss/logits": 0.09214738011360168, + "loss/reg": 162519.15625, + "step": 391 + }, + { + "epoch": 0.0098, + "grad_norm": 0.9004305005073547, + "grad_norm_var": 0.09428308254190747, + "learning_rate": 0.0001, + "loss": 162016.3281, + "loss/crossentropy": 2.5954947471618652, + "loss/hidden": 0.365234375, + "loss/logits": 0.09887327998876572, + "loss/reg": 162015.859375, + "step": 392 + }, + { + "epoch": 0.009825, + "grad_norm": 0.979634165763855, + "grad_norm_var": 0.014986441797913471, + "learning_rate": 0.0001, + "loss": 161516.9375, + "loss/crossentropy": 2.4483871459960938, + "loss/hidden": 0.369140625, + "loss/logits": 0.09639716148376465, + "loss/reg": 161516.46875, + "step": 393 + }, + { + "epoch": 0.00985, + "grad_norm": 0.8064605593681335, + "grad_norm_var": 0.015011118141572648, + "learning_rate": 0.0001, + "loss": 161025.2812, + "loss/crossentropy": 2.655043363571167, + "loss/hidden": 0.357421875, + "loss/logits": 0.10133032500743866, + "loss/reg": 161024.828125, + "step": 394 + }, + { + "epoch": 0.009875, + "grad_norm": 0.9056344032287598, + "grad_norm_var": 0.013421822749984299, + "learning_rate": 0.0001, + "loss": 160534.5938, + "loss/crossentropy": 2.545508623123169, + "loss/hidden": 0.353515625, + "loss/logits": 0.0969725251197815, + "loss/reg": 160534.140625, + "step": 395 + }, + { + "epoch": 0.0099, + "grad_norm": 1.2923853397369385, + "grad_norm_var": 0.021586792538579876, + "learning_rate": 0.0001, + "loss": 160045.3281, + "loss/crossentropy": 2.465571880340576, + "loss/hidden": 0.3828125, + "loss/logits": 0.10686793923377991, + "loss/reg": 160044.84375, + "step": 396 + }, + { + "epoch": 0.009925, + "grad_norm": 1.1261005401611328, + "grad_norm_var": 0.018309319694934344, + "learning_rate": 0.0001, + "loss": 159557.3281, + "loss/crossentropy": 2.82228684425354, + "loss/hidden": 0.365234375, + "loss/logits": 0.10036415606737137, + "loss/reg": 159556.859375, + "step": 397 + }, + { + "epoch": 0.00995, + "grad_norm": 1.027556300163269, + "grad_norm_var": 0.01783441826447003, + "learning_rate": 0.0001, + "loss": 159073.2969, + "loss/crossentropy": 2.621041774749756, + "loss/hidden": 0.357421875, + "loss/logits": 0.09349606931209564, + "loss/reg": 159072.84375, + "step": 398 + }, + { + "epoch": 0.009975, + "grad_norm": 1.0642552375793457, + "grad_norm_var": 0.017816103035810337, + "learning_rate": 0.0001, + "loss": 158590.3594, + "loss/crossentropy": 2.474287748336792, + "loss/hidden": 0.365234375, + "loss/logits": 0.10626979172229767, + "loss/reg": 158589.890625, + "step": 399 + }, + { + "epoch": 0.01, + "grad_norm": 1.4468071460723877, + "grad_norm_var": 0.03256261720358751, + "learning_rate": 0.0001, + "loss": 158087.3125, + "loss/crossentropy": 2.40484619140625, + "loss/hidden": 0.376953125, + "loss/logits": 0.10928124189376831, + "loss/reg": 158086.828125, + "step": 400 + }, + { + "epoch": 0.010025, + "grad_norm": 1.073398232460022, + "grad_norm_var": 0.031746375115213714, + "learning_rate": 0.0001, + "loss": 157606.4375, + "loss/crossentropy": 2.74391770362854, + "loss/hidden": 0.3359375, + "loss/logits": 0.0976434051990509, + "loss/reg": 157606.0, + "step": 401 + }, + { + "epoch": 0.01005, + "grad_norm": 0.9355063438415527, + "grad_norm_var": 0.031480248395230365, + "learning_rate": 0.0001, + "loss": 157125.625, + "loss/crossentropy": 2.8919923305511475, + "loss/hidden": 0.34375, + "loss/logits": 0.10159678757190704, + "loss/reg": 157125.171875, + "step": 402 + }, + { + "epoch": 0.010075, + "grad_norm": 1.0578041076660156, + "grad_norm_var": 0.029584649326750645, + "learning_rate": 0.0001, + "loss": 156649.6094, + "loss/crossentropy": 2.58225679397583, + "loss/hidden": 0.34765625, + "loss/logits": 0.10078255087137222, + "loss/reg": 156649.15625, + "step": 403 + }, + { + "epoch": 0.0101, + "grad_norm": 1.1878315210342407, + "grad_norm_var": 0.026432017201673298, + "learning_rate": 0.0001, + "loss": 156176.1406, + "loss/crossentropy": 2.4818239212036133, + "loss/hidden": 0.37109375, + "loss/logits": 0.09668318927288055, + "loss/reg": 156175.671875, + "step": 404 + }, + { + "epoch": 0.010125, + "grad_norm": 0.9575701951980591, + "grad_norm_var": 0.02517501771146251, + "learning_rate": 0.0001, + "loss": 155698.6094, + "loss/crossentropy": 2.7240066528320312, + "loss/hidden": 0.373046875, + "loss/logits": 0.09359210729598999, + "loss/reg": 155698.140625, + "step": 405 + }, + { + "epoch": 0.01015, + "grad_norm": 1.1337554454803467, + "grad_norm_var": 0.025624751561605173, + "learning_rate": 0.0001, + "loss": 155223.0469, + "loss/crossentropy": 2.814995050430298, + "loss/hidden": 0.361328125, + "loss/logits": 0.07844653725624084, + "loss/reg": 155222.609375, + "step": 406 + }, + { + "epoch": 0.010175, + "grad_norm": 1.8449225425720215, + "grad_norm_var": 0.06367491192329258, + "learning_rate": 0.0001, + "loss": 154748.3906, + "loss/crossentropy": 2.988466262817383, + "loss/hidden": 0.451171875, + "loss/logits": 0.14243215322494507, + "loss/reg": 154747.796875, + "step": 407 + }, + { + "epoch": 0.0102, + "grad_norm": 1.2929456233978271, + "grad_norm_var": 0.062401523680452785, + "learning_rate": 0.0001, + "loss": 154275.25, + "loss/crossentropy": 2.7575523853302, + "loss/hidden": 0.369140625, + "loss/logits": 0.08892758190631866, + "loss/reg": 154274.796875, + "step": 408 + }, + { + "epoch": 0.010225, + "grad_norm": 1.2019063234329224, + "grad_norm_var": 0.060935676088661225, + "learning_rate": 0.0001, + "loss": 153802.2656, + "loss/crossentropy": 2.486464023590088, + "loss/hidden": 0.37890625, + "loss/logits": 0.09872272610664368, + "loss/reg": 153801.78125, + "step": 409 + }, + { + "epoch": 0.01025, + "grad_norm": 1.1344937086105347, + "grad_norm_var": 0.05275884218374903, + "learning_rate": 0.0001, + "loss": 153332.4062, + "loss/crossentropy": 2.8986072540283203, + "loss/hidden": 0.376953125, + "loss/logits": 0.11386501789093018, + "loss/reg": 153331.921875, + "step": 410 + }, + { + "epoch": 0.010275, + "grad_norm": 1.3534355163574219, + "grad_norm_var": 0.049645825083297034, + "learning_rate": 0.0001, + "loss": 152866.0156, + "loss/crossentropy": 2.475773572921753, + "loss/hidden": 0.390625, + "loss/logits": 0.09350752830505371, + "loss/reg": 152865.53125, + "step": 411 + }, + { + "epoch": 0.0103, + "grad_norm": 0.7451173663139343, + "grad_norm_var": 0.06130727780234874, + "learning_rate": 0.0001, + "loss": 152397.2031, + "loss/crossentropy": 2.3074593544006348, + "loss/hidden": 0.3359375, + "loss/logits": 0.07472251355648041, + "loss/reg": 152396.796875, + "step": 412 + }, + { + "epoch": 0.010325, + "grad_norm": 1.9421056509017944, + "grad_norm_var": 0.09907635194970485, + "learning_rate": 0.0001, + "loss": 151888.7656, + "loss/crossentropy": 2.791627883911133, + "loss/hidden": 0.375, + "loss/logits": 0.08407286554574966, + "loss/reg": 151888.3125, + "step": 413 + }, + { + "epoch": 0.01035, + "grad_norm": 1.2029465436935425, + "grad_norm_var": 0.09667484496340699, + "learning_rate": 0.0001, + "loss": 151422.8594, + "loss/crossentropy": 2.326814651489258, + "loss/hidden": 0.373046875, + "loss/logits": 0.08045229315757751, + "loss/reg": 151422.40625, + "step": 414 + }, + { + "epoch": 0.010375, + "grad_norm": 0.9819397330284119, + "grad_norm_var": 0.09884528804566108, + "learning_rate": 0.0001, + "loss": 150928.8125, + "loss/crossentropy": 2.497159242630005, + "loss/hidden": 0.375, + "loss/logits": 0.09636622667312622, + "loss/reg": 150928.34375, + "step": 415 + }, + { + "epoch": 0.0104, + "grad_norm": 0.9080580472946167, + "grad_norm_var": 0.10057013774897232, + "learning_rate": 0.0001, + "loss": 150443.7188, + "loss/crossentropy": 2.4876914024353027, + "loss/hidden": 0.361328125, + "loss/logits": 0.08849430084228516, + "loss/reg": 150443.265625, + "step": 416 + }, + { + "epoch": 0.010425, + "grad_norm": 0.7911828756332397, + "grad_norm_var": 0.10973268347362262, + "learning_rate": 0.0001, + "loss": 149973.9688, + "loss/crossentropy": 2.4696006774902344, + "loss/hidden": 0.35546875, + "loss/logits": 0.08970560133457184, + "loss/reg": 149973.53125, + "step": 417 + }, + { + "epoch": 0.01045, + "grad_norm": 1.0096138715744019, + "grad_norm_var": 0.10778883463160321, + "learning_rate": 0.0001, + "loss": 149510.4219, + "loss/crossentropy": 2.434112787246704, + "loss/hidden": 0.361328125, + "loss/logits": 0.09028290212154388, + "loss/reg": 149509.96875, + "step": 418 + }, + { + "epoch": 0.010475, + "grad_norm": 1.0627332925796509, + "grad_norm_var": 0.10771556252531594, + "learning_rate": 0.0001, + "loss": 149044.7969, + "loss/crossentropy": 2.982994794845581, + "loss/hidden": 0.37890625, + "loss/logits": 0.0978410542011261, + "loss/reg": 149044.3125, + "step": 419 + }, + { + "epoch": 0.0105, + "grad_norm": 1.0105903148651123, + "grad_norm_var": 0.10930270282818669, + "learning_rate": 0.0001, + "loss": 148578.8438, + "loss/crossentropy": 2.3700225353240967, + "loss/hidden": 0.349609375, + "loss/logits": 0.09978696703910828, + "loss/reg": 148578.390625, + "step": 420 + }, + { + "epoch": 0.010525, + "grad_norm": 0.9768433570861816, + "grad_norm_var": 0.1088035849174147, + "learning_rate": 0.0001, + "loss": 148120.1875, + "loss/crossentropy": 2.602926015853882, + "loss/hidden": 0.369140625, + "loss/logits": 0.10290530323982239, + "loss/reg": 148119.71875, + "step": 421 + }, + { + "epoch": 0.01055, + "grad_norm": 0.9353519082069397, + "grad_norm_var": 0.11201198441522882, + "learning_rate": 0.0001, + "loss": 147660.4844, + "loss/crossentropy": 2.577287435531616, + "loss/hidden": 0.341796875, + "loss/logits": 0.0870780348777771, + "loss/reg": 147660.0625, + "step": 422 + }, + { + "epoch": 0.010575, + "grad_norm": 1.143856167793274, + "grad_norm_var": 0.07773815687167945, + "learning_rate": 0.0001, + "loss": 147202.5312, + "loss/crossentropy": 2.737713575363159, + "loss/hidden": 0.392578125, + "loss/logits": 0.1179521381855011, + "loss/reg": 147202.015625, + "step": 423 + }, + { + "epoch": 0.0106, + "grad_norm": 0.9985752105712891, + "grad_norm_var": 0.07580946258821748, + "learning_rate": 0.0001, + "loss": 146743.3125, + "loss/crossentropy": 2.3523805141448975, + "loss/hidden": 0.357421875, + "loss/logits": 0.09739204496145248, + "loss/reg": 146742.859375, + "step": 424 + }, + { + "epoch": 0.010625, + "grad_norm": 0.9732621908187866, + "grad_norm_var": 0.07558668643125382, + "learning_rate": 0.0001, + "loss": 146291.2812, + "loss/crossentropy": 2.7479147911071777, + "loss/hidden": 0.37109375, + "loss/logits": 0.10617141425609589, + "loss/reg": 146290.796875, + "step": 425 + }, + { + "epoch": 0.01065, + "grad_norm": 0.9876713752746582, + "grad_norm_var": 0.07573274274057457, + "learning_rate": 0.0001, + "loss": 145810.1719, + "loss/crossentropy": 2.486886501312256, + "loss/hidden": 0.35546875, + "loss/logits": 0.09212806820869446, + "loss/reg": 145809.71875, + "step": 426 + }, + { + "epoch": 0.010675, + "grad_norm": 1.0289918184280396, + "grad_norm_var": 0.06978905016892202, + "learning_rate": 0.0001, + "loss": 145358.7812, + "loss/crossentropy": 2.5645861625671387, + "loss/hidden": 0.37109375, + "loss/logits": 0.09653377532958984, + "loss/reg": 145358.3125, + "step": 427 + }, + { + "epoch": 0.0107, + "grad_norm": 1.1155734062194824, + "grad_norm_var": 0.06361928522481192, + "learning_rate": 0.0001, + "loss": 144907.2188, + "loss/crossentropy": 2.862295150756836, + "loss/hidden": 0.37109375, + "loss/logits": 0.08550693094730377, + "loss/reg": 144906.765625, + "step": 428 + }, + { + "epoch": 0.010725, + "grad_norm": 1.2797330617904663, + "grad_norm_var": 0.013739316120510084, + "learning_rate": 0.0001, + "loss": 144459.2812, + "loss/crossentropy": 2.3312559127807617, + "loss/hidden": 0.384765625, + "loss/logits": 0.1013282760977745, + "loss/reg": 144458.796875, + "step": 429 + }, + { + "epoch": 0.01075, + "grad_norm": 1.258832573890686, + "grad_norm_var": 0.015257258359294626, + "learning_rate": 0.0001, + "loss": 144007.125, + "loss/crossentropy": 2.3650715351104736, + "loss/hidden": 0.373046875, + "loss/logits": 0.09613895416259766, + "loss/reg": 144006.65625, + "step": 430 + }, + { + "epoch": 0.010775, + "grad_norm": 1.4123529195785522, + "grad_norm_var": 0.024139284326330317, + "learning_rate": 0.0001, + "loss": 143544.5938, + "loss/crossentropy": 2.725262403488159, + "loss/hidden": 0.375, + "loss/logits": 0.09534821659326553, + "loss/reg": 143544.125, + "step": 431 + }, + { + "epoch": 0.0108, + "grad_norm": 1.5468213558197021, + "grad_norm_var": 0.0370552443916324, + "learning_rate": 0.0001, + "loss": 143090.5781, + "loss/crossentropy": 3.220313549041748, + "loss/hidden": 0.388671875, + "loss/logits": 0.09105117619037628, + "loss/reg": 143090.09375, + "step": 432 + }, + { + "epoch": 0.010825, + "grad_norm": 0.9015154242515564, + "grad_norm_var": 0.03333559805386462, + "learning_rate": 0.0001, + "loss": 142640.375, + "loss/crossentropy": 2.438876152038574, + "loss/hidden": 0.357421875, + "loss/logits": 0.0950528234243393, + "loss/reg": 142639.921875, + "step": 433 + }, + { + "epoch": 0.01085, + "grad_norm": 0.9043869376182556, + "grad_norm_var": 0.035332891465965334, + "learning_rate": 0.0001, + "loss": 142186.2969, + "loss/crossentropy": 2.4754929542541504, + "loss/hidden": 0.365234375, + "loss/logits": 0.08686907589435577, + "loss/reg": 142185.84375, + "step": 434 + }, + { + "epoch": 0.010875, + "grad_norm": 0.8575437068939209, + "grad_norm_var": 0.03887631132086219, + "learning_rate": 0.0001, + "loss": 141733.2812, + "loss/crossentropy": 2.5210773944854736, + "loss/hidden": 0.365234375, + "loss/logits": 0.09325699508190155, + "loss/reg": 141732.828125, + "step": 435 + }, + { + "epoch": 0.0109, + "grad_norm": 0.992995023727417, + "grad_norm_var": 0.03906610899539905, + "learning_rate": 0.0001, + "loss": 141284.0312, + "loss/crossentropy": 2.807164192199707, + "loss/hidden": 0.365234375, + "loss/logits": 0.0953291729092598, + "loss/reg": 141283.578125, + "step": 436 + }, + { + "epoch": 0.010925, + "grad_norm": 0.9081130623817444, + "grad_norm_var": 0.04032633045922462, + "learning_rate": 0.0001, + "loss": 140836.0781, + "loss/crossentropy": 2.515272378921509, + "loss/hidden": 0.361328125, + "loss/logits": 0.09115155041217804, + "loss/reg": 140835.625, + "step": 437 + }, + { + "epoch": 0.01095, + "grad_norm": 0.8617503643035889, + "grad_norm_var": 0.04206330063874663, + "learning_rate": 0.0001, + "loss": 140398.0156, + "loss/crossentropy": 2.615291118621826, + "loss/hidden": 0.361328125, + "loss/logits": 0.09375904500484467, + "loss/reg": 140397.5625, + "step": 438 + }, + { + "epoch": 0.010975, + "grad_norm": 0.9350782632827759, + "grad_norm_var": 0.04282205208356348, + "learning_rate": 0.0001, + "loss": 139961.625, + "loss/crossentropy": 2.4983971118927, + "loss/hidden": 0.361328125, + "loss/logits": 0.08958140015602112, + "loss/reg": 139961.171875, + "step": 439 + }, + { + "epoch": 0.011, + "grad_norm": 0.9608613848686218, + "grad_norm_var": 0.04322082771104944, + "learning_rate": 0.0001, + "loss": 139524.0156, + "loss/crossentropy": 2.2612545490264893, + "loss/hidden": 0.33984375, + "loss/logits": 0.08482472598552704, + "loss/reg": 139523.59375, + "step": 440 + }, + { + "epoch": 0.011025, + "grad_norm": 0.9255537390708923, + "grad_norm_var": 0.043901110954668456, + "learning_rate": 0.0001, + "loss": 139087.9688, + "loss/crossentropy": 2.6385385990142822, + "loss/hidden": 0.365234375, + "loss/logits": 0.09528161585330963, + "loss/reg": 139087.515625, + "step": 441 + }, + { + "epoch": 0.01105, + "grad_norm": 0.9135704636573792, + "grad_norm_var": 0.04490813570542424, + "learning_rate": 0.0001, + "loss": 138650.1875, + "loss/crossentropy": 2.540478467941284, + "loss/hidden": 0.34375, + "loss/logits": 0.10281309485435486, + "loss/reg": 138649.734375, + "step": 442 + }, + { + "epoch": 0.011075, + "grad_norm": 0.779073178768158, + "grad_norm_var": 0.0495195390762743, + "learning_rate": 0.0001, + "loss": 138217.0625, + "loss/crossentropy": 2.3806254863739014, + "loss/hidden": 0.330078125, + "loss/logits": 0.08512654900550842, + "loss/reg": 138216.640625, + "step": 443 + }, + { + "epoch": 0.0111, + "grad_norm": 1.2005538940429688, + "grad_norm_var": 0.050888272553039814, + "learning_rate": 0.0001, + "loss": 137793.9844, + "loss/crossentropy": 2.5137827396392822, + "loss/hidden": 0.369140625, + "loss/logits": 0.09710798412561417, + "loss/reg": 137793.515625, + "step": 444 + }, + { + "epoch": 0.011125, + "grad_norm": 1.471663236618042, + "grad_norm_var": 0.059327554212960584, + "learning_rate": 0.0001, + "loss": 137320.3906, + "loss/crossentropy": 2.5025644302368164, + "loss/hidden": 0.384765625, + "loss/logits": 0.10144961625337601, + "loss/reg": 137319.90625, + "step": 445 + }, + { + "epoch": 0.01115, + "grad_norm": 1.9600776433944702, + "grad_norm_var": 0.10940810263046583, + "learning_rate": 0.0001, + "loss": 136887.5156, + "loss/crossentropy": 2.6639795303344727, + "loss/hidden": 0.38671875, + "loss/logits": 0.10284031182527542, + "loss/reg": 136887.03125, + "step": 446 + }, + { + "epoch": 0.011175, + "grad_norm": 1.396752119064331, + "grad_norm_var": 0.10876473470327969, + "learning_rate": 0.0001, + "loss": 136455.7344, + "loss/crossentropy": 2.8369083404541016, + "loss/hidden": 0.392578125, + "loss/logits": 0.1023794487118721, + "loss/reg": 136455.234375, + "step": 447 + }, + { + "epoch": 0.0112, + "grad_norm": 1.3984694480895996, + "grad_norm_var": 0.10119854931983961, + "learning_rate": 0.0001, + "loss": 136018.125, + "loss/crossentropy": 2.540158271789551, + "loss/hidden": 0.34375, + "loss/logits": 0.08261817693710327, + "loss/reg": 136017.703125, + "step": 448 + }, + { + "epoch": 0.011225, + "grad_norm": 1.2194510698318481, + "grad_norm_var": 0.09971698574367403, + "learning_rate": 0.0001, + "loss": 135585.1719, + "loss/crossentropy": 2.0608363151550293, + "loss/hidden": 0.380859375, + "loss/logits": 0.0951135903596878, + "loss/reg": 135584.703125, + "step": 449 + }, + { + "epoch": 0.01125, + "grad_norm": 0.8559896349906921, + "grad_norm_var": 0.10116030716406522, + "learning_rate": 0.0001, + "loss": 135155.8906, + "loss/crossentropy": 2.7330076694488525, + "loss/hidden": 0.337890625, + "loss/logits": 0.09428665041923523, + "loss/reg": 135155.453125, + "step": 450 + }, + { + "epoch": 0.011275, + "grad_norm": 0.9683396816253662, + "grad_norm_var": 0.09831116364115952, + "learning_rate": 0.0001, + "loss": 134730.7344, + "loss/crossentropy": 2.4699547290802, + "loss/hidden": 0.357421875, + "loss/logits": 0.09042903780937195, + "loss/reg": 134730.28125, + "step": 451 + }, + { + "epoch": 0.0113, + "grad_norm": 0.8751084804534912, + "grad_norm_var": 0.10100734765911383, + "learning_rate": 0.0001, + "loss": 134306.2344, + "loss/crossentropy": 2.533498764038086, + "loss/hidden": 0.365234375, + "loss/logits": 0.09291449189186096, + "loss/reg": 134305.78125, + "step": 452 + }, + { + "epoch": 0.011325, + "grad_norm": 1.0929409265518188, + "grad_norm_var": 0.09836679254087041, + "learning_rate": 0.0001, + "loss": 133874.5156, + "loss/crossentropy": 2.3817315101623535, + "loss/hidden": 0.373046875, + "loss/logits": 0.10951289534568787, + "loss/reg": 133874.03125, + "step": 453 + }, + { + "epoch": 0.01135, + "grad_norm": 0.7926502227783203, + "grad_norm_var": 0.10098423608836003, + "learning_rate": 0.0001, + "loss": 133446.7031, + "loss/crossentropy": 2.558635711669922, + "loss/hidden": 0.3359375, + "loss/logits": 0.09265337884426117, + "loss/reg": 133446.28125, + "step": 454 + }, + { + "epoch": 0.011375, + "grad_norm": 0.9337485432624817, + "grad_norm_var": 0.10101520586683463, + "learning_rate": 0.0001, + "loss": 133023.4844, + "loss/crossentropy": 2.320953845977783, + "loss/hidden": 0.357421875, + "loss/logits": 0.08878225088119507, + "loss/reg": 133023.03125, + "step": 455 + }, + { + "epoch": 0.0114, + "grad_norm": 0.7837616205215454, + "grad_norm_var": 0.10647470419620443, + "learning_rate": 0.0001, + "loss": 132599.6875, + "loss/crossentropy": 2.449989080429077, + "loss/hidden": 0.3359375, + "loss/logits": 0.08764292299747467, + "loss/reg": 132599.265625, + "step": 456 + }, + { + "epoch": 0.011425, + "grad_norm": 0.9680254459381104, + "grad_norm_var": 0.10561100442267085, + "learning_rate": 0.0001, + "loss": 132178.375, + "loss/crossentropy": 2.3862457275390625, + "loss/hidden": 0.369140625, + "loss/logits": 0.10386032611131668, + "loss/reg": 132177.90625, + "step": 457 + }, + { + "epoch": 0.01145, + "grad_norm": 1.173683762550354, + "grad_norm_var": 0.10335192378294071, + "learning_rate": 0.0001, + "loss": 131756.8281, + "loss/crossentropy": 2.3640592098236084, + "loss/hidden": 0.369140625, + "loss/logits": 0.09086473286151886, + "loss/reg": 131756.375, + "step": 458 + }, + { + "epoch": 0.011475, + "grad_norm": 1.2718472480773926, + "grad_norm_var": 0.09633271636354876, + "learning_rate": 0.0001, + "loss": 131337.7188, + "loss/crossentropy": 2.2730746269226074, + "loss/hidden": 0.373046875, + "loss/logits": 0.07853664457798004, + "loss/reg": 131337.265625, + "step": 459 + }, + { + "epoch": 0.0115, + "grad_norm": 1.3684788942337036, + "grad_norm_var": 0.09927873215353529, + "learning_rate": 0.0001, + "loss": 130913.3594, + "loss/crossentropy": 2.5492196083068848, + "loss/hidden": 0.392578125, + "loss/logits": 0.10558829456567764, + "loss/reg": 130912.859375, + "step": 460 + }, + { + "epoch": 0.011525, + "grad_norm": 1.0493955612182617, + "grad_norm_var": 0.09277364145863345, + "learning_rate": 0.0001, + "loss": 130492.3594, + "loss/crossentropy": 2.433809280395508, + "loss/hidden": 0.35546875, + "loss/logits": 0.08270187675952911, + "loss/reg": 130491.921875, + "step": 461 + }, + { + "epoch": 0.01155, + "grad_norm": 1.0531160831451416, + "grad_norm_var": 0.04402211242043747, + "learning_rate": 0.0001, + "loss": 130074.4922, + "loss/crossentropy": 2.5947558879852295, + "loss/hidden": 0.369140625, + "loss/logits": 0.08141292631626129, + "loss/reg": 130074.0390625, + "step": 462 + }, + { + "epoch": 0.011575, + "grad_norm": 0.7231173515319824, + "grad_norm_var": 0.04349435098153685, + "learning_rate": 0.0001, + "loss": 129660.0547, + "loss/crossentropy": 2.4421298503875732, + "loss/hidden": 0.330078125, + "loss/logits": 0.07788297533988953, + "loss/reg": 129659.6484375, + "step": 463 + }, + { + "epoch": 0.0116, + "grad_norm": 1.0745733976364136, + "grad_norm_var": 0.03426826155179915, + "learning_rate": 0.0001, + "loss": 129248.9766, + "loss/crossentropy": 2.3232181072235107, + "loss/hidden": 0.373046875, + "loss/logits": 0.09334367513656616, + "loss/reg": 129248.5078125, + "step": 464 + }, + { + "epoch": 0.011625, + "grad_norm": 1.112154245376587, + "grad_norm_var": 0.03203088759449593, + "learning_rate": 0.0001, + "loss": 128838.0469, + "loss/crossentropy": 2.710418462753296, + "loss/hidden": 0.396484375, + "loss/logits": 0.14225494861602783, + "loss/reg": 128837.5078125, + "step": 465 + }, + { + "epoch": 0.01165, + "grad_norm": 1.0500752925872803, + "grad_norm_var": 0.030501727928448578, + "learning_rate": 0.0001, + "loss": 128427.5312, + "loss/crossentropy": 2.709763288497925, + "loss/hidden": 0.361328125, + "loss/logits": 0.09049970656633377, + "loss/reg": 128427.078125, + "step": 466 + }, + { + "epoch": 0.011675, + "grad_norm": 1.289119005203247, + "grad_norm_var": 0.03480087410717029, + "learning_rate": 0.0001, + "loss": 128019.4141, + "loss/crossentropy": 2.2913756370544434, + "loss/hidden": 0.359375, + "loss/logits": 0.08161611109972, + "loss/reg": 128018.9765625, + "step": 467 + }, + { + "epoch": 0.0117, + "grad_norm": 1.0929843187332153, + "grad_norm_var": 0.033028832528283404, + "learning_rate": 0.0001, + "loss": 127613.3672, + "loss/crossentropy": 2.8107008934020996, + "loss/hidden": 0.365234375, + "loss/logits": 0.08901776373386383, + "loss/reg": 127612.9140625, + "step": 468 + }, + { + "epoch": 0.011725, + "grad_norm": 0.9957696795463562, + "grad_norm_var": 0.033086650592021684, + "learning_rate": 0.0001, + "loss": 127206.2734, + "loss/crossentropy": 2.3889429569244385, + "loss/hidden": 0.361328125, + "loss/logits": 0.10045656561851501, + "loss/reg": 127205.8125, + "step": 469 + }, + { + "epoch": 0.01175, + "grad_norm": 1.5600621700286865, + "grad_norm_var": 0.04399344490662707, + "learning_rate": 0.0001, + "loss": 126789.2266, + "loss/crossentropy": 2.939469814300537, + "loss/hidden": 0.373046875, + "loss/logits": 0.09594859182834625, + "loss/reg": 126788.7578125, + "step": 470 + }, + { + "epoch": 0.011775, + "grad_norm": 0.8998208045959473, + "grad_norm_var": 0.04478916176521275, + "learning_rate": 0.0001, + "loss": 126383.0859, + "loss/crossentropy": 2.659053325653076, + "loss/hidden": 0.333984375, + "loss/logits": 0.08160565793514252, + "loss/reg": 126382.671875, + "step": 471 + }, + { + "epoch": 0.0118, + "grad_norm": 1.1243245601654053, + "grad_norm_var": 0.03805856841151596, + "learning_rate": 0.0001, + "loss": 125978.3047, + "loss/crossentropy": 2.76472806930542, + "loss/hidden": 0.353515625, + "loss/logits": 0.0836871862411499, + "loss/reg": 125977.8671875, + "step": 472 + }, + { + "epoch": 0.011825, + "grad_norm": 0.9645205140113831, + "grad_norm_var": 0.03812704390658297, + "learning_rate": 0.0001, + "loss": 125572.5547, + "loss/crossentropy": 2.460484743118286, + "loss/hidden": 0.337890625, + "loss/logits": 0.08670766651630402, + "loss/reg": 125572.1328125, + "step": 473 + }, + { + "epoch": 0.01185, + "grad_norm": 0.9478604793548584, + "grad_norm_var": 0.03947780081620251, + "learning_rate": 0.0001, + "loss": 125171.1719, + "loss/crossentropy": 2.975003957748413, + "loss/hidden": 0.373046875, + "loss/logits": 0.08480578660964966, + "loss/reg": 125170.7109375, + "step": 474 + }, + { + "epoch": 0.011875, + "grad_norm": 1.0086603164672852, + "grad_norm_var": 0.03772665503536855, + "learning_rate": 0.0001, + "loss": 124768.9609, + "loss/crossentropy": 2.6586267948150635, + "loss/hidden": 0.3671875, + "loss/logits": 0.0969114601612091, + "loss/reg": 124768.5, + "step": 475 + }, + { + "epoch": 0.0119, + "grad_norm": 0.8238663673400879, + "grad_norm_var": 0.035470889545035314, + "learning_rate": 0.0001, + "loss": 124371.3203, + "loss/crossentropy": 2.324495553970337, + "loss/hidden": 0.337890625, + "loss/logits": 0.08633776009082794, + "loss/reg": 124370.8984375, + "step": 476 + }, + { + "epoch": 0.011925, + "grad_norm": 1.0551565885543823, + "grad_norm_var": 0.03547396768774901, + "learning_rate": 0.0001, + "loss": 123974.6953, + "loss/crossentropy": 2.6109156608581543, + "loss/hidden": 0.37109375, + "loss/logits": 0.09771596640348434, + "loss/reg": 123974.2265625, + "step": 477 + }, + { + "epoch": 0.01195, + "grad_norm": 0.9296664595603943, + "grad_norm_var": 0.036349632924163426, + "learning_rate": 0.0001, + "loss": 123575.5859, + "loss/crossentropy": 2.5287327766418457, + "loss/hidden": 0.361328125, + "loss/logits": 0.08497745543718338, + "loss/reg": 123575.140625, + "step": 478 + }, + { + "epoch": 0.011975, + "grad_norm": 1.175586223602295, + "grad_norm_var": 0.029983631986153354, + "learning_rate": 0.0001, + "loss": 123167.7812, + "loss/crossentropy": 2.451228618621826, + "loss/hidden": 0.357421875, + "loss/logits": 0.083621084690094, + "loss/reg": 123167.34375, + "step": 479 + }, + { + "epoch": 0.012, + "grad_norm": 0.8889312148094177, + "grad_norm_var": 0.03199992643621474, + "learning_rate": 0.0001, + "loss": 122769.4609, + "loss/crossentropy": 2.566922903060913, + "loss/hidden": 0.357421875, + "loss/logits": 0.08629916608333588, + "loss/reg": 122769.015625, + "step": 480 + }, + { + "epoch": 0.012025, + "grad_norm": 1.517897367477417, + "grad_norm_var": 0.04525076342957224, + "learning_rate": 0.0001, + "loss": 122374.2969, + "loss/crossentropy": 2.5881693363189697, + "loss/hidden": 0.384765625, + "loss/logits": 0.09450060874223709, + "loss/reg": 122373.8203125, + "step": 481 + }, + { + "epoch": 0.01205, + "grad_norm": 1.0205724239349365, + "grad_norm_var": 0.04543377173875403, + "learning_rate": 0.0001, + "loss": 121973.6172, + "loss/crossentropy": 2.7237818241119385, + "loss/hidden": 0.361328125, + "loss/logits": 0.09611287713050842, + "loss/reg": 121973.15625, + "step": 482 + }, + { + "epoch": 0.012075, + "grad_norm": 0.9960222840309143, + "grad_norm_var": 0.04266674285080444, + "learning_rate": 0.0001, + "loss": 121575.7031, + "loss/crossentropy": 2.5460588932037354, + "loss/hidden": 0.373046875, + "loss/logits": 0.08726921677589417, + "loss/reg": 121575.2421875, + "step": 483 + }, + { + "epoch": 0.0121, + "grad_norm": 0.9125394225120544, + "grad_norm_var": 0.04397089239013236, + "learning_rate": 0.0001, + "loss": 121182.1719, + "loss/crossentropy": 2.662997007369995, + "loss/hidden": 0.3515625, + "loss/logits": 0.0756755918264389, + "loss/reg": 121181.7421875, + "step": 484 + }, + { + "epoch": 0.012125, + "grad_norm": 0.8799184560775757, + "grad_norm_var": 0.045667945002744154, + "learning_rate": 0.0001, + "loss": 120790.0859, + "loss/crossentropy": 2.5711655616760254, + "loss/hidden": 0.365234375, + "loss/logits": 0.0949600487947464, + "loss/reg": 120789.625, + "step": 485 + }, + { + "epoch": 0.01215, + "grad_norm": 0.7621967196464539, + "grad_norm_var": 0.030564364061018937, + "learning_rate": 0.0001, + "loss": 120401.1562, + "loss/crossentropy": 2.4500555992126465, + "loss/hidden": 0.361328125, + "loss/logits": 0.09178435802459717, + "loss/reg": 120400.703125, + "step": 486 + }, + { + "epoch": 0.012175, + "grad_norm": 0.8588399291038513, + "grad_norm_var": 0.031185143693074582, + "learning_rate": 0.0001, + "loss": 120011.7656, + "loss/crossentropy": 2.455263137817383, + "loss/hidden": 0.357421875, + "loss/logits": 0.09996655583381653, + "loss/reg": 120011.3046875, + "step": 487 + }, + { + "epoch": 0.0122, + "grad_norm": 2.0252444744110107, + "grad_norm_var": 0.09784971121349349, + "learning_rate": 0.0001, + "loss": 119621.1328, + "loss/crossentropy": 2.5841636657714844, + "loss/hidden": 0.36328125, + "loss/logits": 0.0824744701385498, + "loss/reg": 119620.6875, + "step": 488 + }, + { + "epoch": 0.012225, + "grad_norm": 0.9055572748184204, + "grad_norm_var": 0.09872304292664413, + "learning_rate": 0.0001, + "loss": 119235.2109, + "loss/crossentropy": 2.594587564468384, + "loss/hidden": 0.36328125, + "loss/logits": 0.08088156580924988, + "loss/reg": 119234.765625, + "step": 489 + }, + { + "epoch": 0.01225, + "grad_norm": 1.1274175643920898, + "grad_norm_var": 0.09842966132213408, + "learning_rate": 0.0001, + "loss": 118846.4062, + "loss/crossentropy": 2.8951048851013184, + "loss/hidden": 0.388671875, + "loss/logits": 0.11064436286687851, + "loss/reg": 118845.90625, + "step": 490 + }, + { + "epoch": 0.012275, + "grad_norm": 0.8238899111747742, + "grad_norm_var": 0.10171747524936578, + "learning_rate": 0.0001, + "loss": 118458.2656, + "loss/crossentropy": 2.617831230163574, + "loss/hidden": 0.337890625, + "loss/logits": 0.08420297503471375, + "loss/reg": 118457.84375, + "step": 491 + }, + { + "epoch": 0.0123, + "grad_norm": 0.7952893972396851, + "grad_norm_var": 0.10260711635534307, + "learning_rate": 0.0001, + "loss": 118073.4609, + "loss/crossentropy": 2.5448977947235107, + "loss/hidden": 0.337890625, + "loss/logits": 0.09165728092193604, + "loss/reg": 118073.03125, + "step": 492 + }, + { + "epoch": 0.012325, + "grad_norm": 0.9254248142242432, + "grad_norm_var": 0.1034343819358753, + "learning_rate": 0.0001, + "loss": 117686.6094, + "loss/crossentropy": 2.6640286445617676, + "loss/hidden": 0.365234375, + "loss/logits": 0.08283071964979172, + "loss/reg": 117686.1640625, + "step": 493 + }, + { + "epoch": 0.01235, + "grad_norm": 0.8176378011703491, + "grad_norm_var": 0.10577815732465817, + "learning_rate": 0.0001, + "loss": 117297.7266, + "loss/crossentropy": 2.5091538429260254, + "loss/hidden": 0.349609375, + "loss/logits": 0.08984896540641785, + "loss/reg": 117297.2890625, + "step": 494 + }, + { + "epoch": 0.012375, + "grad_norm": 1.3219102621078491, + "grad_norm_var": 0.11001404873489135, + "learning_rate": 0.0001, + "loss": 116913.25, + "loss/crossentropy": 2.4136619567871094, + "loss/hidden": 0.39453125, + "loss/logits": 0.08754974603652954, + "loss/reg": 116912.765625, + "step": 495 + }, + { + "epoch": 0.0124, + "grad_norm": 0.9934073090553284, + "grad_norm_var": 0.10864469870001443, + "learning_rate": 0.0001, + "loss": 116528.2734, + "loss/crossentropy": 2.8950345516204834, + "loss/hidden": 0.36328125, + "loss/logits": 0.09001494199037552, + "loss/reg": 116527.8203125, + "step": 496 + }, + { + "epoch": 0.012425, + "grad_norm": 1.9397823810577393, + "grad_norm_var": 0.14649738254872915, + "learning_rate": 0.0001, + "loss": 116142.4297, + "loss/crossentropy": 2.3993234634399414, + "loss/hidden": 0.330078125, + "loss/logits": 0.0801636129617691, + "loss/reg": 116142.015625, + "step": 497 + }, + { + "epoch": 0.01245, + "grad_norm": 1.7955193519592285, + "grad_norm_var": 0.1790168054387628, + "learning_rate": 0.0001, + "loss": 115763.0078, + "loss/crossentropy": 2.5822932720184326, + "loss/hidden": 0.373046875, + "loss/logits": 0.07560418546199799, + "loss/reg": 115762.5625, + "step": 498 + }, + { + "epoch": 0.012475, + "grad_norm": 1.4114738702774048, + "grad_norm_var": 0.18307315732150767, + "learning_rate": 0.0001, + "loss": 115380.0938, + "loss/crossentropy": 2.757403612136841, + "loss/hidden": 0.390625, + "loss/logits": 0.10880538821220398, + "loss/reg": 115379.59375, + "step": 499 + }, + { + "epoch": 0.0125, + "grad_norm": 1.706606149673462, + "grad_norm_var": 0.19802862511289915, + "learning_rate": 0.0001, + "loss": 114999.9688, + "loss/crossentropy": 3.1950490474700928, + "loss/hidden": 0.41796875, + "loss/logits": 0.10724850744009018, + "loss/reg": 114999.4453125, + "step": 500 + }, + { + "epoch": 0.012525, + "grad_norm": 0.978772759437561, + "grad_norm_var": 0.1945110486983968, + "learning_rate": 0.0001, + "loss": 114607.5547, + "loss/crossentropy": 2.5394797325134277, + "loss/hidden": 0.373046875, + "loss/logits": 0.11042068153619766, + "loss/reg": 114607.0703125, + "step": 501 + }, + { + "epoch": 0.01255, + "grad_norm": 0.786778450012207, + "grad_norm_var": 0.1931161462249265, + "learning_rate": 0.0001, + "loss": 114225.5312, + "loss/crossentropy": 2.4225008487701416, + "loss/hidden": 0.373046875, + "loss/logits": 0.09658807516098022, + "loss/reg": 114225.0625, + "step": 502 + }, + { + "epoch": 0.012575, + "grad_norm": 1.7830466032028198, + "grad_norm_var": 0.20435632639445528, + "learning_rate": 0.0001, + "loss": 113844.4453, + "loss/crossentropy": 2.6307485103607178, + "loss/hidden": 0.369140625, + "loss/logits": 0.07897418737411499, + "loss/reg": 113844.0, + "step": 503 + }, + { + "epoch": 0.0126, + "grad_norm": 1.090800166130066, + "grad_norm_var": 0.16341348624494217, + "learning_rate": 0.0001, + "loss": 113465.0469, + "loss/crossentropy": 2.7216010093688965, + "loss/hidden": 0.3828125, + "loss/logits": 0.11136461049318314, + "loss/reg": 113464.5546875, + "step": 504 + }, + { + "epoch": 0.012625, + "grad_norm": 0.7676119804382324, + "grad_norm_var": 0.1700222008788211, + "learning_rate": 0.0001, + "loss": 113085.375, + "loss/crossentropy": 2.2064309120178223, + "loss/hidden": 0.337890625, + "loss/logits": 0.07960689067840576, + "loss/reg": 113084.9609375, + "step": 505 + }, + { + "epoch": 0.01265, + "grad_norm": 0.9918925166130066, + "grad_norm_var": 0.17232965666638395, + "learning_rate": 0.0001, + "loss": 112710.0703, + "loss/crossentropy": 2.38382625579834, + "loss/hidden": 0.369140625, + "loss/logits": 0.08772751688957214, + "loss/reg": 112709.6171875, + "step": 506 + }, + { + "epoch": 0.012675, + "grad_norm": 0.8411089181900024, + "grad_norm_var": 0.17152345377644476, + "learning_rate": 0.0001, + "loss": 112330.3438, + "loss/crossentropy": 2.376312017440796, + "loss/hidden": 0.328125, + "loss/logits": 0.07509603351354599, + "loss/reg": 112329.9375, + "step": 507 + }, + { + "epoch": 0.0127, + "grad_norm": 1.2928287982940674, + "grad_norm_var": 0.1611958341104075, + "learning_rate": 0.0001, + "loss": 111957.0781, + "loss/crossentropy": 2.7829861640930176, + "loss/hidden": 0.3671875, + "loss/logits": 0.09470954537391663, + "loss/reg": 111956.6171875, + "step": 508 + }, + { + "epoch": 0.012725, + "grad_norm": 1.4744898080825806, + "grad_norm_var": 0.15881738681891258, + "learning_rate": 0.0001, + "loss": 111584.1328, + "loss/crossentropy": 2.372309684753418, + "loss/hidden": 0.396484375, + "loss/logits": 0.11247822642326355, + "loss/reg": 111583.625, + "step": 509 + }, + { + "epoch": 0.01275, + "grad_norm": 0.9410738348960876, + "grad_norm_var": 0.1526603027667998, + "learning_rate": 0.0001, + "loss": 111213.9766, + "loss/crossentropy": 2.5752036571502686, + "loss/hidden": 0.373046875, + "loss/logits": 0.09293421357870102, + "loss/reg": 111213.5078125, + "step": 510 + }, + { + "epoch": 0.012775, + "grad_norm": 1.239321231842041, + "grad_norm_var": 0.15237534080648046, + "learning_rate": 0.0001, + "loss": 110848.4688, + "loss/crossentropy": 2.4036667346954346, + "loss/hidden": 0.4140625, + "loss/logits": 0.14227986335754395, + "loss/reg": 110847.9140625, + "step": 511 + }, + { + "epoch": 0.0128, + "grad_norm": 0.8401315212249756, + "grad_norm_var": 0.15913169358604756, + "learning_rate": 0.0001, + "loss": 110481.7969, + "loss/crossentropy": 2.3782424926757812, + "loss/hidden": 0.357421875, + "loss/logits": 0.08400236815214157, + "loss/reg": 110481.3515625, + "step": 512 + }, + { + "epoch": 0.012825, + "grad_norm": 0.8280027508735657, + "grad_norm_var": 0.13303329187288068, + "learning_rate": 0.0001, + "loss": 110117.1172, + "loss/crossentropy": 2.454601287841797, + "loss/hidden": 0.333984375, + "loss/logits": 0.07852231711149216, + "loss/reg": 110116.703125, + "step": 513 + }, + { + "epoch": 0.01285, + "grad_norm": 1.0113853216171265, + "grad_norm_var": 0.1063868120555579, + "learning_rate": 0.0001, + "loss": 109748.8125, + "loss/crossentropy": 2.382014036178589, + "loss/hidden": 0.376953125, + "loss/logits": 0.09340295195579529, + "loss/reg": 109748.34375, + "step": 514 + }, + { + "epoch": 0.012875, + "grad_norm": 0.9986621141433716, + "grad_norm_var": 0.10121920328154285, + "learning_rate": 0.0001, + "loss": 109380.7422, + "loss/crossentropy": 2.654278039932251, + "loss/hidden": 0.373046875, + "loss/logits": 0.10119034349918365, + "loss/reg": 109380.265625, + "step": 515 + }, + { + "epoch": 0.0129, + "grad_norm": 0.9399179816246033, + "grad_norm_var": 0.07577139049272527, + "learning_rate": 0.0001, + "loss": 109014.1719, + "loss/crossentropy": 2.8084640502929688, + "loss/hidden": 0.33984375, + "loss/logits": 0.09238427132368088, + "loss/reg": 109013.7421875, + "step": 516 + }, + { + "epoch": 0.012925, + "grad_norm": 0.9189648628234863, + "grad_norm_var": 0.07656584860278356, + "learning_rate": 0.0001, + "loss": 108654.2734, + "loss/crossentropy": 2.445586681365967, + "loss/hidden": 0.359375, + "loss/logits": 0.10389550030231476, + "loss/reg": 108653.8125, + "step": 517 + }, + { + "epoch": 0.01295, + "grad_norm": 0.8263663649559021, + "grad_norm_var": 0.07529222214620088, + "learning_rate": 0.0001, + "loss": 108293.4453, + "loss/crossentropy": 2.752142906188965, + "loss/hidden": 0.35546875, + "loss/logits": 0.09072582423686981, + "loss/reg": 108293.0, + "step": 518 + }, + { + "epoch": 0.012975, + "grad_norm": 1.0929358005523682, + "grad_norm_var": 0.03752412651066224, + "learning_rate": 0.0001, + "loss": 107936.4297, + "loss/crossentropy": 2.3689184188842773, + "loss/hidden": 0.423828125, + "loss/logits": 0.09004764258861542, + "loss/reg": 107935.9140625, + "step": 519 + }, + { + "epoch": 0.013, + "grad_norm": 1.1709879636764526, + "grad_norm_var": 0.038833003307898045, + "learning_rate": 0.0001, + "loss": 107557.6484, + "loss/crossentropy": 2.616063117980957, + "loss/hidden": 0.373046875, + "loss/logits": 0.10525017976760864, + "loss/reg": 107557.171875, + "step": 520 + }, + { + "epoch": 0.013025, + "grad_norm": 0.8341318964958191, + "grad_norm_var": 0.036951049164519564, + "learning_rate": 0.0001, + "loss": 107197.8438, + "loss/crossentropy": 2.571011781692505, + "loss/hidden": 0.359375, + "loss/logits": 0.10364649444818497, + "loss/reg": 107197.3828125, + "step": 521 + }, + { + "epoch": 0.01305, + "grad_norm": 1.233487606048584, + "grad_norm_var": 0.03985027429405255, + "learning_rate": 0.0001, + "loss": 106844.7031, + "loss/crossentropy": 2.4951682090759277, + "loss/hidden": 0.373046875, + "loss/logits": 0.09696684777736664, + "loss/reg": 106844.234375, + "step": 522 + }, + { + "epoch": 0.013075, + "grad_norm": 1.2658134698867798, + "grad_norm_var": 0.04041382203118962, + "learning_rate": 0.0001, + "loss": 106480.8828, + "loss/crossentropy": 2.8979501724243164, + "loss/hidden": 0.369140625, + "loss/logits": 0.08361580967903137, + "loss/reg": 106480.4296875, + "step": 523 + }, + { + "epoch": 0.0131, + "grad_norm": 0.8786676526069641, + "grad_norm_var": 0.03809952111847513, + "learning_rate": 0.0001, + "loss": 106128.5547, + "loss/crossentropy": 2.651653528213501, + "loss/hidden": 0.365234375, + "loss/logits": 0.09834752231836319, + "loss/reg": 106128.09375, + "step": 524 + }, + { + "epoch": 0.013125, + "grad_norm": 0.8635250926017761, + "grad_norm_var": 0.025293388164052393, + "learning_rate": 0.0001, + "loss": 105776.7734, + "loss/crossentropy": 2.737975597381592, + "loss/hidden": 0.376953125, + "loss/logits": 0.10877284407615662, + "loss/reg": 105776.2890625, + "step": 525 + }, + { + "epoch": 0.01315, + "grad_norm": 4.461543083190918, + "grad_norm_var": 0.7756616132701457, + "learning_rate": 0.0001, + "loss": 105422.5156, + "loss/crossentropy": 3.27087664604187, + "loss/hidden": 0.458984375, + "loss/logits": 0.15441305935382843, + "loss/reg": 105421.8984375, + "step": 526 + }, + { + "epoch": 0.013175, + "grad_norm": 0.9257360696792603, + "grad_norm_var": 0.780696206640098, + "learning_rate": 0.0001, + "loss": 105058.2188, + "loss/crossentropy": 2.386378765106201, + "loss/hidden": 0.34765625, + "loss/logits": 0.10172270983457565, + "loss/reg": 105057.765625, + "step": 527 + }, + { + "epoch": 0.0132, + "grad_norm": 0.9832150936126709, + "grad_norm_var": 0.7752411115564602, + "learning_rate": 0.0001, + "loss": 104708.0547, + "loss/crossentropy": 2.753894805908203, + "loss/hidden": 0.380859375, + "loss/logits": 0.10964615643024445, + "loss/reg": 104707.5625, + "step": 528 + }, + { + "epoch": 0.013225, + "grad_norm": 0.8955684900283813, + "grad_norm_var": 0.7721564224718672, + "learning_rate": 0.0001, + "loss": 104329.8594, + "loss/crossentropy": 2.3564887046813965, + "loss/hidden": 0.35546875, + "loss/logits": 0.08882109820842743, + "loss/reg": 104329.4140625, + "step": 529 + }, + { + "epoch": 0.01325, + "grad_norm": 0.8905146718025208, + "grad_norm_var": 0.7762109010940305, + "learning_rate": 0.0001, + "loss": 103978.125, + "loss/crossentropy": 2.3647301197052, + "loss/hidden": 0.373046875, + "loss/logits": 0.09351895749568939, + "loss/reg": 103977.65625, + "step": 530 + }, + { + "epoch": 0.013275, + "grad_norm": 1.150484561920166, + "grad_norm_var": 0.7736011035444074, + "learning_rate": 0.0001, + "loss": 103626.8047, + "loss/crossentropy": 2.584282636642456, + "loss/hidden": 0.375, + "loss/logits": 0.11200837045907974, + "loss/reg": 103626.3203125, + "step": 531 + }, + { + "epoch": 0.0133, + "grad_norm": 1.9985939264297485, + "grad_norm_var": 0.8057751180104772, + "learning_rate": 0.0001, + "loss": 103278.8828, + "loss/crossentropy": 2.514350652694702, + "loss/hidden": 0.40234375, + "loss/logits": 0.1426522135734558, + "loss/reg": 103278.3359375, + "step": 532 + }, + { + "epoch": 0.013325, + "grad_norm": 1.1248385906219482, + "grad_norm_var": 0.7986672489099291, + "learning_rate": 0.0001, + "loss": 102931.1797, + "loss/crossentropy": 2.7196877002716064, + "loss/hidden": 0.380859375, + "loss/logits": 0.102044478058815, + "loss/reg": 102930.6953125, + "step": 533 + }, + { + "epoch": 0.01335, + "grad_norm": 1.074903130531311, + "grad_norm_var": 0.7872541863960926, + "learning_rate": 0.0001, + "loss": 102585.3672, + "loss/crossentropy": 2.4676969051361084, + "loss/hidden": 0.375, + "loss/logits": 0.09518338739871979, + "loss/reg": 102584.8984375, + "step": 534 + }, + { + "epoch": 0.013375, + "grad_norm": 1.7951133251190186, + "grad_norm_var": 0.7984208988539524, + "learning_rate": 0.0001, + "loss": 102241.6562, + "loss/crossentropy": 3.1216862201690674, + "loss/hidden": 0.42578125, + "loss/logits": 0.10711027681827545, + "loss/reg": 102241.125, + "step": 535 + }, + { + "epoch": 0.0134, + "grad_norm": 1.094664454460144, + "grad_norm_var": 0.8005730586879667, + "learning_rate": 0.0001, + "loss": 101879.2578, + "loss/crossentropy": 2.677700996398926, + "loss/hidden": 0.380859375, + "loss/logits": 0.10375779867172241, + "loss/reg": 101878.7734375, + "step": 536 + }, + { + "epoch": 0.013425, + "grad_norm": 2.5311005115509033, + "grad_norm_var": 0.8656599200048455, + "learning_rate": 0.0001, + "loss": 101537.0859, + "loss/crossentropy": 2.8987925052642822, + "loss/hidden": 0.42578125, + "loss/logits": 0.24518340826034546, + "loss/reg": 101536.4140625, + "step": 537 + }, + { + "epoch": 0.01345, + "grad_norm": 1.2476500272750854, + "grad_norm_var": 0.8652674144164189, + "learning_rate": 0.0001, + "loss": 101195.7656, + "loss/crossentropy": 2.3096818923950195, + "loss/hidden": 0.388671875, + "loss/logits": 0.08652019500732422, + "loss/reg": 101195.2890625, + "step": 538 + }, + { + "epoch": 0.013475, + "grad_norm": 1.0766700506210327, + "grad_norm_var": 0.8721199107049263, + "learning_rate": 0.0001, + "loss": 100856.5469, + "loss/crossentropy": 2.541144609451294, + "loss/hidden": 0.396484375, + "loss/logits": 0.10943476855754852, + "loss/reg": 100856.0390625, + "step": 539 + }, + { + "epoch": 0.0135, + "grad_norm": 1.6914061307907104, + "grad_norm_var": 0.8528948007387608, + "learning_rate": 0.0001, + "loss": 100516.1016, + "loss/crossentropy": 2.6160571575164795, + "loss/hidden": 0.4296875, + "loss/logits": 0.12198935449123383, + "loss/reg": 100515.546875, + "step": 540 + }, + { + "epoch": 0.013525, + "grad_norm": 1.2223988771438599, + "grad_norm_var": 0.8310705723616335, + "learning_rate": 0.0001, + "loss": 100175.4688, + "loss/crossentropy": 2.515537977218628, + "loss/hidden": 0.40234375, + "loss/logits": 0.08977995067834854, + "loss/reg": 100174.9765625, + "step": 541 + }, + { + "epoch": 0.01355, + "grad_norm": 0.8259720206260681, + "grad_norm_var": 0.2265506831187555, + "learning_rate": 0.0001, + "loss": 99839.1875, + "loss/crossentropy": 2.3948302268981934, + "loss/hidden": 0.361328125, + "loss/logits": 0.08301985263824463, + "loss/reg": 99838.7421875, + "step": 542 + }, + { + "epoch": 0.013575, + "grad_norm": 0.8785867691040039, + "grad_norm_var": 0.2289359161668815, + "learning_rate": 0.0001, + "loss": 99498.9609, + "loss/crossentropy": 2.3601667881011963, + "loss/hidden": 0.375, + "loss/logits": 0.08771521598100662, + "loss/reg": 99498.5, + "step": 543 + }, + { + "epoch": 0.0136, + "grad_norm": 0.8317751288414001, + "grad_norm_var": 0.23636409549236057, + "learning_rate": 0.0001, + "loss": 99163.625, + "loss/crossentropy": 2.643152952194214, + "loss/hidden": 0.365234375, + "loss/logits": 0.08720413595438004, + "loss/reg": 99163.171875, + "step": 544 + }, + { + "epoch": 0.013625, + "grad_norm": 0.9161708354949951, + "grad_norm_var": 0.23536031020803988, + "learning_rate": 0.0001, + "loss": 98827.1016, + "loss/crossentropy": 2.6195507049560547, + "loss/hidden": 0.369140625, + "loss/logits": 0.0870186984539032, + "loss/reg": 98826.6484375, + "step": 545 + }, + { + "epoch": 0.01365, + "grad_norm": 1.1316457986831665, + "grad_norm_var": 0.22673158391756795, + "learning_rate": 0.0001, + "loss": 98492.0391, + "loss/crossentropy": 2.3038763999938965, + "loss/hidden": 0.37890625, + "loss/logits": 0.09377077966928482, + "loss/reg": 98491.5625, + "step": 546 + }, + { + "epoch": 0.013675, + "grad_norm": 1.3292113542556763, + "grad_norm_var": 0.2254748810643837, + "learning_rate": 0.0001, + "loss": 98157.9375, + "loss/crossentropy": 2.382305860519409, + "loss/hidden": 0.373046875, + "loss/logits": 0.09558240324258804, + "loss/reg": 98157.46875, + "step": 547 + }, + { + "epoch": 0.0137, + "grad_norm": 1.2655940055847168, + "grad_norm_var": 0.1906005621512387, + "learning_rate": 0.0001, + "loss": 97828.7734, + "loss/crossentropy": 2.9607603549957275, + "loss/hidden": 0.390625, + "loss/logits": 0.09649337083101273, + "loss/reg": 97828.2890625, + "step": 548 + }, + { + "epoch": 0.013725, + "grad_norm": 1.2224743366241455, + "grad_norm_var": 0.18953631993640882, + "learning_rate": 0.0001, + "loss": 97497.375, + "loss/crossentropy": 2.421534776687622, + "loss/hidden": 0.40625, + "loss/logits": 0.10133738815784454, + "loss/reg": 97496.8671875, + "step": 549 + }, + { + "epoch": 0.01375, + "grad_norm": 0.9172767996788025, + "grad_norm_var": 0.19494695427785905, + "learning_rate": 0.0001, + "loss": 97166.6797, + "loss/crossentropy": 2.6120481491088867, + "loss/hidden": 0.384765625, + "loss/logits": 0.09978368878364563, + "loss/reg": 97166.1953125, + "step": 550 + }, + { + "epoch": 0.013775, + "grad_norm": 0.9448373317718506, + "grad_norm_var": 0.17817502771646507, + "learning_rate": 0.0001, + "loss": 96838.8984, + "loss/crossentropy": 2.3174571990966797, + "loss/hidden": 0.376953125, + "loss/logits": 0.09136012196540833, + "loss/reg": 96838.4296875, + "step": 551 + }, + { + "epoch": 0.0138, + "grad_norm": 0.877375602722168, + "grad_norm_var": 0.1840462984259357, + "learning_rate": 0.0001, + "loss": 96509.8906, + "loss/crossentropy": 2.413999319076538, + "loss/hidden": 0.375, + "loss/logits": 0.10230405628681183, + "loss/reg": 96509.4140625, + "step": 552 + }, + { + "epoch": 0.013825, + "grad_norm": 0.716687023639679, + "grad_norm_var": 0.06339745650757174, + "learning_rate": 0.0001, + "loss": 96183.0312, + "loss/crossentropy": 2.5708112716674805, + "loss/hidden": 0.359375, + "loss/logits": 0.0769326239824295, + "loss/reg": 96182.59375, + "step": 553 + }, + { + "epoch": 0.01385, + "grad_norm": 1.237869143486023, + "grad_norm_var": 0.06316978103030278, + "learning_rate": 0.0001, + "loss": 95858.75, + "loss/crossentropy": 2.6847381591796875, + "loss/hidden": 0.43359375, + "loss/logits": 0.11960765719413757, + "loss/reg": 95858.1953125, + "step": 554 + }, + { + "epoch": 0.013875, + "grad_norm": 1.0143837928771973, + "grad_norm_var": 0.06333918788560121, + "learning_rate": 0.0001, + "loss": 95535.5703, + "loss/crossentropy": 2.547424077987671, + "loss/hidden": 0.373046875, + "loss/logits": 0.10015293955802917, + "loss/reg": 95535.09375, + "step": 555 + }, + { + "epoch": 0.0139, + "grad_norm": 1.0703991651535034, + "grad_norm_var": 0.03549074686594148, + "learning_rate": 0.0001, + "loss": 95211.0859, + "loss/crossentropy": 2.3821773529052734, + "loss/hidden": 0.376953125, + "loss/logits": 0.10645414888858795, + "loss/reg": 95210.6015625, + "step": 556 + }, + { + "epoch": 0.013925, + "grad_norm": 0.9475630521774292, + "grad_norm_var": 0.03298411734819121, + "learning_rate": 0.0001, + "loss": 94887.0703, + "loss/crossentropy": 2.0802979469299316, + "loss/hidden": 0.380859375, + "loss/logits": 0.08815540373325348, + "loss/reg": 94886.6015625, + "step": 557 + }, + { + "epoch": 0.01395, + "grad_norm": 0.9503076076507568, + "grad_norm_var": 0.030932836160949256, + "learning_rate": 0.0001, + "loss": 94565.9141, + "loss/crossentropy": 2.40145206451416, + "loss/hidden": 0.3515625, + "loss/logits": 0.10239367187023163, + "loss/reg": 94565.4609375, + "step": 558 + }, + { + "epoch": 0.013975, + "grad_norm": 1.001888632774353, + "grad_norm_var": 0.029627885822190794, + "learning_rate": 0.0001, + "loss": 94249.375, + "loss/crossentropy": 2.5415737628936768, + "loss/hidden": 0.376953125, + "loss/logits": 0.0858616754412651, + "loss/reg": 94248.9140625, + "step": 559 + }, + { + "epoch": 0.014, + "grad_norm": 0.8086162209510803, + "grad_norm_var": 0.030253320954721833, + "learning_rate": 0.0001, + "loss": 93932.4844, + "loss/crossentropy": 2.635380506515503, + "loss/hidden": 0.359375, + "loss/logits": 0.07743102312088013, + "loss/reg": 93932.046875, + "step": 560 + }, + { + "epoch": 0.014025, + "grad_norm": 0.9812893271446228, + "grad_norm_var": 0.029599325823544357, + "learning_rate": 0.0001, + "loss": 93618.0938, + "loss/crossentropy": 2.4184508323669434, + "loss/hidden": 0.369140625, + "loss/logits": 0.08096189796924591, + "loss/reg": 93617.640625, + "step": 561 + }, + { + "epoch": 0.01405, + "grad_norm": 1.1916530132293701, + "grad_norm_var": 0.0306689382673141, + "learning_rate": 0.0001, + "loss": 93306.0234, + "loss/crossentropy": 2.4902424812316895, + "loss/hidden": 0.3984375, + "loss/logits": 0.10383652150630951, + "loss/reg": 93305.5234375, + "step": 562 + }, + { + "epoch": 0.014075, + "grad_norm": 0.9122821688652039, + "grad_norm_var": 0.024891042003138592, + "learning_rate": 0.0001, + "loss": 92991.9062, + "loss/crossentropy": 2.4827306270599365, + "loss/hidden": 0.373046875, + "loss/logits": 0.09174686670303345, + "loss/reg": 92991.4453125, + "step": 563 + }, + { + "epoch": 0.0141, + "grad_norm": 0.9577797055244446, + "grad_norm_var": 0.020067592909861532, + "learning_rate": 0.0001, + "loss": 92681.2266, + "loss/crossentropy": 2.670959949493408, + "loss/hidden": 0.361328125, + "loss/logits": 0.09282038360834122, + "loss/reg": 92680.7734375, + "step": 564 + }, + { + "epoch": 0.014125, + "grad_norm": 1.2030467987060547, + "grad_norm_var": 0.01947485873974557, + "learning_rate": 0.0001, + "loss": 92370.6641, + "loss/crossentropy": 2.9116733074188232, + "loss/hidden": 0.390625, + "loss/logits": 0.10250720381736755, + "loss/reg": 92370.171875, + "step": 565 + }, + { + "epoch": 0.01415, + "grad_norm": 0.925117552280426, + "grad_norm_var": 0.0194096484452747, + "learning_rate": 0.0001, + "loss": 92061.8906, + "loss/crossentropy": 2.763044834136963, + "loss/hidden": 0.3828125, + "loss/logits": 0.10554227977991104, + "loss/reg": 92061.3984375, + "step": 566 + }, + { + "epoch": 0.014175, + "grad_norm": 0.9973451495170593, + "grad_norm_var": 0.019309056507003423, + "learning_rate": 0.0001, + "loss": 91752.0859, + "loss/crossentropy": 2.4450998306274414, + "loss/hidden": 0.390625, + "loss/logits": 0.13355585932731628, + "loss/reg": 91751.5625, + "step": 567 + }, + { + "epoch": 0.0142, + "grad_norm": 1.3838785886764526, + "grad_norm_var": 0.027933021499690118, + "learning_rate": 0.0001, + "loss": 91444.9141, + "loss/crossentropy": 2.6224660873413086, + "loss/hidden": 0.37890625, + "loss/logits": 0.08861524611711502, + "loss/reg": 91444.4453125, + "step": 568 + }, + { + "epoch": 0.014225, + "grad_norm": 1.522208571434021, + "grad_norm_var": 0.03604393237560212, + "learning_rate": 0.0001, + "loss": 91137.6016, + "loss/crossentropy": 2.6689343452453613, + "loss/hidden": 0.392578125, + "loss/logits": 0.10325664281845093, + "loss/reg": 91137.109375, + "step": 569 + }, + { + "epoch": 0.01425, + "grad_norm": 0.8374614119529724, + "grad_norm_var": 0.037054228421395864, + "learning_rate": 0.0001, + "loss": 90830.6172, + "loss/crossentropy": 2.6271815299987793, + "loss/hidden": 0.373046875, + "loss/logits": 0.0959094762802124, + "loss/reg": 90830.1484375, + "step": 570 + }, + { + "epoch": 0.014275, + "grad_norm": 0.8968411684036255, + "grad_norm_var": 0.03838309652858253, + "learning_rate": 0.0001, + "loss": 90525.5781, + "loss/crossentropy": 2.5133578777313232, + "loss/hidden": 0.375, + "loss/logits": 0.10282105207443237, + "loss/reg": 90525.1015625, + "step": 571 + }, + { + "epoch": 0.0143, + "grad_norm": 1.0100209712982178, + "grad_norm_var": 0.038339889865839166, + "learning_rate": 0.0001, + "loss": 90223.0859, + "loss/crossentropy": 2.4223990440368652, + "loss/hidden": 0.376953125, + "loss/logits": 0.10173159837722778, + "loss/reg": 90222.609375, + "step": 572 + }, + { + "epoch": 0.014325, + "grad_norm": 0.8718961477279663, + "grad_norm_var": 0.03955925754722214, + "learning_rate": 0.0001, + "loss": 89917.3984, + "loss/crossentropy": 2.210176944732666, + "loss/hidden": 0.369140625, + "loss/logits": 0.09007933735847473, + "loss/reg": 89916.9375, + "step": 573 + }, + { + "epoch": 0.01435, + "grad_norm": 1.0079243183135986, + "grad_norm_var": 0.039168142839422854, + "learning_rate": 0.0001, + "loss": 89616.25, + "loss/crossentropy": 2.6199238300323486, + "loss/hidden": 0.376953125, + "loss/logits": 0.09161718934774399, + "loss/reg": 89615.78125, + "step": 574 + }, + { + "epoch": 0.014375, + "grad_norm": 1.3659257888793945, + "grad_norm_var": 0.04599762261593446, + "learning_rate": 0.0001, + "loss": 89312.8125, + "loss/crossentropy": 2.835010051727295, + "loss/hidden": 0.4140625, + "loss/logits": 0.11617124080657959, + "loss/reg": 89312.28125, + "step": 575 + }, + { + "epoch": 0.0144, + "grad_norm": 0.9582456350326538, + "grad_norm_var": 0.04248980195343724, + "learning_rate": 0.0001, + "loss": 89009.8359, + "loss/crossentropy": 2.6877806186676025, + "loss/hidden": 0.375, + "loss/logits": 0.10095453262329102, + "loss/reg": 89009.359375, + "step": 576 + }, + { + "epoch": 0.014425, + "grad_norm": 1.1128160953521729, + "grad_norm_var": 0.04212170640374529, + "learning_rate": 0.0001, + "loss": 88710.6953, + "loss/crossentropy": 2.6519720554351807, + "loss/hidden": 0.3984375, + "loss/logits": 0.118956059217453, + "loss/reg": 88710.1796875, + "step": 577 + }, + { + "epoch": 0.01445, + "grad_norm": 0.9801219701766968, + "grad_norm_var": 0.04154788913363323, + "learning_rate": 0.0001, + "loss": 88412.6406, + "loss/crossentropy": 2.6760315895080566, + "loss/hidden": 0.3671875, + "loss/logits": 0.08030475676059723, + "loss/reg": 88412.1953125, + "step": 578 + }, + { + "epoch": 0.014475, + "grad_norm": 0.9177776575088501, + "grad_norm_var": 0.041442321658872024, + "learning_rate": 0.0001, + "loss": 88107.7578, + "loss/crossentropy": 2.2624671459198, + "loss/hidden": 0.3828125, + "loss/logits": 0.0892476812005043, + "loss/reg": 88107.2890625, + "step": 579 + }, + { + "epoch": 0.0145, + "grad_norm": 1.1240119934082031, + "grad_norm_var": 0.04091981190463655, + "learning_rate": 0.0001, + "loss": 87812.875, + "loss/crossentropy": 2.497429370880127, + "loss/hidden": 0.3828125, + "loss/logits": 0.10727068781852722, + "loss/reg": 87812.3828125, + "step": 580 + }, + { + "epoch": 0.014525, + "grad_norm": 0.884501576423645, + "grad_norm_var": 0.041596667959251814, + "learning_rate": 0.0001, + "loss": 87488.25, + "loss/crossentropy": 2.6077654361724854, + "loss/hidden": 0.369140625, + "loss/logits": 0.1036926656961441, + "loss/reg": 87487.7734375, + "step": 581 + }, + { + "epoch": 0.01455, + "grad_norm": 0.8187013864517212, + "grad_norm_var": 0.044072914499201234, + "learning_rate": 0.0001, + "loss": 87187.7188, + "loss/crossentropy": 2.4284141063690186, + "loss/hidden": 0.34765625, + "loss/logits": 0.08528505265712738, + "loss/reg": 87187.2890625, + "step": 582 + }, + { + "epoch": 0.014575, + "grad_norm": 1.1420753002166748, + "grad_norm_var": 0.0444990484171594, + "learning_rate": 0.0001, + "loss": 86893.1641, + "loss/crossentropy": 2.579559564590454, + "loss/hidden": 0.373046875, + "loss/logits": 0.10599015653133392, + "loss/reg": 86892.6875, + "step": 583 + }, + { + "epoch": 0.0146, + "grad_norm": 0.8006011247634888, + "grad_norm_var": 0.03996373630048636, + "learning_rate": 0.0001, + "loss": 86596.6172, + "loss/crossentropy": 2.7767887115478516, + "loss/hidden": 0.3359375, + "loss/logits": 0.09008356928825378, + "loss/reg": 86596.1875, + "step": 584 + }, + { + "epoch": 0.014625, + "grad_norm": 0.9800305366516113, + "grad_norm_var": 0.02172002933442022, + "learning_rate": 0.0001, + "loss": 86303.8984, + "loss/crossentropy": 2.4792940616607666, + "loss/hidden": 0.376953125, + "loss/logits": 0.09733758866786957, + "loss/reg": 86303.421875, + "step": 585 + }, + { + "epoch": 0.01465, + "grad_norm": 1.0853314399719238, + "grad_norm_var": 0.02078939040552138, + "learning_rate": 0.0001, + "loss": 86007.5703, + "loss/crossentropy": 2.667954683303833, + "loss/hidden": 0.369140625, + "loss/logits": 0.08128130435943604, + "loss/reg": 86007.1171875, + "step": 586 + }, + { + "epoch": 0.014675, + "grad_norm": 1.3247121572494507, + "grad_norm_var": 0.02650028433099957, + "learning_rate": 0.0001, + "loss": 85712.7266, + "loss/crossentropy": 2.5810751914978027, + "loss/hidden": 0.375, + "loss/logits": 0.10023487359285355, + "loss/reg": 85712.25, + "step": 587 + }, + { + "epoch": 0.0147, + "grad_norm": 1.313215732574463, + "grad_norm_var": 0.03167885628815436, + "learning_rate": 0.0001, + "loss": 85422.0781, + "loss/crossentropy": 2.421856164932251, + "loss/hidden": 0.396484375, + "loss/logits": 0.10593342781066895, + "loss/reg": 85421.578125, + "step": 588 + }, + { + "epoch": 0.014725, + "grad_norm": 0.9700228571891785, + "grad_norm_var": 0.030042102630343695, + "learning_rate": 0.0001, + "loss": 85133.0859, + "loss/crossentropy": 2.4789178371429443, + "loss/hidden": 0.375, + "loss/logits": 0.08909668773412704, + "loss/reg": 85132.625, + "step": 589 + }, + { + "epoch": 0.01475, + "grad_norm": 1.1963694095611572, + "grad_norm_var": 0.03122634137597822, + "learning_rate": 0.0001, + "loss": 84843.0078, + "loss/crossentropy": 2.433040142059326, + "loss/hidden": 0.404296875, + "loss/logits": 0.10144652426242828, + "loss/reg": 84842.5, + "step": 590 + }, + { + "epoch": 0.014775, + "grad_norm": 0.992373526096344, + "grad_norm_var": 0.024755451206358654, + "learning_rate": 0.0001, + "loss": 84551.25, + "loss/crossentropy": 2.5630834102630615, + "loss/hidden": 0.39453125, + "loss/logits": 0.09752138704061508, + "loss/reg": 84550.7578125, + "step": 591 + }, + { + "epoch": 0.0148, + "grad_norm": 0.8379811644554138, + "grad_norm_var": 0.026931197603058725, + "learning_rate": 0.0001, + "loss": 84259.4141, + "loss/crossentropy": 2.4513022899627686, + "loss/hidden": 0.37109375, + "loss/logits": 0.10591454803943634, + "loss/reg": 84258.9375, + "step": 592 + }, + { + "epoch": 0.014825, + "grad_norm": 0.9364086985588074, + "grad_norm_var": 0.026929201080542895, + "learning_rate": 0.0001, + "loss": 83951.6641, + "loss/crossentropy": 2.608539342880249, + "loss/hidden": 0.369140625, + "loss/logits": 0.0893184095621109, + "loss/reg": 83951.203125, + "step": 593 + }, + { + "epoch": 0.01485, + "grad_norm": 0.8221688270568848, + "grad_norm_var": 0.029307624736031577, + "learning_rate": 0.0001, + "loss": 83660.5469, + "loss/crossentropy": 2.3135383129119873, + "loss/hidden": 0.3828125, + "loss/logits": 0.09200223535299301, + "loss/reg": 83660.0703125, + "step": 594 + }, + { + "epoch": 0.014875, + "grad_norm": 0.9461011290550232, + "grad_norm_var": 0.029012726690810557, + "learning_rate": 0.0001, + "loss": 83372.8125, + "loss/crossentropy": 2.4558818340301514, + "loss/hidden": 0.37109375, + "loss/logits": 0.09155316650867462, + "loss/reg": 83372.3515625, + "step": 595 + }, + { + "epoch": 0.0149, + "grad_norm": 1.0248985290527344, + "grad_norm_var": 0.028132075434765116, + "learning_rate": 0.0001, + "loss": 83085.7266, + "loss/crossentropy": 2.8080692291259766, + "loss/hidden": 0.375, + "loss/logits": 0.09577822685241699, + "loss/reg": 83085.2578125, + "step": 596 + }, + { + "epoch": 0.014925, + "grad_norm": 1.0959073305130005, + "grad_norm_var": 0.0275367479929386, + "learning_rate": 0.0001, + "loss": 82797.8594, + "loss/crossentropy": 2.641843795776367, + "loss/hidden": 0.380859375, + "loss/logits": 0.09072021394968033, + "loss/reg": 82797.390625, + "step": 597 + }, + { + "epoch": 0.01495, + "grad_norm": 0.8215295672416687, + "grad_norm_var": 0.027462120182847074, + "learning_rate": 0.0001, + "loss": 82510.1328, + "loss/crossentropy": 2.4811935424804688, + "loss/hidden": 0.365234375, + "loss/logits": 0.09065172076225281, + "loss/reg": 82509.6796875, + "step": 598 + }, + { + "epoch": 0.014975, + "grad_norm": 0.9130824208259583, + "grad_norm_var": 0.02695446037684032, + "learning_rate": 0.0001, + "loss": 82227.9922, + "loss/crossentropy": 2.62929105758667, + "loss/hidden": 0.380859375, + "loss/logits": 0.0900956392288208, + "loss/reg": 82227.5234375, + "step": 599 + }, + { + "epoch": 0.015, + "grad_norm": 0.9190696477890015, + "grad_norm_var": 0.024622010106409423, + "learning_rate": 0.0001, + "loss": 81942.7578, + "loss/crossentropy": 2.5592992305755615, + "loss/hidden": 0.376953125, + "loss/logits": 0.10351692140102386, + "loss/reg": 81942.2734375, + "step": 600 + }, + { + "epoch": 0.015025, + "grad_norm": 0.9625332951545715, + "grad_norm_var": 0.024713862414948787, + "learning_rate": 0.0001, + "loss": 81661.8047, + "loss/crossentropy": 2.3075287342071533, + "loss/hidden": 0.392578125, + "loss/logits": 0.11193102598190308, + "loss/reg": 81661.296875, + "step": 601 + }, + { + "epoch": 0.01505, + "grad_norm": 1.1114728450775146, + "grad_norm_var": 0.025018770937851778, + "learning_rate": 0.0001, + "loss": 81382.1797, + "loss/crossentropy": 2.69278621673584, + "loss/hidden": 0.380859375, + "loss/logits": 0.09133666008710861, + "loss/reg": 81381.7109375, + "step": 602 + }, + { + "epoch": 0.015075, + "grad_norm": 0.9171868562698364, + "grad_norm_var": 0.018392723266432872, + "learning_rate": 0.0001, + "loss": 81098.6562, + "loss/crossentropy": 2.861598253250122, + "loss/hidden": 0.3671875, + "loss/logits": 0.09767521917819977, + "loss/reg": 81098.1875, + "step": 603 + }, + { + "epoch": 0.0151, + "grad_norm": 0.7906949520111084, + "grad_norm_var": 0.012678853894032234, + "learning_rate": 0.0001, + "loss": 80821.0938, + "loss/crossentropy": 2.511326789855957, + "loss/hidden": 0.380859375, + "loss/logits": 0.09441511332988739, + "loss/reg": 80820.6171875, + "step": 604 + }, + { + "epoch": 0.015125, + "grad_norm": 0.981177568435669, + "grad_norm_var": 0.012711037559269285, + "learning_rate": 0.0001, + "loss": 80546.1406, + "loss/crossentropy": 2.412504196166992, + "loss/hidden": 0.384765625, + "loss/logits": 0.09406470507383347, + "loss/reg": 80545.6640625, + "step": 605 + }, + { + "epoch": 0.01515, + "grad_norm": 1.2353384494781494, + "grad_norm_var": 0.014063660269151497, + "learning_rate": 0.0001, + "loss": 80270.0938, + "loss/crossentropy": 2.577329635620117, + "loss/hidden": 0.376953125, + "loss/logits": 0.09719591587781906, + "loss/reg": 80269.6171875, + "step": 606 + }, + { + "epoch": 0.015175, + "grad_norm": 1.114750623703003, + "grad_norm_var": 0.015581013741776673, + "learning_rate": 0.0001, + "loss": 79995.7422, + "loss/crossentropy": 2.8609046936035156, + "loss/hidden": 0.375, + "loss/logits": 0.09995967149734497, + "loss/reg": 79995.265625, + "step": 607 + }, + { + "epoch": 0.0152, + "grad_norm": 0.9995872378349304, + "grad_norm_var": 0.014489421651203335, + "learning_rate": 0.0001, + "loss": 79713.2734, + "loss/crossentropy": 2.586801767349243, + "loss/hidden": 0.39453125, + "loss/logits": 0.10346552729606628, + "loss/reg": 79712.7734375, + "step": 608 + }, + { + "epoch": 0.015225, + "grad_norm": 0.9819250106811523, + "grad_norm_var": 0.014387769896785689, + "learning_rate": 0.0001, + "loss": 79437.2422, + "loss/crossentropy": 2.422132730484009, + "loss/hidden": 0.380859375, + "loss/logits": 0.09481838345527649, + "loss/reg": 79436.765625, + "step": 609 + }, + { + "epoch": 0.01525, + "grad_norm": 1.50832998752594, + "grad_norm_var": 0.0296176065794699, + "learning_rate": 0.0001, + "loss": 79164.0312, + "loss/crossentropy": 2.769239664077759, + "loss/hidden": 0.423828125, + "loss/logits": 0.0994962751865387, + "loss/reg": 79163.5078125, + "step": 610 + }, + { + "epoch": 0.015275, + "grad_norm": 0.6924430727958679, + "grad_norm_var": 0.03614592535571727, + "learning_rate": 0.0001, + "loss": 78892.0547, + "loss/crossentropy": 2.398139476776123, + "loss/hidden": 0.365234375, + "loss/logits": 0.09804777801036835, + "loss/reg": 78891.59375, + "step": 611 + }, + { + "epoch": 0.0153, + "grad_norm": 0.72183758020401, + "grad_norm_var": 0.04105679580103398, + "learning_rate": 0.0001, + "loss": 78621.7109, + "loss/crossentropy": 2.3019931316375732, + "loss/hidden": 0.365234375, + "loss/logits": 0.0825042724609375, + "loss/reg": 78621.265625, + "step": 612 + }, + { + "epoch": 0.015325, + "grad_norm": 1.4297187328338623, + "grad_norm_var": 0.05293835745936608, + "learning_rate": 0.0001, + "loss": 78352.9922, + "loss/crossentropy": 2.805884599685669, + "loss/hidden": 0.388671875, + "loss/logits": 0.09254398196935654, + "loss/reg": 78352.5078125, + "step": 613 + }, + { + "epoch": 0.01535, + "grad_norm": 1.1380010843276978, + "grad_norm_var": 0.051401707871366976, + "learning_rate": 0.0001, + "loss": 78081.8672, + "loss/crossentropy": 2.540323257446289, + "loss/hidden": 0.380859375, + "loss/logits": 0.11731082946062088, + "loss/reg": 78081.3671875, + "step": 614 + }, + { + "epoch": 0.015375, + "grad_norm": 0.9623900055885315, + "grad_norm_var": 0.05081082889014064, + "learning_rate": 0.0001, + "loss": 77810.625, + "loss/crossentropy": 2.7089173793792725, + "loss/hidden": 0.404296875, + "loss/logits": 0.08911550790071487, + "loss/reg": 77810.1328125, + "step": 615 + }, + { + "epoch": 0.0154, + "grad_norm": 0.7471880912780762, + "grad_norm_var": 0.05518013577421012, + "learning_rate": 0.0001, + "loss": 77542.5547, + "loss/crossentropy": 2.477466583251953, + "loss/hidden": 0.3671875, + "loss/logits": 0.09427085518836975, + "loss/reg": 77542.09375, + "step": 616 + }, + { + "epoch": 0.015425, + "grad_norm": 0.8823354840278625, + "grad_norm_var": 0.056179618541439526, + "learning_rate": 0.0001, + "loss": 77277.7422, + "loss/crossentropy": 2.588768243789673, + "loss/hidden": 0.392578125, + "loss/logits": 0.11282309889793396, + "loss/reg": 77277.234375, + "step": 617 + }, + { + "epoch": 0.01545, + "grad_norm": 0.8651421070098877, + "grad_norm_var": 0.05675088538023313, + "learning_rate": 0.0001, + "loss": 77011.3125, + "loss/crossentropy": 2.5131797790527344, + "loss/hidden": 0.37109375, + "loss/logits": 0.09642226248979568, + "loss/reg": 77010.84375, + "step": 618 + }, + { + "epoch": 0.015475, + "grad_norm": 0.8223963379859924, + "grad_norm_var": 0.05833387600102255, + "learning_rate": 0.0001, + "loss": 76746.0625, + "loss/crossentropy": 2.4448771476745605, + "loss/hidden": 0.369140625, + "loss/logits": 0.09524812549352646, + "loss/reg": 76745.6015625, + "step": 619 + }, + { + "epoch": 0.0155, + "grad_norm": 1.0320149660110474, + "grad_norm_var": 0.055493867252299364, + "learning_rate": 0.0001, + "loss": 76476.75, + "loss/crossentropy": 2.574714183807373, + "loss/hidden": 0.390625, + "loss/logits": 0.09856754541397095, + "loss/reg": 76476.2578125, + "step": 620 + }, + { + "epoch": 0.015525, + "grad_norm": 1.0086482763290405, + "grad_norm_var": 0.05544586106039677, + "learning_rate": 0.0001, + "loss": 76212.7031, + "loss/crossentropy": 2.639218807220459, + "loss/hidden": 0.392578125, + "loss/logits": 0.11160705238580704, + "loss/reg": 76212.1953125, + "step": 621 + }, + { + "epoch": 0.01555, + "grad_norm": 0.8823007345199585, + "grad_norm_var": 0.05257570773830693, + "learning_rate": 0.0001, + "loss": 75951.4297, + "loss/crossentropy": 2.27701735496521, + "loss/hidden": 0.34375, + "loss/logits": 0.09013539552688599, + "loss/reg": 75950.9921875, + "step": 622 + }, + { + "epoch": 0.015575, + "grad_norm": 1.14113187789917, + "grad_norm_var": 0.05306922626909578, + "learning_rate": 0.0001, + "loss": 75692.5703, + "loss/crossentropy": 2.671344518661499, + "loss/hidden": 0.396484375, + "loss/logits": 0.10680802166461945, + "loss/reg": 75692.0703125, + "step": 623 + }, + { + "epoch": 0.0156, + "grad_norm": 0.8108043670654297, + "grad_norm_var": 0.05501662590291508, + "learning_rate": 0.0001, + "loss": 75429.5703, + "loss/crossentropy": 2.757770538330078, + "loss/hidden": 0.37890625, + "loss/logits": 0.08836045861244202, + "loss/reg": 75429.1015625, + "step": 624 + }, + { + "epoch": 0.015625, + "grad_norm": 0.8286943435668945, + "grad_norm_var": 0.05637659602012198, + "learning_rate": 0.0001, + "loss": 75170.6719, + "loss/crossentropy": 2.3864805698394775, + "loss/hidden": 0.373046875, + "loss/logits": 0.09211406856775284, + "loss/reg": 75170.203125, + "step": 625 + }, + { + "epoch": 0.01565, + "grad_norm": 1.0009613037109375, + "grad_norm_var": 0.03585083684432448, + "learning_rate": 0.0001, + "loss": 74912.9531, + "loss/crossentropy": 2.4257941246032715, + "loss/hidden": 0.392578125, + "loss/logits": 0.10842062532901764, + "loss/reg": 74912.453125, + "step": 626 + }, + { + "epoch": 0.015675, + "grad_norm": 1.0110238790512085, + "grad_norm_var": 0.031875046203807676, + "learning_rate": 0.0001, + "loss": 74653.6172, + "loss/crossentropy": 2.8187096118927, + "loss/hidden": 0.40625, + "loss/logits": 0.0988621786236763, + "loss/reg": 74653.109375, + "step": 627 + }, + { + "epoch": 0.0157, + "grad_norm": 1.074616551399231, + "grad_norm_var": 0.028672561024481746, + "learning_rate": 0.0001, + "loss": 74396.4531, + "loss/crossentropy": 2.6383824348449707, + "loss/hidden": 0.380859375, + "loss/logits": 0.09884043782949448, + "loss/reg": 74395.9765625, + "step": 628 + }, + { + "epoch": 0.015725, + "grad_norm": 1.1295496225357056, + "grad_norm_var": 0.016198375136296392, + "learning_rate": 0.0001, + "loss": 74140.1016, + "loss/crossentropy": 2.5384268760681152, + "loss/hidden": 0.384765625, + "loss/logits": 0.08849247545003891, + "loss/reg": 74139.625, + "step": 629 + }, + { + "epoch": 0.01575, + "grad_norm": 1.1694039106369019, + "grad_norm_var": 0.017011273804826255, + "learning_rate": 0.0001, + "loss": 73885.1016, + "loss/crossentropy": 2.6295008659362793, + "loss/hidden": 0.373046875, + "loss/logits": 0.0844004675745964, + "loss/reg": 73884.640625, + "step": 630 + }, + { + "epoch": 0.015775, + "grad_norm": 1.0288974046707153, + "grad_norm_var": 0.017304152285151454, + "learning_rate": 0.0001, + "loss": 73633.0469, + "loss/crossentropy": 2.3022804260253906, + "loss/hidden": 0.3671875, + "loss/logits": 0.08712589740753174, + "loss/reg": 73632.59375, + "step": 631 + }, + { + "epoch": 0.0158, + "grad_norm": 1.0885484218597412, + "grad_norm_var": 0.014687348514027516, + "learning_rate": 0.0001, + "loss": 73382.4141, + "loss/crossentropy": 2.400463104248047, + "loss/hidden": 0.380859375, + "loss/logits": 0.09456802904605865, + "loss/reg": 73381.9375, + "step": 632 + }, + { + "epoch": 0.015825, + "grad_norm": 0.6965525150299072, + "grad_norm_var": 0.019413162707278872, + "learning_rate": 0.0001, + "loss": 73129.8203, + "loss/crossentropy": 2.318265914916992, + "loss/hidden": 0.34375, + "loss/logits": 0.08227289468050003, + "loss/reg": 73129.390625, + "step": 633 + }, + { + "epoch": 0.01585, + "grad_norm": 0.971870481967926, + "grad_norm_var": 0.0185700529308282, + "learning_rate": 0.0001, + "loss": 72881.6953, + "loss/crossentropy": 2.730379104614258, + "loss/hidden": 0.380859375, + "loss/logits": 0.10445239394903183, + "loss/reg": 72881.2109375, + "step": 634 + }, + { + "epoch": 0.015875, + "grad_norm": 0.8180504441261292, + "grad_norm_var": 0.018663187891327895, + "learning_rate": 0.0001, + "loss": 72633.0391, + "loss/crossentropy": 2.577846050262451, + "loss/hidden": 0.376953125, + "loss/logits": 0.08856044709682465, + "loss/reg": 72632.5703125, + "step": 635 + }, + { + "epoch": 0.0159, + "grad_norm": 1.065203070640564, + "grad_norm_var": 0.01895858446336399, + "learning_rate": 0.0001, + "loss": 72386.2109, + "loss/crossentropy": 2.308788537979126, + "loss/hidden": 0.396484375, + "loss/logits": 0.09516717493534088, + "loss/reg": 72385.71875, + "step": 636 + }, + { + "epoch": 0.015925, + "grad_norm": 1.204892873764038, + "grad_norm_var": 0.022039542379022472, + "learning_rate": 0.0001, + "loss": 72130.0, + "loss/crossentropy": 3.0494790077209473, + "loss/hidden": 0.38671875, + "loss/logits": 0.08954030275344849, + "loss/reg": 72129.5234375, + "step": 637 + }, + { + "epoch": 0.01595, + "grad_norm": 0.8242658972740173, + "grad_norm_var": 0.02312331942637121, + "learning_rate": 0.0001, + "loss": 71880.1094, + "loss/crossentropy": 2.5868396759033203, + "loss/hidden": 0.37109375, + "loss/logits": 0.08857108652591705, + "loss/reg": 71879.6484375, + "step": 638 + }, + { + "epoch": 0.015975, + "grad_norm": 1.5980547666549683, + "grad_norm_var": 0.04528622991240388, + "learning_rate": 0.0001, + "loss": 71631.4375, + "loss/crossentropy": 2.7629361152648926, + "loss/hidden": 0.380859375, + "loss/logits": 0.09402740001678467, + "loss/reg": 71630.9609375, + "step": 639 + }, + { + "epoch": 0.016, + "grad_norm": 1.148252248764038, + "grad_norm_var": 0.04298691384647809, + "learning_rate": 0.0001, + "loss": 71380.9609, + "loss/crossentropy": 2.600595474243164, + "loss/hidden": 0.408203125, + "loss/logits": 0.10225421190261841, + "loss/reg": 71380.453125, + "step": 640 + }, + { + "epoch": 0.016025, + "grad_norm": 0.9008048176765442, + "grad_norm_var": 0.04126894208351336, + "learning_rate": 0.0001, + "loss": 71136.0469, + "loss/crossentropy": 2.4793899059295654, + "loss/hidden": 0.3828125, + "loss/logits": 0.09964653849601746, + "loss/reg": 71135.5625, + "step": 641 + }, + { + "epoch": 0.01605, + "grad_norm": 1.5230423212051392, + "grad_norm_var": 0.05519127779878321, + "learning_rate": 0.0001, + "loss": 70889.0625, + "loss/crossentropy": 2.731199264526367, + "loss/hidden": 0.42578125, + "loss/logits": 0.11591259390115738, + "loss/reg": 70888.5234375, + "step": 642 + }, + { + "epoch": 0.016075, + "grad_norm": 0.9102566838264465, + "grad_norm_var": 0.05672999381592079, + "learning_rate": 0.0001, + "loss": 70621.5625, + "loss/crossentropy": 2.760890245437622, + "loss/hidden": 0.396484375, + "loss/logits": 0.10278172791004181, + "loss/reg": 70621.0625, + "step": 643 + }, + { + "epoch": 0.0161, + "grad_norm": 1.081149935722351, + "grad_norm_var": 0.056734926693826286, + "learning_rate": 0.0001, + "loss": 70373.5, + "loss/crossentropy": 2.400461196899414, + "loss/hidden": 0.40625, + "loss/logits": 0.1075059324502945, + "loss/reg": 70372.984375, + "step": 644 + }, + { + "epoch": 0.016125, + "grad_norm": 1.0392025709152222, + "grad_norm_var": 0.05655694627189723, + "learning_rate": 0.0001, + "loss": 70126.2656, + "loss/crossentropy": 2.6415491104125977, + "loss/hidden": 0.380859375, + "loss/logits": 0.09059594571590424, + "loss/reg": 70125.796875, + "step": 645 + }, + { + "epoch": 0.01615, + "grad_norm": 0.9613660573959351, + "grad_norm_var": 0.05641525462659834, + "learning_rate": 0.0001, + "loss": 69883.6328, + "loss/crossentropy": 2.573458194732666, + "loss/hidden": 0.388671875, + "loss/logits": 0.08760470896959305, + "loss/reg": 69883.15625, + "step": 646 + }, + { + "epoch": 0.016175, + "grad_norm": 0.9740030169487, + "grad_norm_var": 0.056785681929638974, + "learning_rate": 0.0001, + "loss": 69640.0156, + "loss/crossentropy": 2.484391450881958, + "loss/hidden": 0.373046875, + "loss/logits": 0.09017197042703629, + "loss/reg": 69639.5546875, + "step": 647 + }, + { + "epoch": 0.0162, + "grad_norm": 0.9880450963973999, + "grad_norm_var": 0.0569050432928762, + "learning_rate": 0.0001, + "loss": 69395.2109, + "loss/crossentropy": 2.5021278858184814, + "loss/hidden": 0.375, + "loss/logits": 0.08895129710435867, + "loss/reg": 69394.75, + "step": 648 + }, + { + "epoch": 0.016225, + "grad_norm": 1.1739388704299927, + "grad_norm_var": 0.0490290639565254, + "learning_rate": 0.0001, + "loss": 69152.9688, + "loss/crossentropy": 2.2598655223846436, + "loss/hidden": 0.396484375, + "loss/logits": 0.09144014120101929, + "loss/reg": 69152.484375, + "step": 649 + }, + { + "epoch": 0.01625, + "grad_norm": 0.9586436748504639, + "grad_norm_var": 0.04921993477394175, + "learning_rate": 0.0001, + "loss": 68904.6719, + "loss/crossentropy": 2.626762866973877, + "loss/hidden": 0.38671875, + "loss/logits": 0.0878828912973404, + "loss/reg": 68904.1953125, + "step": 650 + }, + { + "epoch": 0.016275, + "grad_norm": 1.749466061592102, + "grad_norm_var": 0.07176990963569659, + "learning_rate": 0.0001, + "loss": 68660.1797, + "loss/crossentropy": 2.770278215408325, + "loss/hidden": 0.408203125, + "loss/logits": 0.09789373725652695, + "loss/reg": 68659.671875, + "step": 651 + }, + { + "epoch": 0.0163, + "grad_norm": 1.0988394021987915, + "grad_norm_var": 0.07154424731421181, + "learning_rate": 0.0001, + "loss": 68417.2422, + "loss/crossentropy": 2.7384262084960938, + "loss/hidden": 0.404296875, + "loss/logits": 0.10276539623737335, + "loss/reg": 68416.734375, + "step": 652 + }, + { + "epoch": 0.016325, + "grad_norm": 1.6708260774612427, + "grad_norm_var": 0.08955474275790679, + "learning_rate": 0.0001, + "loss": 68176.5625, + "loss/crossentropy": 2.5017573833465576, + "loss/hidden": 0.4140625, + "loss/logits": 0.10067269206047058, + "loss/reg": 68176.046875, + "step": 653 + }, + { + "epoch": 0.01635, + "grad_norm": 1.069149136543274, + "grad_norm_var": 0.08225869387716167, + "learning_rate": 0.0001, + "loss": 67935.7891, + "loss/crossentropy": 2.6416540145874023, + "loss/hidden": 0.390625, + "loss/logits": 0.10551495850086212, + "loss/reg": 67935.2890625, + "step": 654 + }, + { + "epoch": 0.016375, + "grad_norm": 0.8851730823516846, + "grad_norm_var": 0.07407705239805969, + "learning_rate": 0.0001, + "loss": 67690.7422, + "loss/crossentropy": 2.3436880111694336, + "loss/hidden": 0.373046875, + "loss/logits": 0.09293360263109207, + "loss/reg": 67690.2734375, + "step": 655 + }, + { + "epoch": 0.0164, + "grad_norm": 1.2111107110977173, + "grad_norm_var": 0.07444965399694643, + "learning_rate": 0.0001, + "loss": 67449.0312, + "loss/crossentropy": 2.6042330265045166, + "loss/hidden": 0.40234375, + "loss/logits": 0.09541107714176178, + "loss/reg": 67448.53125, + "step": 656 + }, + { + "epoch": 0.016425, + "grad_norm": 0.9232460260391235, + "grad_norm_var": 0.07377383114040156, + "learning_rate": 0.0001, + "loss": 67209.6172, + "loss/crossentropy": 2.577650785446167, + "loss/hidden": 0.37890625, + "loss/logits": 0.10014642030000687, + "loss/reg": 67209.140625, + "step": 657 + }, + { + "epoch": 0.01645, + "grad_norm": 1.239464282989502, + "grad_norm_var": 0.06426360887058274, + "learning_rate": 0.0001, + "loss": 66972.7734, + "loss/crossentropy": 2.5427756309509277, + "loss/hidden": 0.3984375, + "loss/logits": 0.0848100334405899, + "loss/reg": 66972.2890625, + "step": 658 + }, + { + "epoch": 0.016475, + "grad_norm": 1.168949842453003, + "grad_norm_var": 0.0611817648148463, + "learning_rate": 0.0001, + "loss": 66727.8984, + "loss/crossentropy": 2.9303178787231445, + "loss/hidden": 0.40625, + "loss/logits": 0.11067648977041245, + "loss/reg": 66727.3828125, + "step": 659 + }, + { + "epoch": 0.0165, + "grad_norm": 1.0272209644317627, + "grad_norm_var": 0.061765385112204155, + "learning_rate": 0.0001, + "loss": 66490.5938, + "loss/crossentropy": 2.40498423576355, + "loss/hidden": 0.408203125, + "loss/logits": 0.09394747018814087, + "loss/reg": 66490.09375, + "step": 660 + }, + { + "epoch": 0.016525, + "grad_norm": 1.2891979217529297, + "grad_norm_var": 0.06252279059305163, + "learning_rate": 0.0001, + "loss": 66257.8594, + "loss/crossentropy": 2.955780029296875, + "loss/hidden": 0.416015625, + "loss/logits": 0.10309955477714539, + "loss/reg": 66257.34375, + "step": 661 + }, + { + "epoch": 0.01655, + "grad_norm": 1.0556660890579224, + "grad_norm_var": 0.06071574024949246, + "learning_rate": 0.0001, + "loss": 66024.3906, + "loss/crossentropy": 2.543252944946289, + "loss/hidden": 0.3828125, + "loss/logits": 0.10051153600215912, + "loss/reg": 66023.90625, + "step": 662 + }, + { + "epoch": 0.016575, + "grad_norm": 1.3408629894256592, + "grad_norm_var": 0.06026498492915847, + "learning_rate": 0.0001, + "loss": 65792.9219, + "loss/crossentropy": 2.4806578159332275, + "loss/hidden": 0.390625, + "loss/logits": 0.09797219932079315, + "loss/reg": 65792.4296875, + "step": 663 + }, + { + "epoch": 0.0166, + "grad_norm": 1.8128626346588135, + "grad_norm_var": 0.08188244117617766, + "learning_rate": 0.0001, + "loss": 65555.9375, + "loss/crossentropy": 2.7088303565979004, + "loss/hidden": 0.4140625, + "loss/logits": 0.10116955637931824, + "loss/reg": 65555.421875, + "step": 664 + }, + { + "epoch": 0.016625, + "grad_norm": 1.0112638473510742, + "grad_norm_var": 0.08474505865472673, + "learning_rate": 0.0001, + "loss": 65322.957, + "loss/crossentropy": 2.596529722213745, + "loss/hidden": 0.384765625, + "loss/logits": 0.09807929396629333, + "loss/reg": 65322.47265625, + "step": 665 + }, + { + "epoch": 0.01665, + "grad_norm": 1.0874592065811157, + "grad_norm_var": 0.08130189736198545, + "learning_rate": 0.0001, + "loss": 65090.7266, + "loss/crossentropy": 2.3783011436462402, + "loss/hidden": 0.408203125, + "loss/logits": 0.094398632645607, + "loss/reg": 65090.22265625, + "step": 666 + }, + { + "epoch": 0.016675, + "grad_norm": 0.8276606798171997, + "grad_norm_var": 0.07026206154558637, + "learning_rate": 0.0001, + "loss": 64858.5859, + "loss/crossentropy": 2.398869037628174, + "loss/hidden": 0.376953125, + "loss/logits": 0.09084640443325043, + "loss/reg": 64858.1171875, + "step": 667 + }, + { + "epoch": 0.0167, + "grad_norm": 0.8844419121742249, + "grad_norm_var": 0.07516730394582714, + "learning_rate": 0.0001, + "loss": 64631.7969, + "loss/crossentropy": 2.5455760955810547, + "loss/hidden": 0.3828125, + "loss/logits": 0.09470289200544357, + "loss/reg": 64631.3203125, + "step": 668 + }, + { + "epoch": 0.016725, + "grad_norm": 0.8544840812683105, + "grad_norm_var": 0.060839842963105406, + "learning_rate": 0.0001, + "loss": 64403.6758, + "loss/crossentropy": 2.4772214889526367, + "loss/hidden": 0.390625, + "loss/logits": 0.09808434545993805, + "loss/reg": 64403.1875, + "step": 669 + }, + { + "epoch": 0.01675, + "grad_norm": 1.055088758468628, + "grad_norm_var": 0.060920371442028304, + "learning_rate": 0.0001, + "loss": 64176.7617, + "loss/crossentropy": 2.807988405227661, + "loss/hidden": 0.38671875, + "loss/logits": 0.08935073018074036, + "loss/reg": 64176.28515625, + "step": 670 + }, + { + "epoch": 0.016775, + "grad_norm": 0.8417630791664124, + "grad_norm_var": 0.06230839136668986, + "learning_rate": 0.0001, + "loss": 63948.7422, + "loss/crossentropy": 2.2669050693511963, + "loss/hidden": 0.3828125, + "loss/logits": 0.09268943965435028, + "loss/reg": 63948.265625, + "step": 671 + }, + { + "epoch": 0.0168, + "grad_norm": 0.9983002543449402, + "grad_norm_var": 0.062040693678979374, + "learning_rate": 0.0001, + "loss": 63724.1562, + "loss/crossentropy": 2.5512285232543945, + "loss/hidden": 0.375, + "loss/logits": 0.09171494841575623, + "loss/reg": 63723.69140625, + "step": 672 + }, + { + "epoch": 0.016825, + "grad_norm": 1.0449855327606201, + "grad_norm_var": 0.06028262317672586, + "learning_rate": 0.0001, + "loss": 63500.082, + "loss/crossentropy": 2.4477298259735107, + "loss/hidden": 0.396484375, + "loss/logits": 0.1075860857963562, + "loss/reg": 63499.578125, + "step": 673 + }, + { + "epoch": 0.01685, + "grad_norm": 0.907045841217041, + "grad_norm_var": 0.06084048188357287, + "learning_rate": 0.0001, + "loss": 63268.3047, + "loss/crossentropy": 2.5063469409942627, + "loss/hidden": 0.375, + "loss/logits": 0.09125159680843353, + "loss/reg": 63267.83984375, + "step": 674 + }, + { + "epoch": 0.016875, + "grad_norm": 1.0878510475158691, + "grad_norm_var": 0.06024055160010679, + "learning_rate": 0.0001, + "loss": 63049.1797, + "loss/crossentropy": 2.737177610397339, + "loss/hidden": 0.388671875, + "loss/logits": 0.09512165188789368, + "loss/reg": 63048.6953125, + "step": 675 + }, + { + "epoch": 0.0169, + "grad_norm": 1.0298672914505005, + "grad_norm_var": 0.06022575925050888, + "learning_rate": 0.0001, + "loss": 62829.1172, + "loss/crossentropy": 2.5746004581451416, + "loss/hidden": 0.404296875, + "loss/logits": 0.10296541452407837, + "loss/reg": 62828.609375, + "step": 676 + }, + { + "epoch": 0.016925, + "grad_norm": 1.0156315565109253, + "grad_norm_var": 0.05692787604339444, + "learning_rate": 0.0001, + "loss": 62606.5117, + "loss/crossentropy": 2.4797537326812744, + "loss/hidden": 0.384765625, + "loss/logits": 0.10525722801685333, + "loss/reg": 62606.0234375, + "step": 677 + }, + { + "epoch": 0.01695, + "grad_norm": 1.086360216140747, + "grad_norm_var": 0.05699581968737051, + "learning_rate": 0.0001, + "loss": 62389.1406, + "loss/crossentropy": 2.767706871032715, + "loss/hidden": 0.392578125, + "loss/logits": 0.09870638698339462, + "loss/reg": 62388.6484375, + "step": 678 + }, + { + "epoch": 0.016975, + "grad_norm": 0.9417318105697632, + "grad_norm_var": 0.05175923504988777, + "learning_rate": 0.0001, + "loss": 62164.2461, + "loss/crossentropy": 2.4770090579986572, + "loss/hidden": 0.39453125, + "loss/logits": 0.0980028361082077, + "loss/reg": 62163.75390625, + "step": 679 + }, + { + "epoch": 0.017, + "grad_norm": 1.042153000831604, + "grad_norm_var": 0.008479507941950818, + "learning_rate": 0.0001, + "loss": 61943.375, + "loss/crossentropy": 2.5883591175079346, + "loss/hidden": 0.40234375, + "loss/logits": 0.11304843425750732, + "loss/reg": 61942.859375, + "step": 680 + }, + { + "epoch": 0.017025, + "grad_norm": 1.0394786596298218, + "grad_norm_var": 0.008638391318302381, + "learning_rate": 0.0001, + "loss": 61725.0859, + "loss/crossentropy": 2.566668748855591, + "loss/hidden": 0.392578125, + "loss/logits": 0.10709897428750992, + "loss/reg": 61724.5859375, + "step": 681 + }, + { + "epoch": 0.01705, + "grad_norm": 0.9527825713157654, + "grad_norm_var": 0.007914538507960363, + "learning_rate": 0.0001, + "loss": 61510.9727, + "loss/crossentropy": 2.6995317935943604, + "loss/hidden": 0.392578125, + "loss/logits": 0.09625931084156036, + "loss/reg": 61510.484375, + "step": 682 + }, + { + "epoch": 0.017075, + "grad_norm": 0.9171804785728455, + "grad_norm_var": 0.006649581285260438, + "learning_rate": 0.0001, + "loss": 61296.6094, + "loss/crossentropy": 2.4217958450317383, + "loss/hidden": 0.396484375, + "loss/logits": 0.08172821253538132, + "loss/reg": 61296.1328125, + "step": 683 + }, + { + "epoch": 0.0171, + "grad_norm": 0.927759051322937, + "grad_norm_var": 0.006208036346572439, + "learning_rate": 0.0001, + "loss": 61084.2578, + "loss/crossentropy": 2.4237263202667236, + "loss/hidden": 0.373046875, + "loss/logits": 0.08073138445615768, + "loss/reg": 61083.8046875, + "step": 684 + }, + { + "epoch": 0.017125, + "grad_norm": 0.9073169827461243, + "grad_norm_var": 0.005470809927099637, + "learning_rate": 0.0001, + "loss": 60875.1406, + "loss/crossentropy": 2.6566085815429688, + "loss/hidden": 0.3828125, + "loss/logits": 0.09275166690349579, + "loss/reg": 60874.6640625, + "step": 685 + }, + { + "epoch": 0.01715, + "grad_norm": 1.0408992767333984, + "grad_norm_var": 0.0053549642915791265, + "learning_rate": 0.0001, + "loss": 60647.8398, + "loss/crossentropy": 2.3780102729797363, + "loss/hidden": 0.396484375, + "loss/logits": 0.09495106339454651, + "loss/reg": 60647.34765625, + "step": 686 + }, + { + "epoch": 0.017175, + "grad_norm": 0.8319820165634155, + "grad_norm_var": 0.005549465233255546, + "learning_rate": 0.0001, + "loss": 60435.8242, + "loss/crossentropy": 2.619631767272949, + "loss/hidden": 0.3828125, + "loss/logits": 0.08472824096679688, + "loss/reg": 60435.35546875, + "step": 687 + }, + { + "epoch": 0.0172, + "grad_norm": 0.7369972467422485, + "grad_norm_var": 0.009378194651902054, + "learning_rate": 0.0001, + "loss": 60226.2422, + "loss/crossentropy": 2.189018487930298, + "loss/hidden": 0.365234375, + "loss/logits": 0.08153541386127472, + "loss/reg": 60225.796875, + "step": 688 + }, + { + "epoch": 0.017225, + "grad_norm": 1.2337210178375244, + "grad_norm_var": 0.013507195588811511, + "learning_rate": 0.0001, + "loss": 60016.4414, + "loss/crossentropy": 2.679335117340088, + "loss/hidden": 0.416015625, + "loss/logits": 0.1080087423324585, + "loss/reg": 60015.91796875, + "step": 689 + }, + { + "epoch": 0.01725, + "grad_norm": 1.1025989055633545, + "grad_norm_var": 0.013964501931947012, + "learning_rate": 0.0001, + "loss": 59804.4219, + "loss/crossentropy": 2.2785794734954834, + "loss/hidden": 0.400390625, + "loss/logits": 0.09753485023975372, + "loss/reg": 59803.92578125, + "step": 690 + }, + { + "epoch": 0.017275, + "grad_norm": 1.3840587139129639, + "grad_norm_var": 0.023178690900109666, + "learning_rate": 0.0001, + "loss": 59593.8125, + "loss/crossentropy": 2.383326530456543, + "loss/hidden": 0.419921875, + "loss/logits": 0.12039785087108612, + "loss/reg": 59593.2734375, + "step": 691 + }, + { + "epoch": 0.0173, + "grad_norm": 1.042061686515808, + "grad_norm_var": 0.023217186149151425, + "learning_rate": 0.0001, + "loss": 59383.8281, + "loss/crossentropy": 2.518843173980713, + "loss/hidden": 0.392578125, + "loss/logits": 0.09896533191204071, + "loss/reg": 59383.3359375, + "step": 692 + }, + { + "epoch": 0.017325, + "grad_norm": 0.9572771191596985, + "grad_norm_var": 0.023406967698831653, + "learning_rate": 0.0001, + "loss": 59175.8281, + "loss/crossentropy": 2.5718486309051514, + "loss/hidden": 0.384765625, + "loss/logits": 0.09524203091859818, + "loss/reg": 59175.34765625, + "step": 693 + }, + { + "epoch": 0.01735, + "grad_norm": 0.9643810987472534, + "grad_norm_var": 0.023079086485804843, + "learning_rate": 0.0001, + "loss": 58969.3438, + "loss/crossentropy": 2.4464197158813477, + "loss/hidden": 0.375, + "loss/logits": 0.08171060681343079, + "loss/reg": 58968.88671875, + "step": 694 + }, + { + "epoch": 0.017375, + "grad_norm": 0.7643433809280396, + "grad_norm_var": 0.02645698184280339, + "learning_rate": 0.0001, + "loss": 58760.3047, + "loss/crossentropy": 2.5445268154144287, + "loss/hidden": 0.37109375, + "loss/logits": 0.08663898706436157, + "loss/reg": 58759.84765625, + "step": 695 + }, + { + "epoch": 0.0174, + "grad_norm": 1.0930417776107788, + "grad_norm_var": 0.026970586420081825, + "learning_rate": 0.0001, + "loss": 58553.8359, + "loss/crossentropy": 2.5427513122558594, + "loss/hidden": 0.400390625, + "loss/logits": 0.08456844091415405, + "loss/reg": 58553.3515625, + "step": 696 + }, + { + "epoch": 0.017425, + "grad_norm": 1.2481635808944702, + "grad_norm_var": 0.030971972800994032, + "learning_rate": 0.0001, + "loss": 58347.457, + "loss/crossentropy": 3.4390416145324707, + "loss/hidden": 0.423828125, + "loss/logits": 0.10264578461647034, + "loss/reg": 58346.9296875, + "step": 697 + }, + { + "epoch": 0.01745, + "grad_norm": 1.5968362092971802, + "grad_norm_var": 0.052281338212470024, + "learning_rate": 0.0001, + "loss": 58131.2969, + "loss/crossentropy": 2.9014079570770264, + "loss/hidden": 0.470703125, + "loss/logits": 0.1580468714237213, + "loss/reg": 58130.66796875, + "step": 698 + }, + { + "epoch": 0.017475, + "grad_norm": 1.3415122032165527, + "grad_norm_var": 0.05620201146869827, + "learning_rate": 0.0001, + "loss": 57911.957, + "loss/crossentropy": 2.282113552093506, + "loss/hidden": 0.435546875, + "loss/logits": 0.11629819869995117, + "loss/reg": 57911.40625, + "step": 699 + }, + { + "epoch": 0.0175, + "grad_norm": 0.9442893266677856, + "grad_norm_var": 0.055898291300073545, + "learning_rate": 0.0001, + "loss": 57706.0586, + "loss/crossentropy": 2.5043060779571533, + "loss/hidden": 0.37890625, + "loss/logits": 0.08389831334352493, + "loss/reg": 57705.59765625, + "step": 700 + }, + { + "epoch": 0.017525, + "grad_norm": 0.8646631240844727, + "grad_norm_var": 0.0569619053459128, + "learning_rate": 0.0001, + "loss": 57504.4492, + "loss/crossentropy": 2.701007127761841, + "loss/hidden": 0.396484375, + "loss/logits": 0.09037187695503235, + "loss/reg": 57503.9609375, + "step": 701 + }, + { + "epoch": 0.01755, + "grad_norm": 0.956434965133667, + "grad_norm_var": 0.057754406244796865, + "learning_rate": 0.0001, + "loss": 57301.9492, + "loss/crossentropy": 2.4940638542175293, + "loss/hidden": 0.39453125, + "loss/logits": 0.09759010374546051, + "loss/reg": 57301.45703125, + "step": 702 + }, + { + "epoch": 0.017575, + "grad_norm": 0.9836230278015137, + "grad_norm_var": 0.054451990386576776, + "learning_rate": 0.0001, + "loss": 57096.0312, + "loss/crossentropy": 2.60841703414917, + "loss/hidden": 0.41015625, + "loss/logits": 0.109431691467762, + "loss/reg": 57095.51171875, + "step": 703 + }, + { + "epoch": 0.0176, + "grad_norm": 0.8200333714485168, + "grad_norm_var": 0.05113104603631058, + "learning_rate": 0.0001, + "loss": 56892.293, + "loss/crossentropy": 2.5531857013702393, + "loss/hidden": 0.404296875, + "loss/logits": 0.08870550990104675, + "loss/reg": 56891.80078125, + "step": 704 + }, + { + "epoch": 0.017625, + "grad_norm": 0.8858484625816345, + "grad_norm_var": 0.05161385658113162, + "learning_rate": 0.0001, + "loss": 56688.5117, + "loss/crossentropy": 2.4622130393981934, + "loss/hidden": 0.3984375, + "loss/logits": 0.09578855335712433, + "loss/reg": 56688.015625, + "step": 705 + }, + { + "epoch": 0.01765, + "grad_norm": 0.9303566217422485, + "grad_norm_var": 0.05247420907563337, + "learning_rate": 0.0001, + "loss": 56485.6719, + "loss/crossentropy": 2.9453697204589844, + "loss/hidden": 0.3984375, + "loss/logits": 0.10039514303207397, + "loss/reg": 56485.171875, + "step": 706 + }, + { + "epoch": 0.017675, + "grad_norm": 1.0896490812301636, + "grad_norm_var": 0.04472156313393646, + "learning_rate": 0.0001, + "loss": 56285.3008, + "loss/crossentropy": 2.870180130004883, + "loss/hidden": 0.3984375, + "loss/logits": 0.08954556286334991, + "loss/reg": 56284.8125, + "step": 707 + }, + { + "epoch": 0.0177, + "grad_norm": 0.925428032875061, + "grad_norm_var": 0.04538664754722578, + "learning_rate": 0.0001, + "loss": 56084.5703, + "loss/crossentropy": 2.3981821537017822, + "loss/hidden": 0.384765625, + "loss/logits": 0.09831464290618896, + "loss/reg": 56084.0859375, + "step": 708 + }, + { + "epoch": 0.017725, + "grad_norm": 0.9551652669906616, + "grad_norm_var": 0.04540539527495232, + "learning_rate": 0.0001, + "loss": 55886.6172, + "loss/crossentropy": 2.4851620197296143, + "loss/hidden": 0.408203125, + "loss/logits": 0.09828799217939377, + "loss/reg": 55886.109375, + "step": 709 + }, + { + "epoch": 0.01775, + "grad_norm": 1.3149915933609009, + "grad_norm_var": 0.050360417836195145, + "learning_rate": 0.0001, + "loss": 55688.0508, + "loss/crossentropy": 2.6232731342315674, + "loss/hidden": 0.423828125, + "loss/logits": 0.10853007435798645, + "loss/reg": 55687.51953125, + "step": 710 + }, + { + "epoch": 0.017775, + "grad_norm": 1.0394874811172485, + "grad_norm_var": 0.04480868587850016, + "learning_rate": 0.0001, + "loss": 55489.6914, + "loss/crossentropy": 2.5964436531066895, + "loss/hidden": 0.39453125, + "loss/logits": 0.10230209678411484, + "loss/reg": 55489.1953125, + "step": 711 + }, + { + "epoch": 0.0178, + "grad_norm": 1.346766710281372, + "grad_norm_var": 0.04988758520215928, + "learning_rate": 0.0001, + "loss": 55292.3594, + "loss/crossentropy": 2.8174843788146973, + "loss/hidden": 0.40625, + "loss/logits": 0.10428360104560852, + "loss/reg": 55291.84765625, + "step": 712 + }, + { + "epoch": 0.017825, + "grad_norm": 1.1590180397033691, + "grad_norm_var": 0.04835816219355919, + "learning_rate": 0.0001, + "loss": 55098.832, + "loss/crossentropy": 2.6847729682922363, + "loss/hidden": 0.388671875, + "loss/logits": 0.09854494780302048, + "loss/reg": 55098.34375, + "step": 713 + }, + { + "epoch": 0.01785, + "grad_norm": 0.8711786866188049, + "grad_norm_var": 0.03050188085823664, + "learning_rate": 0.0001, + "loss": 54903.668, + "loss/crossentropy": 2.5830821990966797, + "loss/hidden": 0.388671875, + "loss/logits": 0.09112709015607834, + "loss/reg": 54903.1875, + "step": 714 + }, + { + "epoch": 0.017875, + "grad_norm": 1.0323970317840576, + "grad_norm_var": 0.02350200522760937, + "learning_rate": 0.0001, + "loss": 54711.5156, + "loss/crossentropy": 2.497689962387085, + "loss/hidden": 0.390625, + "loss/logits": 0.10076910257339478, + "loss/reg": 54711.0234375, + "step": 715 + }, + { + "epoch": 0.0179, + "grad_norm": 0.903690755367279, + "grad_norm_var": 0.02394696248407584, + "learning_rate": 0.0001, + "loss": 54519.4961, + "loss/crossentropy": 2.614609479904175, + "loss/hidden": 0.37890625, + "loss/logits": 0.08881643414497375, + "loss/reg": 54519.02734375, + "step": 716 + }, + { + "epoch": 0.017925, + "grad_norm": 1.0168510675430298, + "grad_norm_var": 0.02254846601204997, + "learning_rate": 0.0001, + "loss": 54330.6914, + "loss/crossentropy": 2.3349671363830566, + "loss/hidden": 0.384765625, + "loss/logits": 0.08678834140300751, + "loss/reg": 54330.21875, + "step": 717 + }, + { + "epoch": 0.01795, + "grad_norm": 0.8937983512878418, + "grad_norm_var": 0.023278044427324042, + "learning_rate": 0.0001, + "loss": 54141.4766, + "loss/crossentropy": 2.5494956970214844, + "loss/hidden": 0.39453125, + "loss/logits": 0.09560904651880264, + "loss/reg": 54140.98828125, + "step": 718 + }, + { + "epoch": 0.017975, + "grad_norm": 0.8298357725143433, + "grad_norm_var": 0.025307676785486895, + "learning_rate": 0.0001, + "loss": 53953.5859, + "loss/crossentropy": 2.6559760570526123, + "loss/hidden": 0.3828125, + "loss/logits": 0.09700687229633331, + "loss/reg": 53953.10546875, + "step": 719 + }, + { + "epoch": 0.018, + "grad_norm": 0.8975862264633179, + "grad_norm_var": 0.023813287575312345, + "learning_rate": 0.0001, + "loss": 53766.7773, + "loss/crossentropy": 2.5280327796936035, + "loss/hidden": 0.39453125, + "loss/logits": 0.10120564699172974, + "loss/reg": 53766.28125, + "step": 720 + }, + { + "epoch": 0.018025, + "grad_norm": 0.9913856983184814, + "grad_norm_var": 0.02282216583665496, + "learning_rate": 0.0001, + "loss": 53579.6211, + "loss/crossentropy": 2.3738551139831543, + "loss/hidden": 0.40625, + "loss/logits": 0.10090280324220657, + "loss/reg": 53579.11328125, + "step": 721 + }, + { + "epoch": 0.01805, + "grad_norm": 0.872941255569458, + "grad_norm_var": 0.023655882749711074, + "learning_rate": 0.0001, + "loss": 53392.3008, + "loss/crossentropy": 2.528557777404785, + "loss/hidden": 0.3828125, + "loss/logits": 0.10077151656150818, + "loss/reg": 53391.81640625, + "step": 722 + }, + { + "epoch": 0.018075, + "grad_norm": 1.0608396530151367, + "grad_norm_var": 0.02339704359274458, + "learning_rate": 0.0001, + "loss": 53203.9492, + "loss/crossentropy": 2.8958404064178467, + "loss/hidden": 0.388671875, + "loss/logits": 0.09159500896930695, + "loss/reg": 53203.46875, + "step": 723 + }, + { + "epoch": 0.0181, + "grad_norm": 0.9737719893455505, + "grad_norm_var": 0.02301757035596806, + "learning_rate": 0.0001, + "loss": 53018.8477, + "loss/crossentropy": 2.9025018215179443, + "loss/hidden": 0.396484375, + "loss/logits": 0.09637045860290527, + "loss/reg": 53018.35546875, + "step": 724 + }, + { + "epoch": 0.018125, + "grad_norm": 1.0595719814300537, + "grad_norm_var": 0.022935775569963925, + "learning_rate": 0.0001, + "loss": 52824.4766, + "loss/crossentropy": 2.671370029449463, + "loss/hidden": 0.40234375, + "loss/logits": 0.09349768608808517, + "loss/reg": 52823.98046875, + "step": 725 + }, + { + "epoch": 0.01815, + "grad_norm": 1.1013160943984985, + "grad_norm_var": 0.017285506348992032, + "learning_rate": 0.0001, + "loss": 52636.5117, + "loss/crossentropy": 2.4408681392669678, + "loss/hidden": 0.3984375, + "loss/logits": 0.09394092112779617, + "loss/reg": 52636.01953125, + "step": 726 + }, + { + "epoch": 0.018175, + "grad_norm": 1.0487664937973022, + "grad_norm_var": 0.017335841551676372, + "learning_rate": 0.0001, + "loss": 52449.5781, + "loss/crossentropy": 2.4244089126586914, + "loss/hidden": 0.404296875, + "loss/logits": 0.09841768443584442, + "loss/reg": 52449.07421875, + "step": 727 + }, + { + "epoch": 0.0182, + "grad_norm": 1.3364503383636475, + "grad_norm_var": 0.016870643779337263, + "learning_rate": 0.0001, + "loss": 52264.5, + "loss/crossentropy": 2.7356204986572266, + "loss/hidden": 0.400390625, + "loss/logits": 0.094476617872715, + "loss/reg": 52264.00390625, + "step": 728 + }, + { + "epoch": 0.018225, + "grad_norm": 1.2551052570343018, + "grad_norm_var": 0.01944541553566581, + "learning_rate": 0.0001, + "loss": 52080.7422, + "loss/crossentropy": 2.345259666442871, + "loss/hidden": 0.404296875, + "loss/logits": 0.10854319483041763, + "loss/reg": 52080.23046875, + "step": 729 + }, + { + "epoch": 0.01825, + "grad_norm": 0.994271457195282, + "grad_norm_var": 0.01812890564488113, + "learning_rate": 0.0001, + "loss": 51897.2383, + "loss/crossentropy": 2.5534796714782715, + "loss/hidden": 0.400390625, + "loss/logits": 0.09998567402362823, + "loss/reg": 51896.73828125, + "step": 730 + }, + { + "epoch": 0.018275, + "grad_norm": 1.036136507987976, + "grad_norm_var": 0.018137563129258405, + "learning_rate": 0.0001, + "loss": 51714.25, + "loss/crossentropy": 2.641772985458374, + "loss/hidden": 0.380859375, + "loss/logits": 0.09617019444704056, + "loss/reg": 51713.7734375, + "step": 731 + }, + { + "epoch": 0.0183, + "grad_norm": 0.9567418098449707, + "grad_norm_var": 0.01751183305709052, + "learning_rate": 0.0001, + "loss": 51531.0078, + "loss/crossentropy": 2.4739558696746826, + "loss/hidden": 0.380859375, + "loss/logits": 0.08270092308521271, + "loss/reg": 51530.54296875, + "step": 732 + }, + { + "epoch": 0.018325, + "grad_norm": 0.8940505981445312, + "grad_norm_var": 0.018511384254257735, + "learning_rate": 0.0001, + "loss": 51347.918, + "loss/crossentropy": 2.1752779483795166, + "loss/hidden": 0.388671875, + "loss/logits": 0.09863855689764023, + "loss/reg": 51347.4296875, + "step": 733 + }, + { + "epoch": 0.01835, + "grad_norm": 1.2298671007156372, + "grad_norm_var": 0.020244154044657077, + "learning_rate": 0.0001, + "loss": 51164.9102, + "loss/crossentropy": 2.7591755390167236, + "loss/hidden": 0.4296875, + "loss/logits": 0.11854663491249084, + "loss/reg": 51164.36328125, + "step": 734 + }, + { + "epoch": 0.018375, + "grad_norm": 1.0282326936721802, + "grad_norm_var": 0.017312361786124993, + "learning_rate": 0.0001, + "loss": 50983.2852, + "loss/crossentropy": 2.5418176651000977, + "loss/hidden": 0.4140625, + "loss/logits": 0.094975546002388, + "loss/reg": 50982.77734375, + "step": 735 + }, + { + "epoch": 0.0184, + "grad_norm": 1.6109849214553833, + "grad_norm_var": 0.034997721610513996, + "learning_rate": 0.0001, + "loss": 50794.875, + "loss/crossentropy": 2.864858627319336, + "loss/hidden": 0.421875, + "loss/logits": 0.09372644126415253, + "loss/reg": 50794.359375, + "step": 736 + }, + { + "epoch": 0.018425, + "grad_norm": 1.2849129438400269, + "grad_norm_var": 0.03649762544530522, + "learning_rate": 0.0001, + "loss": 50608.5859, + "loss/crossentropy": 2.546605348587036, + "loss/hidden": 0.40625, + "loss/logits": 0.10290227085351944, + "loss/reg": 50608.078125, + "step": 737 + }, + { + "epoch": 0.01845, + "grad_norm": 1.8890656232833862, + "grad_norm_var": 0.06904774488376778, + "learning_rate": 0.0001, + "loss": 50429.8125, + "loss/crossentropy": 2.8703765869140625, + "loss/hidden": 0.443359375, + "loss/logits": 0.10859087854623795, + "loss/reg": 50429.26171875, + "step": 738 + }, + { + "epoch": 0.018475, + "grad_norm": 1.1573561429977417, + "grad_norm_var": 0.0681929487293542, + "learning_rate": 0.0001, + "loss": 50247.793, + "loss/crossentropy": 2.488643169403076, + "loss/hidden": 0.4140625, + "loss/logits": 0.09659995883703232, + "loss/reg": 50247.28125, + "step": 739 + }, + { + "epoch": 0.0185, + "grad_norm": 1.135470986366272, + "grad_norm_var": 0.06541238942306339, + "learning_rate": 0.0001, + "loss": 50067.6562, + "loss/crossentropy": 2.4839093685150146, + "loss/hidden": 0.41796875, + "loss/logits": 0.1023077517747879, + "loss/reg": 50067.13671875, + "step": 740 + }, + { + "epoch": 0.018525, + "grad_norm": 0.9855368733406067, + "grad_norm_var": 0.06702907714801304, + "learning_rate": 0.0001, + "loss": 49887.3594, + "loss/crossentropy": 2.554145097732544, + "loss/hidden": 0.408203125, + "loss/logits": 0.08684562146663666, + "loss/reg": 49886.86328125, + "step": 741 + }, + { + "epoch": 0.01855, + "grad_norm": 0.9068771004676819, + "grad_norm_var": 0.07153601226928949, + "learning_rate": 0.0001, + "loss": 49709.043, + "loss/crossentropy": 2.65455961227417, + "loss/hidden": 0.392578125, + "loss/logits": 0.10292993485927582, + "loss/reg": 49708.546875, + "step": 742 + }, + { + "epoch": 0.018575, + "grad_norm": 0.9256609678268433, + "grad_norm_var": 0.07450373194885765, + "learning_rate": 0.0001, + "loss": 49530.3398, + "loss/crossentropy": 2.7470786571502686, + "loss/hidden": 0.412109375, + "loss/logits": 0.09205284714698792, + "loss/reg": 49529.8359375, + "step": 743 + }, + { + "epoch": 0.0186, + "grad_norm": 1.1988563537597656, + "grad_norm_var": 0.07252635823950025, + "learning_rate": 0.0001, + "loss": 49353.957, + "loss/crossentropy": 2.5547876358032227, + "loss/hidden": 0.435546875, + "loss/logits": 0.10985640436410904, + "loss/reg": 49353.41015625, + "step": 744 + }, + { + "epoch": 0.018625, + "grad_norm": 0.8618044853210449, + "grad_norm_var": 0.07697458673344261, + "learning_rate": 0.0001, + "loss": 49178.1562, + "loss/crossentropy": 2.3190128803253174, + "loss/hidden": 0.3984375, + "loss/logits": 0.09546959400177002, + "loss/reg": 49177.6640625, + "step": 745 + }, + { + "epoch": 0.01865, + "grad_norm": 1.092405080795288, + "grad_norm_var": 0.07578759425521137, + "learning_rate": 0.0001, + "loss": 49006.7148, + "loss/crossentropy": 2.772601842880249, + "loss/hidden": 0.42578125, + "loss/logits": 0.10889579355716705, + "loss/reg": 49006.1796875, + "step": 746 + }, + { + "epoch": 0.018675, + "grad_norm": 0.8835573196411133, + "grad_norm_var": 0.0792970681340984, + "learning_rate": 0.0001, + "loss": 48833.25, + "loss/crossentropy": 2.623377799987793, + "loss/hidden": 0.3984375, + "loss/logits": 0.09446098655462265, + "loss/reg": 48832.7578125, + "step": 747 + }, + { + "epoch": 0.0187, + "grad_norm": 2.9753003120422363, + "grad_norm_var": 0.28797693666548735, + "learning_rate": 0.0001, + "loss": 48662.4414, + "loss/crossentropy": 2.5330989360809326, + "loss/hidden": 0.466796875, + "loss/logits": 0.19345475733280182, + "loss/reg": 48661.78125, + "step": 748 + }, + { + "epoch": 0.018725, + "grad_norm": 1.1538830995559692, + "grad_norm_var": 0.2797350796604114, + "learning_rate": 0.0001, + "loss": 48490.5547, + "loss/crossentropy": 2.716426372528076, + "loss/hidden": 0.419921875, + "loss/logits": 0.09423987567424774, + "loss/reg": 48490.0390625, + "step": 749 + }, + { + "epoch": 0.01875, + "grad_norm": 0.8942911624908447, + "grad_norm_var": 0.28856832743585786, + "learning_rate": 0.0001, + "loss": 48314.0898, + "loss/crossentropy": 2.4474997520446777, + "loss/hidden": 0.40234375, + "loss/logits": 0.11440010368824005, + "loss/reg": 48313.57421875, + "step": 750 + }, + { + "epoch": 0.018775, + "grad_norm": 0.9876633882522583, + "grad_norm_var": 0.28986544400916986, + "learning_rate": 0.0001, + "loss": 48141.3477, + "loss/crossentropy": 2.454270839691162, + "loss/hidden": 0.384765625, + "loss/logits": 0.0848151445388794, + "loss/reg": 48140.87890625, + "step": 751 + }, + { + "epoch": 0.0188, + "grad_norm": 0.9700044393539429, + "grad_norm_var": 0.2843915862496468, + "learning_rate": 0.0001, + "loss": 47969.8867, + "loss/crossentropy": 2.5182764530181885, + "loss/hidden": 0.404296875, + "loss/logits": 0.10214034467935562, + "loss/reg": 47969.37890625, + "step": 752 + }, + { + "epoch": 0.018825, + "grad_norm": 1.2348213195800781, + "grad_norm_var": 0.28402413344607497, + "learning_rate": 0.0001, + "loss": 47798.5234, + "loss/crossentropy": 2.8100967407226562, + "loss/hidden": 0.439453125, + "loss/logits": 0.1163613498210907, + "loss/reg": 47797.96875, + "step": 753 + }, + { + "epoch": 0.01885, + "grad_norm": 0.9870022535324097, + "grad_norm_var": 0.2523991444644473, + "learning_rate": 0.0001, + "loss": 47626.3047, + "loss/crossentropy": 2.3341073989868164, + "loss/hidden": 0.41796875, + "loss/logits": 0.10574543476104736, + "loss/reg": 47625.78125, + "step": 754 + }, + { + "epoch": 0.018875, + "grad_norm": 1.1098355054855347, + "grad_norm_var": 0.25247406778994563, + "learning_rate": 0.0001, + "loss": 47454.8125, + "loss/crossentropy": 2.5026051998138428, + "loss/hidden": 0.423828125, + "loss/logits": 0.09431763738393784, + "loss/reg": 47454.29296875, + "step": 755 + }, + { + "epoch": 0.0189, + "grad_norm": 0.9873075485229492, + "grad_norm_var": 0.25401331385406956, + "learning_rate": 0.0001, + "loss": 47285.4258, + "loss/crossentropy": 2.3226237297058105, + "loss/hidden": 0.419921875, + "loss/logits": 0.09147959202528, + "loss/reg": 47284.9140625, + "step": 756 + }, + { + "epoch": 0.018925, + "grad_norm": 0.8810727596282959, + "grad_norm_var": 0.2567726447010756, + "learning_rate": 0.0001, + "loss": 47116.3984, + "loss/crossentropy": 2.390197992324829, + "loss/hidden": 0.392578125, + "loss/logits": 0.08687802404165268, + "loss/reg": 47115.91796875, + "step": 757 + }, + { + "epoch": 0.01895, + "grad_norm": 0.9770220518112183, + "grad_norm_var": 0.25501070736492004, + "learning_rate": 0.0001, + "loss": 46947.9883, + "loss/crossentropy": 2.5187008380889893, + "loss/hidden": 0.396484375, + "loss/logits": 0.0867588073015213, + "loss/reg": 46947.50390625, + "step": 758 + }, + { + "epoch": 0.018975, + "grad_norm": 1.1931730508804321, + "grad_norm_var": 0.25210469699044175, + "learning_rate": 0.0001, + "loss": 46780.5078, + "loss/crossentropy": 2.5223076343536377, + "loss/hidden": 0.388671875, + "loss/logits": 0.09107695519924164, + "loss/reg": 46780.02734375, + "step": 759 + }, + { + "epoch": 0.019, + "grad_norm": 0.8697965145111084, + "grad_norm_var": 0.25669575978909187, + "learning_rate": 0.0001, + "loss": 46615.582, + "loss/crossentropy": 2.593898296356201, + "loss/hidden": 0.416015625, + "loss/logits": 0.09482245147228241, + "loss/reg": 46615.0703125, + "step": 760 + }, + { + "epoch": 0.019025, + "grad_norm": 0.9785832166671753, + "grad_norm_var": 0.2533926460845291, + "learning_rate": 0.0001, + "loss": 46452.4062, + "loss/crossentropy": 2.3558356761932373, + "loss/hidden": 0.4140625, + "loss/logits": 0.09463217854499817, + "loss/reg": 46451.8984375, + "step": 761 + }, + { + "epoch": 0.01905, + "grad_norm": 0.8423299193382263, + "grad_norm_var": 0.258754260549059, + "learning_rate": 0.0001, + "loss": 46287.6953, + "loss/crossentropy": 2.4062812328338623, + "loss/hidden": 0.40625, + "loss/logits": 0.09871132671833038, + "loss/reg": 46287.19140625, + "step": 762 + }, + { + "epoch": 0.019075, + "grad_norm": 0.9173998236656189, + "grad_norm_var": 0.25775734273993856, + "learning_rate": 0.0001, + "loss": 46124.8672, + "loss/crossentropy": 2.4662153720855713, + "loss/hidden": 0.396484375, + "loss/logits": 0.10506129264831543, + "loss/reg": 46124.3671875, + "step": 763 + }, + { + "epoch": 0.0191, + "grad_norm": 0.8769504427909851, + "grad_norm_var": 0.014563933054716201, + "learning_rate": 0.0001, + "loss": 45961.957, + "loss/crossentropy": 2.389399290084839, + "loss/hidden": 0.380859375, + "loss/logits": 0.08493934571743011, + "loss/reg": 45961.4921875, + "step": 764 + }, + { + "epoch": 0.019125, + "grad_norm": 0.8609285950660706, + "grad_norm_var": 0.013578057378831923, + "learning_rate": 0.0001, + "loss": 45799.5195, + "loss/crossentropy": 2.642210006713867, + "loss/hidden": 0.396484375, + "loss/logits": 0.09388473629951477, + "loss/reg": 45799.02734375, + "step": 765 + }, + { + "epoch": 0.01915, + "grad_norm": 0.8592740297317505, + "grad_norm_var": 0.014022235668175101, + "learning_rate": 0.0001, + "loss": 45639.2656, + "loss/crossentropy": 2.7636122703552246, + "loss/hidden": 0.400390625, + "loss/logits": 0.10290414094924927, + "loss/reg": 45638.76171875, + "step": 766 + }, + { + "epoch": 0.019175, + "grad_norm": 1.0137566328048706, + "grad_norm_var": 0.014123379321947441, + "learning_rate": 0.0001, + "loss": 45479.9531, + "loss/crossentropy": 2.917463779449463, + "loss/hidden": 0.423828125, + "loss/logits": 0.09564699232578278, + "loss/reg": 45479.43359375, + "step": 767 + }, + { + "epoch": 0.0192, + "grad_norm": 0.886622965335846, + "grad_norm_var": 0.014585137676438467, + "learning_rate": 0.0001, + "loss": 45319.2539, + "loss/crossentropy": 2.5581212043762207, + "loss/hidden": 0.392578125, + "loss/logits": 0.09170180559158325, + "loss/reg": 45318.76953125, + "step": 768 + }, + { + "epoch": 0.019225, + "grad_norm": 0.9484365582466125, + "grad_norm_var": 0.009493744584728781, + "learning_rate": 0.0001, + "loss": 45158.2812, + "loss/crossentropy": 2.7654075622558594, + "loss/hidden": 0.38671875, + "loss/logits": 0.08669186383485794, + "loss/reg": 45157.80859375, + "step": 769 + }, + { + "epoch": 0.01925, + "grad_norm": 1.2060688734054565, + "grad_norm_var": 0.013593108736964095, + "learning_rate": 0.0001, + "loss": 44999.1328, + "loss/crossentropy": 2.64562726020813, + "loss/hidden": 0.40234375, + "loss/logits": 0.09077286720275879, + "loss/reg": 44998.640625, + "step": 770 + }, + { + "epoch": 0.019275, + "grad_norm": 1.0256370306015015, + "grad_norm_var": 0.012388143634409174, + "learning_rate": 0.0001, + "loss": 44841.543, + "loss/crossentropy": 2.2263450622558594, + "loss/hidden": 0.4140625, + "loss/logits": 0.09414076805114746, + "loss/reg": 44841.03515625, + "step": 771 + }, + { + "epoch": 0.0193, + "grad_norm": 0.8258554935455322, + "grad_norm_var": 0.013381517293385947, + "learning_rate": 0.0001, + "loss": 44683.5742, + "loss/crossentropy": 2.7400131225585938, + "loss/hidden": 0.39453125, + "loss/logits": 0.10209719091653824, + "loss/reg": 44683.078125, + "step": 772 + }, + { + "epoch": 0.019325, + "grad_norm": 1.02664315700531, + "grad_norm_var": 0.013413099005024283, + "learning_rate": 0.0001, + "loss": 44527.4648, + "loss/crossentropy": 2.6620962619781494, + "loss/hidden": 0.419921875, + "loss/logits": 0.10463500022888184, + "loss/reg": 44526.94140625, + "step": 773 + }, + { + "epoch": 0.01935, + "grad_norm": 0.7048200964927673, + "grad_norm_var": 0.017309307493440434, + "learning_rate": 0.0001, + "loss": 44371.5391, + "loss/crossentropy": 2.5526435375213623, + "loss/hidden": 0.3828125, + "loss/logits": 0.08704051375389099, + "loss/reg": 44371.0703125, + "step": 774 + }, + { + "epoch": 0.019375, + "grad_norm": 1.1765838861465454, + "grad_norm_var": 0.01676600216286676, + "learning_rate": 0.0001, + "loss": 44216.4805, + "loss/crossentropy": 2.7540805339813232, + "loss/hidden": 0.3984375, + "loss/logits": 0.09850931912660599, + "loss/reg": 44215.984375, + "step": 775 + }, + { + "epoch": 0.0194, + "grad_norm": 0.9540759921073914, + "grad_norm_var": 0.01643531272497493, + "learning_rate": 0.0001, + "loss": 44060.5078, + "loss/crossentropy": 2.4934980869293213, + "loss/hidden": 0.384765625, + "loss/logits": 0.09054020047187805, + "loss/reg": 44060.03125, + "step": 776 + }, + { + "epoch": 0.019425, + "grad_norm": 1.2347862720489502, + "grad_norm_var": 0.021719260912863508, + "learning_rate": 0.0001, + "loss": 43907.1484, + "loss/crossentropy": 2.648806571960449, + "loss/hidden": 0.373046875, + "loss/logits": 0.07670263200998306, + "loss/reg": 43906.69921875, + "step": 777 + }, + { + "epoch": 0.01945, + "grad_norm": 1.1064521074295044, + "grad_norm_var": 0.021935016499496845, + "learning_rate": 0.0001, + "loss": 43753.8047, + "loss/crossentropy": 2.6057424545288086, + "loss/hidden": 0.41796875, + "loss/logits": 0.1008932888507843, + "loss/reg": 43753.28515625, + "step": 778 + }, + { + "epoch": 0.019475, + "grad_norm": 0.8709408640861511, + "grad_norm_var": 0.02243612940938117, + "learning_rate": 0.0001, + "loss": 43601.9141, + "loss/crossentropy": 2.2285828590393066, + "loss/hidden": 0.40234375, + "loss/logits": 0.09650826454162598, + "loss/reg": 43601.4140625, + "step": 779 + }, + { + "epoch": 0.0195, + "grad_norm": 0.7791092991828918, + "grad_norm_var": 0.024295465395644304, + "learning_rate": 0.0001, + "loss": 43450.8594, + "loss/crossentropy": 2.456794500350952, + "loss/hidden": 0.3828125, + "loss/logits": 0.08525663614273071, + "loss/reg": 43450.390625, + "step": 780 + }, + { + "epoch": 0.019525, + "grad_norm": 0.9531808495521545, + "grad_norm_var": 0.023516516246377708, + "learning_rate": 0.0001, + "loss": 43299.4375, + "loss/crossentropy": 2.855341672897339, + "loss/hidden": 0.40625, + "loss/logits": 0.09789630770683289, + "loss/reg": 43298.93359375, + "step": 781 + }, + { + "epoch": 0.01955, + "grad_norm": 0.9563007354736328, + "grad_norm_var": 0.02263021016615583, + "learning_rate": 0.0001, + "loss": 43147.2109, + "loss/crossentropy": 2.42067813873291, + "loss/hidden": 0.4140625, + "loss/logits": 0.10505105555057526, + "loss/reg": 43146.69140625, + "step": 782 + }, + { + "epoch": 0.019575, + "grad_norm": 1.034609079360962, + "grad_norm_var": 0.02275310556488711, + "learning_rate": 0.0001, + "loss": 42994.3594, + "loss/crossentropy": 2.3903796672821045, + "loss/hidden": 0.404296875, + "loss/logits": 0.08946970105171204, + "loss/reg": 42993.8671875, + "step": 783 + }, + { + "epoch": 0.0196, + "grad_norm": 0.997693657875061, + "grad_norm_var": 0.02213191900271004, + "learning_rate": 0.0001, + "loss": 42843.2031, + "loss/crossentropy": 2.542088508605957, + "loss/hidden": 0.40625, + "loss/logits": 0.11267843097448349, + "loss/reg": 42842.68359375, + "step": 784 + }, + { + "epoch": 0.019625, + "grad_norm": 0.8993119597434998, + "grad_norm_var": 0.02253909786148013, + "learning_rate": 0.0001, + "loss": 42692.3203, + "loss/crossentropy": 2.712129592895508, + "loss/hidden": 0.3984375, + "loss/logits": 0.08530149608850479, + "loss/reg": 42691.8359375, + "step": 785 + }, + { + "epoch": 0.01965, + "grad_norm": 0.7412862181663513, + "grad_norm_var": 0.0223099495310689, + "learning_rate": 0.0001, + "loss": 42542.2109, + "loss/crossentropy": 2.539766788482666, + "loss/hidden": 0.40234375, + "loss/logits": 0.10429234802722931, + "loss/reg": 42541.703125, + "step": 786 + }, + { + "epoch": 0.019675, + "grad_norm": 0.8172112703323364, + "grad_norm_var": 0.023074676517893862, + "learning_rate": 0.0001, + "loss": 42390.5508, + "loss/crossentropy": 2.2713160514831543, + "loss/hidden": 0.400390625, + "loss/logits": 0.09550666809082031, + "loss/reg": 42390.0546875, + "step": 787 + }, + { + "epoch": 0.0197, + "grad_norm": 0.9562167525291443, + "grad_norm_var": 0.02211058591684741, + "learning_rate": 0.0001, + "loss": 42238.7461, + "loss/crossentropy": 2.5588197708129883, + "loss/hidden": 0.388671875, + "loss/logits": 0.09026595205068588, + "loss/reg": 42238.265625, + "step": 788 + }, + { + "epoch": 0.019725, + "grad_norm": 1.0070997476577759, + "grad_norm_var": 0.021936243600945163, + "learning_rate": 0.0001, + "loss": 42086.7109, + "loss/crossentropy": 2.6279189586639404, + "loss/hidden": 0.39453125, + "loss/logits": 0.08974668383598328, + "loss/reg": 42086.2265625, + "step": 789 + }, + { + "epoch": 0.01975, + "grad_norm": 0.924180269241333, + "grad_norm_var": 0.017791513662725324, + "learning_rate": 0.0001, + "loss": 41936.4023, + "loss/crossentropy": 2.5648348331451416, + "loss/hidden": 0.388671875, + "loss/logits": 0.08369086682796478, + "loss/reg": 41935.9296875, + "step": 790 + }, + { + "epoch": 0.019775, + "grad_norm": 0.8046944737434387, + "grad_norm_var": 0.015847980469538867, + "learning_rate": 0.0001, + "loss": 41785.668, + "loss/crossentropy": 2.410979986190796, + "loss/hidden": 0.38671875, + "loss/logits": 0.09604763239622116, + "loss/reg": 41785.18359375, + "step": 791 + }, + { + "epoch": 0.0198, + "grad_norm": 0.7474042177200317, + "grad_norm_var": 0.018124766264193903, + "learning_rate": 0.0001, + "loss": 41635.7461, + "loss/crossentropy": 2.548205852508545, + "loss/hidden": 0.37890625, + "loss/logits": 0.08188501000404358, + "loss/reg": 41635.28515625, + "step": 792 + }, + { + "epoch": 0.019825, + "grad_norm": 0.8547723889350891, + "grad_norm_var": 0.01155053097809707, + "learning_rate": 0.0001, + "loss": 41487.7812, + "loss/crossentropy": 2.542738199234009, + "loss/hidden": 0.3828125, + "loss/logits": 0.08695337921380997, + "loss/reg": 41487.3125, + "step": 793 + }, + { + "epoch": 0.01985, + "grad_norm": 1.011048674583435, + "grad_norm_var": 0.009533349350237034, + "learning_rate": 0.0001, + "loss": 41338.7852, + "loss/crossentropy": 2.647214889526367, + "loss/hidden": 0.421875, + "loss/logits": 0.10188287496566772, + "loss/reg": 41338.26171875, + "step": 794 + }, + { + "epoch": 0.019875, + "grad_norm": 0.9825843572616577, + "grad_norm_var": 0.009921607801237053, + "learning_rate": 0.0001, + "loss": 41189.8125, + "loss/crossentropy": 2.245267629623413, + "loss/hidden": 0.439453125, + "loss/logits": 0.10342052578926086, + "loss/reg": 41189.26953125, + "step": 795 + }, + { + "epoch": 0.0199, + "grad_norm": 0.9716632962226868, + "grad_norm_var": 0.00902815695549628, + "learning_rate": 0.0001, + "loss": 41040.1719, + "loss/crossentropy": 2.4565536975860596, + "loss/hidden": 0.4296875, + "loss/logits": 0.10906583070755005, + "loss/reg": 41039.6328125, + "step": 796 + }, + { + "epoch": 0.019925, + "grad_norm": 0.9376354813575745, + "grad_norm_var": 0.008966617327267067, + "learning_rate": 0.0001, + "loss": 40893.4688, + "loss/crossentropy": 2.5155081748962402, + "loss/hidden": 0.427734375, + "loss/logits": 0.09355796873569489, + "loss/reg": 40892.94921875, + "step": 797 + }, + { + "epoch": 0.01995, + "grad_norm": 1.3822612762451172, + "grad_norm_var": 0.022639252263949538, + "learning_rate": 0.0001, + "loss": 40746.293, + "loss/crossentropy": 2.3780517578125, + "loss/hidden": 0.416015625, + "loss/logits": 0.11463729292154312, + "loss/reg": 40745.76171875, + "step": 798 + }, + { + "epoch": 0.019975, + "grad_norm": 0.8885180354118347, + "grad_norm_var": 0.022166417254800307, + "learning_rate": 0.0001, + "loss": 40600.6367, + "loss/crossentropy": 2.2522025108337402, + "loss/hidden": 0.4140625, + "loss/logits": 0.11032229661941528, + "loss/reg": 40600.11328125, + "step": 799 + }, + { + "epoch": 0.02, + "grad_norm": 1.1068843603134155, + "grad_norm_var": 0.023857459775651315, + "learning_rate": 0.0001, + "loss": 40454.9258, + "loss/crossentropy": 2.6258373260498047, + "loss/hidden": 0.44140625, + "loss/logits": 0.09626742452383041, + "loss/reg": 40454.38671875, + "step": 800 + }, + { + "epoch": 0.020025, + "grad_norm": 0.9436428546905518, + "grad_norm_var": 0.023742458152267654, + "learning_rate": 0.0001, + "loss": 40309.0195, + "loss/crossentropy": 2.503645896911621, + "loss/hidden": 0.384765625, + "loss/logits": 0.09005875140428543, + "loss/reg": 40308.54296875, + "step": 801 + }, + { + "epoch": 0.02005, + "grad_norm": 1.5282552242279053, + "grad_norm_var": 0.04135576468478863, + "learning_rate": 0.0001, + "loss": 40164.7656, + "loss/crossentropy": 2.4166529178619385, + "loss/hidden": 0.44140625, + "loss/logits": 0.108254574239254, + "loss/reg": 40164.21484375, + "step": 802 + }, + { + "epoch": 0.020075, + "grad_norm": 1.1386072635650635, + "grad_norm_var": 0.04034277163636801, + "learning_rate": 0.0001, + "loss": 40020.0742, + "loss/crossentropy": 2.5895628929138184, + "loss/hidden": 0.3984375, + "loss/logits": 0.09454331547021866, + "loss/reg": 40019.58203125, + "step": 803 + }, + { + "epoch": 0.0201, + "grad_norm": 0.9719331860542297, + "grad_norm_var": 0.04024216977857732, + "learning_rate": 0.0001, + "loss": 39875.0859, + "loss/crossentropy": 2.553748846054077, + "loss/hidden": 0.4140625, + "loss/logits": 0.08906621485948563, + "loss/reg": 39874.58203125, + "step": 804 + }, + { + "epoch": 0.020125, + "grad_norm": 1.075038194656372, + "grad_norm_var": 0.04048105789431939, + "learning_rate": 0.0001, + "loss": 39732.1992, + "loss/crossentropy": 2.573615789413452, + "loss/hidden": 0.416015625, + "loss/logits": 0.09945542365312576, + "loss/reg": 39731.68359375, + "step": 805 + }, + { + "epoch": 0.02015, + "grad_norm": 1.2552541494369507, + "grad_norm_var": 0.043242255039610634, + "learning_rate": 0.0001, + "loss": 39588.8477, + "loss/crossentropy": 2.787565231323242, + "loss/hidden": 0.4296875, + "loss/logits": 0.1017235741019249, + "loss/reg": 39588.31640625, + "step": 806 + }, + { + "epoch": 0.020175, + "grad_norm": 0.9056074023246765, + "grad_norm_var": 0.0407461410468116, + "learning_rate": 0.0001, + "loss": 39446.2148, + "loss/crossentropy": 2.705078601837158, + "loss/hidden": 0.408203125, + "loss/logits": 0.09658461809158325, + "loss/reg": 39445.7109375, + "step": 807 + }, + { + "epoch": 0.0202, + "grad_norm": 0.9769497513771057, + "grad_norm_var": 0.0349672339588764, + "learning_rate": 0.0001, + "loss": 39305.0352, + "loss/crossentropy": 2.384089708328247, + "loss/hidden": 0.42578125, + "loss/logits": 0.1048816367983818, + "loss/reg": 39304.50390625, + "step": 808 + }, + { + "epoch": 0.020225, + "grad_norm": 0.9730967879295349, + "grad_norm_var": 0.03263341861796503, + "learning_rate": 0.0001, + "loss": 39162.4141, + "loss/crossentropy": 2.619083881378174, + "loss/hidden": 0.423828125, + "loss/logits": 0.1224329024553299, + "loss/reg": 39161.8671875, + "step": 809 + }, + { + "epoch": 0.02025, + "grad_norm": 0.903662919998169, + "grad_norm_var": 0.03413466668993359, + "learning_rate": 0.0001, + "loss": 39020.543, + "loss/crossentropy": 2.0379161834716797, + "loss/hidden": 0.38671875, + "loss/logits": 0.07385985553264618, + "loss/reg": 39020.08203125, + "step": 810 + }, + { + "epoch": 0.020275, + "grad_norm": 0.8019759654998779, + "grad_norm_var": 0.038009932433176734, + "learning_rate": 0.0001, + "loss": 38882.8398, + "loss/crossentropy": 2.404892921447754, + "loss/hidden": 0.392578125, + "loss/logits": 0.09625999629497528, + "loss/reg": 38882.3515625, + "step": 811 + }, + { + "epoch": 0.0203, + "grad_norm": 1.1033836603164673, + "grad_norm_var": 0.03776134243067393, + "learning_rate": 0.0001, + "loss": 38746.8789, + "loss/crossentropy": 2.6194732189178467, + "loss/hidden": 0.4140625, + "loss/logits": 0.08940684050321579, + "loss/reg": 38746.375, + "step": 812 + }, + { + "epoch": 0.020325, + "grad_norm": 0.9457865357398987, + "grad_norm_var": 0.037637079205596, + "learning_rate": 0.0001, + "loss": 38602.4922, + "loss/crossentropy": 2.4452311992645264, + "loss/hidden": 0.400390625, + "loss/logits": 0.099249929189682, + "loss/reg": 38601.9921875, + "step": 813 + }, + { + "epoch": 0.02035, + "grad_norm": 0.9055691957473755, + "grad_norm_var": 0.031121762090745856, + "learning_rate": 0.0001, + "loss": 38465.1914, + "loss/crossentropy": 2.4715399742126465, + "loss/hidden": 0.40234375, + "loss/logits": 0.10231912136077881, + "loss/reg": 38464.6875, + "step": 814 + }, + { + "epoch": 0.020375, + "grad_norm": 1.168678879737854, + "grad_norm_var": 0.03087272107349374, + "learning_rate": 0.0001, + "loss": 38328.918, + "loss/crossentropy": 2.5399720668792725, + "loss/hidden": 0.42578125, + "loss/logits": 0.10657870769500732, + "loss/reg": 38328.38671875, + "step": 815 + }, + { + "epoch": 0.0204, + "grad_norm": 1.066002368927002, + "grad_norm_var": 0.03063451244188884, + "learning_rate": 0.0001, + "loss": 38191.5703, + "loss/crossentropy": 2.6654410362243652, + "loss/hidden": 0.431640625, + "loss/logits": 0.10600557923316956, + "loss/reg": 38191.03125, + "step": 816 + }, + { + "epoch": 0.020425, + "grad_norm": 1.0306800603866577, + "grad_norm_var": 0.02997275433461438, + "learning_rate": 0.0001, + "loss": 38056.9844, + "loss/crossentropy": 2.3844645023345947, + "loss/hidden": 0.39453125, + "loss/logits": 0.08257506787776947, + "loss/reg": 38056.5078125, + "step": 817 + }, + { + "epoch": 0.02045, + "grad_norm": 1.1823235750198364, + "grad_norm_var": 0.015250149322878655, + "learning_rate": 0.0001, + "loss": 37920.3672, + "loss/crossentropy": 2.38747501373291, + "loss/hidden": 0.412109375, + "loss/logits": 0.10297746956348419, + "loss/reg": 37919.8515625, + "step": 818 + }, + { + "epoch": 0.020475, + "grad_norm": 1.275691270828247, + "grad_norm_var": 0.018495951763542337, + "learning_rate": 0.0001, + "loss": 37784.2227, + "loss/crossentropy": 3.0008273124694824, + "loss/hidden": 0.4296875, + "loss/logits": 0.09561491012573242, + "loss/reg": 37783.69921875, + "step": 819 + }, + { + "epoch": 0.0205, + "grad_norm": 0.8513725996017456, + "grad_norm_var": 0.020399711250510145, + "learning_rate": 0.0001, + "loss": 37646.6133, + "loss/crossentropy": 2.615851640701294, + "loss/hidden": 0.423828125, + "loss/logits": 0.11105596274137497, + "loss/reg": 37646.078125, + "step": 820 + }, + { + "epoch": 0.020525, + "grad_norm": 1.0822184085845947, + "grad_norm_var": 0.020449577202075336, + "learning_rate": 0.0001, + "loss": 37501.4727, + "loss/crossentropy": 2.793099880218506, + "loss/hidden": 0.423828125, + "loss/logits": 0.10729962587356567, + "loss/reg": 37500.94140625, + "step": 821 + }, + { + "epoch": 0.02055, + "grad_norm": 0.9906818866729736, + "grad_norm_var": 0.016764260003671202, + "learning_rate": 0.0001, + "loss": 37365.0977, + "loss/crossentropy": 2.5842463970184326, + "loss/hidden": 0.419921875, + "loss/logits": 0.09130015224218369, + "loss/reg": 37364.5859375, + "step": 822 + }, + { + "epoch": 0.020575, + "grad_norm": 0.7514151334762573, + "grad_norm_var": 0.02040114788076412, + "learning_rate": 0.0001, + "loss": 37230.4766, + "loss/crossentropy": 2.2744431495666504, + "loss/hidden": 0.380859375, + "loss/logits": 0.07892940938472748, + "loss/reg": 37230.015625, + "step": 823 + }, + { + "epoch": 0.0206, + "grad_norm": 1.0638694763183594, + "grad_norm_var": 0.020599328445768526, + "learning_rate": 0.0001, + "loss": 37097.5586, + "loss/crossentropy": 2.402249336242676, + "loss/hidden": 0.42578125, + "loss/logits": 0.10117131471633911, + "loss/reg": 37097.03125, + "step": 824 + }, + { + "epoch": 0.020625, + "grad_norm": 0.8464297652244568, + "grad_norm_var": 0.022158243548479048, + "learning_rate": 0.0001, + "loss": 36962.1289, + "loss/crossentropy": 2.519179344177246, + "loss/hidden": 0.419921875, + "loss/logits": 0.1031673401594162, + "loss/reg": 36961.60546875, + "step": 825 + }, + { + "epoch": 0.02065, + "grad_norm": 1.0223678350448608, + "grad_norm_var": 0.021544095854699682, + "learning_rate": 0.0001, + "loss": 36831.0234, + "loss/crossentropy": 2.6199722290039062, + "loss/hidden": 0.42578125, + "loss/logits": 0.1116126999258995, + "loss/reg": 36830.484375, + "step": 826 + }, + { + "epoch": 0.020675, + "grad_norm": 0.7614914178848267, + "grad_norm_var": 0.02274529440476408, + "learning_rate": 0.0001, + "loss": 36698.6719, + "loss/crossentropy": 2.5858829021453857, + "loss/hidden": 0.388671875, + "loss/logits": 0.07992447167634964, + "loss/reg": 36698.203125, + "step": 827 + }, + { + "epoch": 0.0207, + "grad_norm": 0.8607364892959595, + "grad_norm_var": 0.023177362648318874, + "learning_rate": 0.0001, + "loss": 36569.9805, + "loss/crossentropy": 2.647308588027954, + "loss/hidden": 0.41015625, + "loss/logits": 0.09774725884199142, + "loss/reg": 36569.47265625, + "step": 828 + }, + { + "epoch": 0.020725, + "grad_norm": 0.7578085660934448, + "grad_norm_var": 0.02643966563970855, + "learning_rate": 0.0001, + "loss": 36439.7461, + "loss/crossentropy": 2.263488531112671, + "loss/hidden": 0.392578125, + "loss/logits": 0.08892738819122314, + "loss/reg": 36439.265625, + "step": 829 + }, + { + "epoch": 0.02075, + "grad_norm": 1.2861242294311523, + "grad_norm_var": 0.03191310292394432, + "learning_rate": 0.0001, + "loss": 36311.9961, + "loss/crossentropy": 2.9907784461975098, + "loss/hidden": 0.41796875, + "loss/logits": 0.10988874733448029, + "loss/reg": 36311.46875, + "step": 830 + }, + { + "epoch": 0.020775, + "grad_norm": 1.1022545099258423, + "grad_norm_var": 0.030693779916705907, + "learning_rate": 0.0001, + "loss": 36182.2773, + "loss/crossentropy": 2.7796716690063477, + "loss/hidden": 0.412109375, + "loss/logits": 0.11308954656124115, + "loss/reg": 36181.75390625, + "step": 831 + }, + { + "epoch": 0.0208, + "grad_norm": 0.8730552792549133, + "grad_norm_var": 0.031212378059443265, + "learning_rate": 0.0001, + "loss": 36055.8984, + "loss/crossentropy": 2.366497039794922, + "loss/hidden": 0.390625, + "loss/logits": 0.09841403365135193, + "loss/reg": 36055.41015625, + "step": 832 + }, + { + "epoch": 0.020825, + "grad_norm": 0.9390790462493896, + "grad_norm_var": 0.03116249149287403, + "learning_rate": 0.0001, + "loss": 35918.8398, + "loss/crossentropy": 2.5822160243988037, + "loss/hidden": 0.408203125, + "loss/logits": 0.09956912696361542, + "loss/reg": 35918.33203125, + "step": 833 + }, + { + "epoch": 0.02085, + "grad_norm": 1.1822123527526855, + "grad_norm_var": 0.03115946122038332, + "learning_rate": 0.0001, + "loss": 35790.5547, + "loss/crossentropy": 2.655499219894409, + "loss/hidden": 0.421875, + "loss/logits": 0.09676744788885117, + "loss/reg": 35790.03515625, + "step": 834 + }, + { + "epoch": 0.020875, + "grad_norm": 1.1304856538772583, + "grad_norm_var": 0.026712285164661863, + "learning_rate": 0.0001, + "loss": 35663.8516, + "loss/crossentropy": 2.5775606632232666, + "loss/hidden": 0.421875, + "loss/logits": 0.09897801280021667, + "loss/reg": 35663.33203125, + "step": 835 + }, + { + "epoch": 0.0209, + "grad_norm": 0.9108983278274536, + "grad_norm_var": 0.026001350495837273, + "learning_rate": 0.0001, + "loss": 35537.7773, + "loss/crossentropy": 2.584146022796631, + "loss/hidden": 0.392578125, + "loss/logits": 0.08021892607212067, + "loss/reg": 35537.3046875, + "step": 836 + }, + { + "epoch": 0.020925, + "grad_norm": 0.7505898475646973, + "grad_norm_var": 0.028026628359788693, + "learning_rate": 0.0001, + "loss": 35412.0938, + "loss/crossentropy": 2.5469448566436768, + "loss/hidden": 0.39453125, + "loss/logits": 0.09684520214796066, + "loss/reg": 35411.6015625, + "step": 837 + }, + { + "epoch": 0.02095, + "grad_norm": 0.981681764125824, + "grad_norm_var": 0.027985084585314117, + "learning_rate": 0.0001, + "loss": 35287.625, + "loss/crossentropy": 2.333712339401245, + "loss/hidden": 0.4375, + "loss/logits": 0.12408019602298737, + "loss/reg": 35287.0625, + "step": 838 + }, + { + "epoch": 0.020975, + "grad_norm": 0.7908355593681335, + "grad_norm_var": 0.027031700151352804, + "learning_rate": 0.0001, + "loss": 35161.8086, + "loss/crossentropy": 2.6098623275756836, + "loss/hidden": 0.404296875, + "loss/logits": 0.1000092625617981, + "loss/reg": 35161.3046875, + "step": 839 + }, + { + "epoch": 0.021, + "grad_norm": 0.7975010871887207, + "grad_norm_var": 0.02755505059063334, + "learning_rate": 0.0001, + "loss": 35037.0391, + "loss/crossentropy": 2.4856231212615967, + "loss/hidden": 0.390625, + "loss/logits": 0.08976828306913376, + "loss/reg": 35036.55859375, + "step": 840 + }, + { + "epoch": 0.021025, + "grad_norm": 0.8434867262840271, + "grad_norm_var": 0.02759117022010192, + "learning_rate": 0.0001, + "loss": 34913.6719, + "loss/crossentropy": 2.45410418510437, + "loss/hidden": 0.41796875, + "loss/logits": 0.11138058453798294, + "loss/reg": 34913.140625, + "step": 841 + }, + { + "epoch": 0.02105, + "grad_norm": 0.7404183745384216, + "grad_norm_var": 0.029347114537091306, + "learning_rate": 0.0001, + "loss": 34790.2539, + "loss/crossentropy": 2.7616987228393555, + "loss/hidden": 0.40234375, + "loss/logits": 0.10481090843677521, + "loss/reg": 34789.74609375, + "step": 842 + }, + { + "epoch": 0.021075, + "grad_norm": 0.7900260090827942, + "grad_norm_var": 0.028797636490651493, + "learning_rate": 0.0001, + "loss": 34668.2109, + "loss/crossentropy": 2.348297357559204, + "loss/hidden": 0.3984375, + "loss/logits": 0.10565009713172913, + "loss/reg": 34667.70703125, + "step": 843 + }, + { + "epoch": 0.0211, + "grad_norm": 0.8611040115356445, + "grad_norm_var": 0.02879468818566166, + "learning_rate": 0.0001, + "loss": 34545.125, + "loss/crossentropy": 2.540072202682495, + "loss/hidden": 0.416015625, + "loss/logits": 0.085359126329422, + "loss/reg": 34544.625, + "step": 844 + }, + { + "epoch": 0.021125, + "grad_norm": 0.8963682651519775, + "grad_norm_var": 0.026977908947997755, + "learning_rate": 0.0001, + "loss": 34422.8633, + "loss/crossentropy": 2.5841212272644043, + "loss/hidden": 0.3984375, + "loss/logits": 0.10156680643558502, + "loss/reg": 34422.36328125, + "step": 845 + }, + { + "epoch": 0.02115, + "grad_norm": 1.0223283767700195, + "grad_norm_var": 0.01879276828590027, + "learning_rate": 0.0001, + "loss": 34301.707, + "loss/crossentropy": 2.905853509902954, + "loss/hidden": 0.4140625, + "loss/logits": 0.09710683673620224, + "loss/reg": 34301.1953125, + "step": 846 + }, + { + "epoch": 0.021175, + "grad_norm": 1.0493470430374146, + "grad_norm_var": 0.01763456168458267, + "learning_rate": 0.0001, + "loss": 34179.6914, + "loss/crossentropy": 2.8628482818603516, + "loss/hidden": 0.3984375, + "loss/logits": 0.0946056917309761, + "loss/reg": 34179.19921875, + "step": 847 + }, + { + "epoch": 0.0212, + "grad_norm": 1.0956487655639648, + "grad_norm_var": 0.019635895807511957, + "learning_rate": 0.0001, + "loss": 34060.0234, + "loss/crossentropy": 2.604365110397339, + "loss/hidden": 0.4296875, + "loss/logits": 0.1112070232629776, + "loss/reg": 34059.484375, + "step": 848 + }, + { + "epoch": 0.021225, + "grad_norm": 1.0503216981887817, + "grad_norm_var": 0.020634830180398453, + "learning_rate": 0.0001, + "loss": 33938.5898, + "loss/crossentropy": 2.8042867183685303, + "loss/hidden": 0.427734375, + "loss/logits": 0.10720456391572952, + "loss/reg": 33938.0546875, + "step": 849 + }, + { + "epoch": 0.02125, + "grad_norm": 0.9675441384315491, + "grad_norm_var": 0.016319762626638538, + "learning_rate": 0.0001, + "loss": 33818.2266, + "loss/crossentropy": 2.4751031398773193, + "loss/hidden": 0.41796875, + "loss/logits": 0.10572968423366547, + "loss/reg": 33817.703125, + "step": 850 + }, + { + "epoch": 0.021275, + "grad_norm": 1.0344288349151611, + "grad_norm_var": 0.014167482585636293, + "learning_rate": 0.0001, + "loss": 33699.5742, + "loss/crossentropy": 2.600318670272827, + "loss/hidden": 0.39453125, + "loss/logits": 0.08344602584838867, + "loss/reg": 33699.09765625, + "step": 851 + }, + { + "epoch": 0.0213, + "grad_norm": 1.407858967781067, + "grad_norm_var": 0.029569324902826346, + "learning_rate": 0.0001, + "loss": 33581.5234, + "loss/crossentropy": 2.62861704826355, + "loss/hidden": 0.4375, + "loss/logits": 0.11908341199159622, + "loss/reg": 33580.96875, + "step": 852 + }, + { + "epoch": 0.021325, + "grad_norm": 0.8948066830635071, + "grad_norm_var": 0.02717962112157412, + "learning_rate": 0.0001, + "loss": 33462.6289, + "loss/crossentropy": 2.597301721572876, + "loss/hidden": 0.423828125, + "loss/logits": 0.10772041976451874, + "loss/reg": 33462.09765625, + "step": 853 + }, + { + "epoch": 0.02135, + "grad_norm": 0.8743274211883545, + "grad_norm_var": 0.02746764893301516, + "learning_rate": 0.0001, + "loss": 33344.4766, + "loss/crossentropy": 2.502575397491455, + "loss/hidden": 0.41796875, + "loss/logits": 0.09394825994968414, + "loss/reg": 33343.96484375, + "step": 854 + }, + { + "epoch": 0.021375, + "grad_norm": 0.7814751267433167, + "grad_norm_var": 0.027665246593712566, + "learning_rate": 0.0001, + "loss": 33217.1211, + "loss/crossentropy": 2.4343154430389404, + "loss/hidden": 0.392578125, + "loss/logits": 0.09242767840623856, + "loss/reg": 33216.63671875, + "step": 855 + }, + { + "epoch": 0.0214, + "grad_norm": 0.8254016637802124, + "grad_norm_var": 0.027168216466293357, + "learning_rate": 0.0001, + "loss": 33098.625, + "loss/crossentropy": 2.597825527191162, + "loss/hidden": 0.408203125, + "loss/logits": 0.09620468318462372, + "loss/reg": 33098.12109375, + "step": 856 + }, + { + "epoch": 0.021425, + "grad_norm": 1.0467920303344727, + "grad_norm_var": 0.026974543250588436, + "learning_rate": 0.0001, + "loss": 32981.0117, + "loss/crossentropy": 2.531338691711426, + "loss/hidden": 0.4140625, + "loss/logits": 0.09141959249973297, + "loss/reg": 32980.5078125, + "step": 857 + }, + { + "epoch": 0.02145, + "grad_norm": 1.297360897064209, + "grad_norm_var": 0.030156381415047187, + "learning_rate": 0.0001, + "loss": 32864.7031, + "loss/crossentropy": 2.7989187240600586, + "loss/hidden": 0.4296875, + "loss/logits": 0.10818947851657867, + "loss/reg": 32864.1640625, + "step": 858 + }, + { + "epoch": 0.021475, + "grad_norm": 0.8410242795944214, + "grad_norm_var": 0.02893572218853119, + "learning_rate": 0.0001, + "loss": 32750.0977, + "loss/crossentropy": 2.44685697555542, + "loss/hidden": 0.416015625, + "loss/logits": 0.09526005387306213, + "loss/reg": 32749.5859375, + "step": 859 + }, + { + "epoch": 0.0215, + "grad_norm": 0.8180389404296875, + "grad_norm_var": 0.029829847108710426, + "learning_rate": 0.0001, + "loss": 32628.207, + "loss/crossentropy": 2.5319130420684814, + "loss/hidden": 0.39453125, + "loss/logits": 0.08277040719985962, + "loss/reg": 32627.73046875, + "step": 860 + }, + { + "epoch": 0.021525, + "grad_norm": 0.878432035446167, + "grad_norm_var": 0.030083301371783343, + "learning_rate": 0.0001, + "loss": 32512.7188, + "loss/crossentropy": 2.694110870361328, + "loss/hidden": 0.412109375, + "loss/logits": 0.0938844084739685, + "loss/reg": 32512.212890625, + "step": 861 + }, + { + "epoch": 0.02155, + "grad_norm": 0.8658337593078613, + "grad_norm_var": 0.030998263460790657, + "learning_rate": 0.0001, + "loss": 32397.1641, + "loss/crossentropy": 2.352792739868164, + "loss/hidden": 0.423828125, + "loss/logits": 0.1003153845667839, + "loss/reg": 32396.640625, + "step": 862 + }, + { + "epoch": 0.021575, + "grad_norm": 1.118543267250061, + "grad_norm_var": 0.031909279085837135, + "learning_rate": 0.0001, + "loss": 32281.918, + "loss/crossentropy": 2.7677767276763916, + "loss/hidden": 0.439453125, + "loss/logits": 0.10245128720998764, + "loss/reg": 32281.376953125, + "step": 863 + }, + { + "epoch": 0.0216, + "grad_norm": 0.9408777356147766, + "grad_norm_var": 0.031171847988469815, + "learning_rate": 0.0001, + "loss": 32169.8145, + "loss/crossentropy": 2.556101083755493, + "loss/hidden": 0.3984375, + "loss/logits": 0.09347447752952576, + "loss/reg": 32169.322265625, + "step": 864 + }, + { + "epoch": 0.021625, + "grad_norm": 0.9442301988601685, + "grad_norm_var": 0.0308479210707351, + "learning_rate": 0.0001, + "loss": 32055.9531, + "loss/crossentropy": 2.5342166423797607, + "loss/hidden": 0.431640625, + "loss/logits": 0.10387556254863739, + "loss/reg": 32055.41796875, + "step": 865 + }, + { + "epoch": 0.02165, + "grad_norm": 0.8331806659698486, + "grad_norm_var": 0.03203927249987128, + "learning_rate": 0.0001, + "loss": 31928.8379, + "loss/crossentropy": 2.78352952003479, + "loss/hidden": 0.412109375, + "loss/logits": 0.08486774563789368, + "loss/reg": 31928.341796875, + "step": 866 + }, + { + "epoch": 0.021675, + "grad_norm": 0.9290958642959595, + "grad_norm_var": 0.03172480831427588, + "learning_rate": 0.0001, + "loss": 31813.7832, + "loss/crossentropy": 2.6741678714752197, + "loss/hidden": 0.41796875, + "loss/logits": 0.10052283853292465, + "loss/reg": 31813.265625, + "step": 867 + }, + { + "epoch": 0.0217, + "grad_norm": 0.6767722368240356, + "grad_norm_var": 0.021091679013798256, + "learning_rate": 0.0001, + "loss": 31700.1758, + "loss/crossentropy": 2.521838665008545, + "loss/hidden": 0.392578125, + "loss/logits": 0.09481383860111237, + "loss/reg": 31699.6875, + "step": 868 + }, + { + "epoch": 0.021725, + "grad_norm": 0.7836902737617493, + "grad_norm_var": 0.02209418880242387, + "learning_rate": 0.0001, + "loss": 31586.8145, + "loss/crossentropy": 2.4294605255126953, + "loss/hidden": 0.400390625, + "loss/logits": 0.10107098519802094, + "loss/reg": 31586.3125, + "step": 869 + }, + { + "epoch": 0.02175, + "grad_norm": 0.7529267072677612, + "grad_norm_var": 0.023486597337748435, + "learning_rate": 0.0001, + "loss": 31472.9453, + "loss/crossentropy": 2.4297895431518555, + "loss/hidden": 0.390625, + "loss/logits": 0.09175887703895569, + "loss/reg": 31472.462890625, + "step": 870 + }, + { + "epoch": 0.021775, + "grad_norm": 0.7665960788726807, + "grad_norm_var": 0.023727348579464073, + "learning_rate": 0.0001, + "loss": 31361.6562, + "loss/crossentropy": 2.502922296524048, + "loss/hidden": 0.404296875, + "loss/logits": 0.10530885308980942, + "loss/reg": 31361.146484375, + "step": 871 + }, + { + "epoch": 0.0218, + "grad_norm": 0.8659310936927795, + "grad_norm_var": 0.023454315559804078, + "learning_rate": 0.0001, + "loss": 31250.8555, + "loss/crossentropy": 2.400033473968506, + "loss/hidden": 0.419921875, + "loss/logits": 0.10294472426176071, + "loss/reg": 31250.33203125, + "step": 872 + }, + { + "epoch": 0.021825, + "grad_norm": 0.857296884059906, + "grad_norm_var": 0.021925511634459778, + "learning_rate": 0.0001, + "loss": 31138.9648, + "loss/crossentropy": 2.500699281692505, + "loss/hidden": 0.4140625, + "loss/logits": 0.09416045993566513, + "loss/reg": 31138.45703125, + "step": 873 + }, + { + "epoch": 0.02185, + "grad_norm": 1.3556040525436401, + "grad_norm_var": 0.025335049988287513, + "learning_rate": 0.0001, + "loss": 31028.1133, + "loss/crossentropy": 2.4126169681549072, + "loss/hidden": 0.447265625, + "loss/logits": 0.09992693364620209, + "loss/reg": 31027.56640625, + "step": 874 + }, + { + "epoch": 0.021875, + "grad_norm": 1.1106455326080322, + "grad_norm_var": 0.028144669255741366, + "learning_rate": 0.0001, + "loss": 30916.2793, + "loss/crossentropy": 2.682753801345825, + "loss/hidden": 0.40234375, + "loss/logits": 0.09390898793935776, + "loss/reg": 30915.783203125, + "step": 875 + }, + { + "epoch": 0.0219, + "grad_norm": 0.9540243148803711, + "grad_norm_var": 0.027703643787985694, + "learning_rate": 0.0001, + "loss": 30805.8301, + "loss/crossentropy": 2.50162410736084, + "loss/hidden": 0.42578125, + "loss/logits": 0.10138815641403198, + "loss/reg": 30805.302734375, + "step": 876 + }, + { + "epoch": 0.021925, + "grad_norm": 0.9050595760345459, + "grad_norm_var": 0.027619531474660106, + "learning_rate": 0.0001, + "loss": 30695.3926, + "loss/crossentropy": 2.7011783123016357, + "loss/hidden": 0.392578125, + "loss/logits": 0.08969256281852722, + "loss/reg": 30694.91015625, + "step": 877 + }, + { + "epoch": 0.02195, + "grad_norm": 1.171053171157837, + "grad_norm_var": 0.03138944267110251, + "learning_rate": 0.0001, + "loss": 30585.9648, + "loss/crossentropy": 2.6654486656188965, + "loss/hidden": 0.41796875, + "loss/logits": 0.110439732670784, + "loss/reg": 30585.435546875, + "step": 878 + }, + { + "epoch": 0.021975, + "grad_norm": 0.8103191256523132, + "grad_norm_var": 0.02979827776598006, + "learning_rate": 0.0001, + "loss": 30475.1992, + "loss/crossentropy": 2.520002603530884, + "loss/hidden": 0.392578125, + "loss/logits": 0.08766569197177887, + "loss/reg": 30474.71875, + "step": 879 + }, + { + "epoch": 0.022, + "grad_norm": 0.8522130846977234, + "grad_norm_var": 0.029996476232309735, + "learning_rate": 0.0001, + "loss": 30366.8828, + "loss/crossentropy": 2.460566997528076, + "loss/hidden": 0.404296875, + "loss/logits": 0.096153125166893, + "loss/reg": 30366.3828125, + "step": 880 + }, + { + "epoch": 0.022025, + "grad_norm": 0.9695855379104614, + "grad_norm_var": 0.030150554154386323, + "learning_rate": 0.0001, + "loss": 30258.8926, + "loss/crossentropy": 2.2949836254119873, + "loss/hidden": 0.4140625, + "loss/logits": 0.08789625763893127, + "loss/reg": 30258.390625, + "step": 881 + }, + { + "epoch": 0.02205, + "grad_norm": 0.9067624807357788, + "grad_norm_var": 0.029714434778758524, + "learning_rate": 0.0001, + "loss": 30152.0566, + "loss/crossentropy": 2.5709807872772217, + "loss/hidden": 0.404296875, + "loss/logits": 0.08980227261781693, + "loss/reg": 30151.5625, + "step": 882 + }, + { + "epoch": 0.022075, + "grad_norm": 3.0597167015075684, + "grad_norm_var": 0.31695078213330063, + "learning_rate": 0.0001, + "loss": 30043.8809, + "loss/crossentropy": 2.7180075645446777, + "loss/hidden": 0.46875, + "loss/logits": 0.1211073249578476, + "loss/reg": 30043.291015625, + "step": 883 + }, + { + "epoch": 0.0221, + "grad_norm": 0.949224591255188, + "grad_norm_var": 0.308036031196324, + "learning_rate": 0.0001, + "loss": 29936.375, + "loss/crossentropy": 2.387786388397217, + "loss/hidden": 0.4296875, + "loss/logits": 0.09997786581516266, + "loss/reg": 29935.845703125, + "step": 884 + }, + { + "epoch": 0.022125, + "grad_norm": 1.128546118736267, + "grad_norm_var": 0.3024459582136742, + "learning_rate": 0.0001, + "loss": 29830.6816, + "loss/crossentropy": 2.5146074295043945, + "loss/hidden": 0.44140625, + "loss/logits": 0.12338151037693024, + "loss/reg": 29830.1171875, + "step": 885 + }, + { + "epoch": 0.02215, + "grad_norm": 0.7824976444244385, + "grad_norm_var": 0.3011776371345623, + "learning_rate": 0.0001, + "loss": 29722.6387, + "loss/crossentropy": 2.3945186138153076, + "loss/hidden": 0.3984375, + "loss/logits": 0.08712532371282578, + "loss/reg": 29722.15234375, + "step": 886 + }, + { + "epoch": 0.022175, + "grad_norm": 1.2043960094451904, + "grad_norm_var": 0.2942602550435301, + "learning_rate": 0.0001, + "loss": 29617.1387, + "loss/crossentropy": 2.932788372039795, + "loss/hidden": 0.4296875, + "loss/logits": 0.09943626821041107, + "loss/reg": 29616.609375, + "step": 887 + }, + { + "epoch": 0.0222, + "grad_norm": 1.1389050483703613, + "grad_norm_var": 0.28975465213318613, + "learning_rate": 0.0001, + "loss": 29510.8145, + "loss/crossentropy": 2.4214000701904297, + "loss/hidden": 0.419921875, + "loss/logits": 0.09680774062871933, + "loss/reg": 29510.296875, + "step": 888 + }, + { + "epoch": 0.022225, + "grad_norm": 0.7648321986198425, + "grad_norm_var": 0.2937095091789043, + "learning_rate": 0.0001, + "loss": 29406.875, + "loss/crossentropy": 2.5498549938201904, + "loss/hidden": 0.40234375, + "loss/logits": 0.09309667348861694, + "loss/reg": 29406.37890625, + "step": 889 + }, + { + "epoch": 0.02225, + "grad_norm": 1.1869903802871704, + "grad_norm_var": 0.2903910837602872, + "learning_rate": 0.0001, + "loss": 29301.8965, + "loss/crossentropy": 2.93878436088562, + "loss/hidden": 0.443359375, + "loss/logits": 0.10722556710243225, + "loss/reg": 29301.345703125, + "step": 890 + }, + { + "epoch": 0.022275, + "grad_norm": 0.8767397999763489, + "grad_norm_var": 0.29405314307700603, + "learning_rate": 0.0001, + "loss": 29197.5547, + "loss/crossentropy": 2.410872459411621, + "loss/hidden": 0.431640625, + "loss/logits": 0.10561473667621613, + "loss/reg": 29197.017578125, + "step": 891 + }, + { + "epoch": 0.0223, + "grad_norm": 0.821043848991394, + "grad_norm_var": 0.29781408579728175, + "learning_rate": 0.0001, + "loss": 29093.6172, + "loss/crossentropy": 2.512385845184326, + "loss/hidden": 0.4140625, + "loss/logits": 0.09107698500156403, + "loss/reg": 29093.111328125, + "step": 892 + }, + { + "epoch": 0.022325, + "grad_norm": 0.9101580381393433, + "grad_norm_var": 0.2976862548765704, + "learning_rate": 0.0001, + "loss": 28989.8184, + "loss/crossentropy": 2.5767509937286377, + "loss/hidden": 0.396484375, + "loss/logits": 0.10410018265247345, + "loss/reg": 28989.318359375, + "step": 893 + }, + { + "epoch": 0.02235, + "grad_norm": 0.9581095576286316, + "grad_norm_var": 0.2983840181156954, + "learning_rate": 0.0001, + "loss": 28887.0391, + "loss/crossentropy": 2.3874075412750244, + "loss/hidden": 0.423828125, + "loss/logits": 0.09900985658168793, + "loss/reg": 28886.515625, + "step": 894 + }, + { + "epoch": 0.022375, + "grad_norm": 0.9015693664550781, + "grad_norm_var": 0.29559285776519056, + "learning_rate": 0.0001, + "loss": 28784.6133, + "loss/crossentropy": 2.7540664672851562, + "loss/hidden": 0.427734375, + "loss/logits": 0.11583462357521057, + "loss/reg": 28784.0703125, + "step": 895 + }, + { + "epoch": 0.0224, + "grad_norm": 0.956749439239502, + "grad_norm_var": 0.2929865413509854, + "learning_rate": 0.0001, + "loss": 28682.1367, + "loss/crossentropy": 2.338555335998535, + "loss/hidden": 0.404296875, + "loss/logits": 0.11809753626585007, + "loss/reg": 28681.615234375, + "step": 896 + }, + { + "epoch": 0.022425, + "grad_norm": 1.0482844114303589, + "grad_norm_var": 0.29206037588530465, + "learning_rate": 0.0001, + "loss": 28580.9648, + "loss/crossentropy": 2.7794132232666016, + "loss/hidden": 0.419921875, + "loss/logits": 0.11443160474300385, + "loss/reg": 28580.4296875, + "step": 897 + }, + { + "epoch": 0.02245, + "grad_norm": 0.7482628226280212, + "grad_norm_var": 0.29770702312698105, + "learning_rate": 0.0001, + "loss": 28479.0078, + "loss/crossentropy": 2.2662155628204346, + "loss/hidden": 0.40234375, + "loss/logits": 0.08552417904138565, + "loss/reg": 28478.51953125, + "step": 898 + }, + { + "epoch": 0.022475, + "grad_norm": 0.782559335231781, + "grad_norm_var": 0.023674645747053195, + "learning_rate": 0.0001, + "loss": 28379.8262, + "loss/crossentropy": 2.4198241233825684, + "loss/hidden": 0.3984375, + "loss/logits": 0.09736079722642899, + "loss/reg": 28379.330078125, + "step": 899 + }, + { + "epoch": 0.0225, + "grad_norm": 0.8299167156219482, + "grad_norm_var": 0.024535734648487056, + "learning_rate": 0.0001, + "loss": 28279.8398, + "loss/crossentropy": 2.6819629669189453, + "loss/hidden": 0.408203125, + "loss/logits": 0.08226903527975082, + "loss/reg": 28279.349609375, + "step": 900 + }, + { + "epoch": 0.022525, + "grad_norm": 1.000473976135254, + "grad_norm_var": 0.022340753054793307, + "learning_rate": 0.0001, + "loss": 28180.084, + "loss/crossentropy": 2.6211347579956055, + "loss/hidden": 0.41796875, + "loss/logits": 0.12049969285726547, + "loss/reg": 28179.544921875, + "step": 901 + }, + { + "epoch": 0.02255, + "grad_norm": 1.1626721620559692, + "grad_norm_var": 0.023797399957489286, + "learning_rate": 0.0001, + "loss": 28080.8613, + "loss/crossentropy": 2.5171658992767334, + "loss/hidden": 0.43359375, + "loss/logits": 0.11568973958492279, + "loss/reg": 28080.3125, + "step": 902 + }, + { + "epoch": 0.022575, + "grad_norm": 0.8448003530502319, + "grad_norm_var": 0.01995660184935984, + "learning_rate": 0.0001, + "loss": 27981.7832, + "loss/crossentropy": 2.484351396560669, + "loss/hidden": 0.392578125, + "loss/logits": 0.08854812383651733, + "loss/reg": 27981.302734375, + "step": 903 + }, + { + "epoch": 0.0226, + "grad_norm": 0.9268690347671509, + "grad_norm_var": 0.016952505261794554, + "learning_rate": 0.0001, + "loss": 27883.5234, + "loss/crossentropy": 2.4348769187927246, + "loss/hidden": 0.421875, + "loss/logits": 0.1028188019990921, + "loss/reg": 27882.998046875, + "step": 904 + }, + { + "epoch": 0.022625, + "grad_norm": 1.214880347251892, + "grad_norm_var": 0.020300281621865146, + "learning_rate": 0.0001, + "loss": 27786.293, + "loss/crossentropy": 2.2924892902374268, + "loss/hidden": 0.421875, + "loss/logits": 0.09746156632900238, + "loss/reg": 27785.7734375, + "step": 905 + }, + { + "epoch": 0.02265, + "grad_norm": 1.4136064052581787, + "grad_norm_var": 0.03072723747541867, + "learning_rate": 0.0001, + "loss": 27689.4258, + "loss/crossentropy": 2.643136978149414, + "loss/hidden": 0.42578125, + "loss/logits": 0.09651921689510345, + "loss/reg": 27688.904296875, + "step": 906 + }, + { + "epoch": 0.022675, + "grad_norm": 0.8881863355636597, + "grad_norm_var": 0.03060485399788484, + "learning_rate": 0.0001, + "loss": 27592.2148, + "loss/crossentropy": 2.507758140563965, + "loss/hidden": 0.419921875, + "loss/logits": 0.09700015187263489, + "loss/reg": 27591.697265625, + "step": 907 + }, + { + "epoch": 0.0227, + "grad_norm": 0.7471372485160828, + "grad_norm_var": 0.03234519435069997, + "learning_rate": 0.0001, + "loss": 27496.6816, + "loss/crossentropy": 2.585388422012329, + "loss/hidden": 0.384765625, + "loss/logits": 0.08749718219041824, + "loss/reg": 27496.208984375, + "step": 908 + }, + { + "epoch": 0.022725, + "grad_norm": 0.8215914368629456, + "grad_norm_var": 0.033405009202436026, + "learning_rate": 0.0001, + "loss": 27400.1914, + "loss/crossentropy": 2.6576366424560547, + "loss/hidden": 0.42578125, + "loss/logits": 0.09847737848758698, + "loss/reg": 27399.66796875, + "step": 909 + }, + { + "epoch": 0.02275, + "grad_norm": 0.7689388990402222, + "grad_norm_var": 0.03550905349180364, + "learning_rate": 0.0001, + "loss": 27299.5469, + "loss/crossentropy": 2.4018616676330566, + "loss/hidden": 0.400390625, + "loss/logits": 0.09672288596630096, + "loss/reg": 27299.048828125, + "step": 910 + }, + { + "epoch": 0.022775, + "grad_norm": 1.1210814714431763, + "grad_norm_var": 0.037365672903085984, + "learning_rate": 0.0001, + "loss": 27203.0352, + "loss/crossentropy": 2.9312374591827393, + "loss/hidden": 0.427734375, + "loss/logits": 0.09397996217012405, + "loss/reg": 27202.513671875, + "step": 911 + }, + { + "epoch": 0.0228, + "grad_norm": 1.0522981882095337, + "grad_norm_var": 0.03796173486365054, + "learning_rate": 0.0001, + "loss": 27104.9355, + "loss/crossentropy": 2.7311244010925293, + "loss/hidden": 0.41796875, + "loss/logits": 0.09144529700279236, + "loss/reg": 27104.42578125, + "step": 912 + }, + { + "epoch": 0.022825, + "grad_norm": 2.082756757736206, + "grad_norm_var": 0.11692244041732346, + "learning_rate": 0.0001, + "loss": 27008.9414, + "loss/crossentropy": 2.760934352874756, + "loss/hidden": 0.455078125, + "loss/logits": 0.109530970454216, + "loss/reg": 27008.376953125, + "step": 913 + }, + { + "epoch": 0.02285, + "grad_norm": 1.009446382522583, + "grad_norm_var": 0.1115356391304008, + "learning_rate": 0.0001, + "loss": 26912.2383, + "loss/crossentropy": 2.480273962020874, + "loss/hidden": 0.427734375, + "loss/logits": 0.08872679620981216, + "loss/reg": 26911.72265625, + "step": 914 + }, + { + "epoch": 0.022875, + "grad_norm": 1.0722192525863647, + "grad_norm_var": 0.10677117689794256, + "learning_rate": 0.0001, + "loss": 26816.4785, + "loss/crossentropy": 2.6407546997070312, + "loss/hidden": 0.4609375, + "loss/logits": 0.11386647820472717, + "loss/reg": 26815.904296875, + "step": 915 + }, + { + "epoch": 0.0229, + "grad_norm": 0.8702617883682251, + "grad_norm_var": 0.10563626345524095, + "learning_rate": 0.0001, + "loss": 26718.4863, + "loss/crossentropy": 2.5538382530212402, + "loss/hidden": 0.4140625, + "loss/logits": 0.08729642629623413, + "loss/reg": 26717.984375, + "step": 916 + }, + { + "epoch": 0.022925, + "grad_norm": 1.1217052936553955, + "grad_norm_var": 0.10555503691690002, + "learning_rate": 0.0001, + "loss": 26623.6973, + "loss/crossentropy": 2.3956480026245117, + "loss/hidden": 0.462890625, + "loss/logits": 0.10590574890375137, + "loss/reg": 26623.12890625, + "step": 917 + }, + { + "epoch": 0.02295, + "grad_norm": 0.9607998728752136, + "grad_norm_var": 0.10560506580399846, + "learning_rate": 0.0001, + "loss": 26527.5898, + "loss/crossentropy": 2.681879758834839, + "loss/hidden": 0.39453125, + "loss/logits": 0.09322258085012436, + "loss/reg": 26527.1015625, + "step": 918 + }, + { + "epoch": 0.022975, + "grad_norm": 1.241347074508667, + "grad_norm_var": 0.10419840550631591, + "learning_rate": 0.0001, + "loss": 26432.4277, + "loss/crossentropy": 2.335767984390259, + "loss/hidden": 0.4296875, + "loss/logits": 0.09113731980323792, + "loss/reg": 26431.90625, + "step": 919 + }, + { + "epoch": 0.023, + "grad_norm": 0.9808320999145508, + "grad_norm_var": 0.1032637210382731, + "learning_rate": 0.0001, + "loss": 26331.9551, + "loss/crossentropy": 2.3959317207336426, + "loss/hidden": 0.427734375, + "loss/logits": 0.09294003993272781, + "loss/reg": 26331.43359375, + "step": 920 + }, + { + "epoch": 0.023025, + "grad_norm": 1.0856369733810425, + "grad_norm_var": 0.10207718996882348, + "learning_rate": 0.0001, + "loss": 26237.1758, + "loss/crossentropy": 2.653390407562256, + "loss/hidden": 0.431640625, + "loss/logits": 0.10660263895988464, + "loss/reg": 26236.63671875, + "step": 921 + }, + { + "epoch": 0.02305, + "grad_norm": 1.050618052482605, + "grad_norm_var": 0.09403867871835095, + "learning_rate": 0.0001, + "loss": 26142.5957, + "loss/crossentropy": 2.7401986122131348, + "loss/hidden": 0.423828125, + "loss/logits": 0.1003125011920929, + "loss/reg": 26142.072265625, + "step": 922 + }, + { + "epoch": 0.023075, + "grad_norm": 0.8106462359428406, + "grad_norm_var": 0.09613576783978507, + "learning_rate": 0.0001, + "loss": 26050.1152, + "loss/crossentropy": 2.6547510623931885, + "loss/hidden": 0.421875, + "loss/logits": 0.09801352024078369, + "loss/reg": 26049.595703125, + "step": 923 + }, + { + "epoch": 0.0231, + "grad_norm": 0.7937747240066528, + "grad_norm_var": 0.0943894508943714, + "learning_rate": 0.0001, + "loss": 25953.9883, + "loss/crossentropy": 2.4390735626220703, + "loss/hidden": 0.423828125, + "loss/logits": 0.0942004844546318, + "loss/reg": 25953.470703125, + "step": 924 + }, + { + "epoch": 0.023125, + "grad_norm": 0.851125955581665, + "grad_norm_var": 0.09353369249125756, + "learning_rate": 0.0001, + "loss": 25861.0723, + "loss/crossentropy": 2.7774431705474854, + "loss/hidden": 0.4296875, + "loss/logits": 0.1062496230006218, + "loss/reg": 25860.537109375, + "step": 925 + }, + { + "epoch": 0.02315, + "grad_norm": 0.9323251843452454, + "grad_norm_var": 0.08897920516345294, + "learning_rate": 0.0001, + "loss": 25768.9844, + "loss/crossentropy": 2.5458085536956787, + "loss/hidden": 0.43359375, + "loss/logits": 0.10097949951887131, + "loss/reg": 25768.44921875, + "step": 926 + }, + { + "epoch": 0.023175, + "grad_norm": 1.0372633934020996, + "grad_norm_var": 0.08878936235039156, + "learning_rate": 0.0001, + "loss": 25677.8711, + "loss/crossentropy": 2.54379940032959, + "loss/hidden": 0.431640625, + "loss/logits": 0.09122653305530548, + "loss/reg": 25677.34765625, + "step": 927 + }, + { + "epoch": 0.0232, + "grad_norm": 1.5073013305664062, + "grad_norm_var": 0.10128768195023512, + "learning_rate": 0.0001, + "loss": 25585.4941, + "loss/crossentropy": 2.9681735038757324, + "loss/hidden": 0.451171875, + "loss/logits": 0.09525079280138016, + "loss/reg": 25584.947265625, + "step": 928 + }, + { + "epoch": 0.023225, + "grad_norm": 1.118074655532837, + "grad_norm_var": 0.031501517644075355, + "learning_rate": 0.0001, + "loss": 25485.7266, + "loss/crossentropy": 2.705671548843384, + "loss/hidden": 0.4296875, + "loss/logits": 0.09706344455480576, + "loss/reg": 25485.19921875, + "step": 929 + }, + { + "epoch": 0.02325, + "grad_norm": 0.792195200920105, + "grad_norm_var": 0.03498046946117814, + "learning_rate": 0.0001, + "loss": 25394.082, + "loss/crossentropy": 2.669508218765259, + "loss/hidden": 0.42578125, + "loss/logits": 0.09703344851732254, + "loss/reg": 25393.55859375, + "step": 930 + }, + { + "epoch": 0.023275, + "grad_norm": 0.9936244487762451, + "grad_norm_var": 0.03475783662954665, + "learning_rate": 0.0001, + "loss": 25303.7051, + "loss/crossentropy": 2.5820255279541016, + "loss/hidden": 0.439453125, + "loss/logits": 0.09157466143369675, + "loss/reg": 25303.173828125, + "step": 931 + }, + { + "epoch": 0.0233, + "grad_norm": 0.9209782481193542, + "grad_norm_var": 0.03397892892039251, + "learning_rate": 0.0001, + "loss": 25212.748, + "loss/crossentropy": 2.3748815059661865, + "loss/hidden": 0.4140625, + "loss/logits": 0.09344466030597687, + "loss/reg": 25212.240234375, + "step": 932 + }, + { + "epoch": 0.023325, + "grad_norm": 1.307304859161377, + "grad_norm_var": 0.038837048116851194, + "learning_rate": 0.0001, + "loss": 25124.3164, + "loss/crossentropy": 2.3792972564697266, + "loss/hidden": 0.466796875, + "loss/logits": 0.12809084355831146, + "loss/reg": 25123.720703125, + "step": 933 + }, + { + "epoch": 0.02335, + "grad_norm": 0.8441806435585022, + "grad_norm_var": 0.04066961683942717, + "learning_rate": 0.0001, + "loss": 25034.1055, + "loss/crossentropy": 2.708909273147583, + "loss/hidden": 0.423828125, + "loss/logits": 0.0946684405207634, + "loss/reg": 25033.587890625, + "step": 934 + }, + { + "epoch": 0.023375, + "grad_norm": 1.0004233121871948, + "grad_norm_var": 0.03708106564219798, + "learning_rate": 0.0001, + "loss": 24944.4883, + "loss/crossentropy": 2.5024023056030273, + "loss/hidden": 0.435546875, + "loss/logits": 0.09347517043352127, + "loss/reg": 24943.958984375, + "step": 935 + }, + { + "epoch": 0.0234, + "grad_norm": 0.9585517644882202, + "grad_norm_var": 0.037173917869692256, + "learning_rate": 0.0001, + "loss": 24855.9492, + "loss/crossentropy": 2.396317958831787, + "loss/hidden": 0.4296875, + "loss/logits": 0.09736696630716324, + "loss/reg": 24855.421875, + "step": 936 + }, + { + "epoch": 0.023425, + "grad_norm": 0.9499222040176392, + "grad_norm_var": 0.03677999916784411, + "learning_rate": 0.0001, + "loss": 24767.418, + "loss/crossentropy": 2.8797192573547363, + "loss/hidden": 0.439453125, + "loss/logits": 0.10986952483654022, + "loss/reg": 24766.869140625, + "step": 937 + }, + { + "epoch": 0.02345, + "grad_norm": 0.7930538654327393, + "grad_norm_var": 0.03890523162507543, + "learning_rate": 0.0001, + "loss": 24678.4062, + "loss/crossentropy": 2.5853559970855713, + "loss/hidden": 0.421875, + "loss/logits": 0.09216812252998352, + "loss/reg": 24677.892578125, + "step": 938 + }, + { + "epoch": 0.023475, + "grad_norm": 1.0527880191802979, + "grad_norm_var": 0.03724183284043681, + "learning_rate": 0.0001, + "loss": 24591.1797, + "loss/crossentropy": 2.5778865814208984, + "loss/hidden": 0.4375, + "loss/logits": 0.10692226886749268, + "loss/reg": 24590.634765625, + "step": 939 + }, + { + "epoch": 0.0235, + "grad_norm": 1.0680421590805054, + "grad_norm_var": 0.03473803065608056, + "learning_rate": 0.0001, + "loss": 24503.9961, + "loss/crossentropy": 2.7273216247558594, + "loss/hidden": 0.435546875, + "loss/logits": 0.10270722210407257, + "loss/reg": 24503.45703125, + "step": 940 + }, + { + "epoch": 0.023525, + "grad_norm": 0.9111777544021606, + "grad_norm_var": 0.033707766249989495, + "learning_rate": 0.0001, + "loss": 24417.6191, + "loss/crossentropy": 2.6318414211273193, + "loss/hidden": 0.419921875, + "loss/logits": 0.09268413484096527, + "loss/reg": 24417.107421875, + "step": 941 + }, + { + "epoch": 0.02355, + "grad_norm": 1.0316659212112427, + "grad_norm_var": 0.03327319310434256, + "learning_rate": 0.0001, + "loss": 24331.5488, + "loss/crossentropy": 2.6445016860961914, + "loss/hidden": 0.40234375, + "loss/logits": 0.0935656726360321, + "loss/reg": 24331.052734375, + "step": 942 + }, + { + "epoch": 0.023575, + "grad_norm": 1.0706905126571655, + "grad_norm_var": 0.03342928936908199, + "learning_rate": 0.0001, + "loss": 24245.291, + "loss/crossentropy": 2.7278873920440674, + "loss/hidden": 0.423828125, + "loss/logits": 0.10185898840427399, + "loss/reg": 24244.765625, + "step": 943 + }, + { + "epoch": 0.0236, + "grad_norm": 0.9393065571784973, + "grad_norm_var": 0.01668818496218057, + "learning_rate": 0.0001, + "loss": 24158.9805, + "loss/crossentropy": 2.5154876708984375, + "loss/hidden": 0.431640625, + "loss/logits": 0.10179644078016281, + "loss/reg": 24158.447265625, + "step": 944 + }, + { + "epoch": 0.023625, + "grad_norm": 0.9378867149353027, + "grad_norm_var": 0.015508247044760977, + "learning_rate": 0.0001, + "loss": 24072.2188, + "loss/crossentropy": 2.459463119506836, + "loss/hidden": 0.427734375, + "loss/logits": 0.10269282758235931, + "loss/reg": 24071.6875, + "step": 945 + }, + { + "epoch": 0.02365, + "grad_norm": 0.9219256043434143, + "grad_norm_var": 0.013428570358553079, + "learning_rate": 0.0001, + "loss": 23987.8887, + "loss/crossentropy": 2.444058656692505, + "loss/hidden": 0.43359375, + "loss/logits": 0.10357135534286499, + "loss/reg": 23987.3515625, + "step": 946 + }, + { + "epoch": 0.023675, + "grad_norm": 0.8147755861282349, + "grad_norm_var": 0.015134934386543305, + "learning_rate": 0.0001, + "loss": 23903.2949, + "loss/crossentropy": 2.5425283908843994, + "loss/hidden": 0.421875, + "loss/logits": 0.09350921213626862, + "loss/reg": 23902.779296875, + "step": 947 + }, + { + "epoch": 0.0237, + "grad_norm": 1.0720521211624146, + "grad_norm_var": 0.015570571374023704, + "learning_rate": 0.0001, + "loss": 23819.9102, + "loss/crossentropy": 2.504666566848755, + "loss/hidden": 0.4296875, + "loss/logits": 0.1169600635766983, + "loss/reg": 23819.36328125, + "step": 948 + }, + { + "epoch": 0.023725, + "grad_norm": 1.1205313205718994, + "grad_norm_var": 0.009590193786136429, + "learning_rate": 0.0001, + "loss": 23735.5586, + "loss/crossentropy": 2.7423009872436523, + "loss/hidden": 0.4296875, + "loss/logits": 0.09194411337375641, + "loss/reg": 23735.037109375, + "step": 949 + }, + { + "epoch": 0.02375, + "grad_norm": 1.5967024564743042, + "grad_norm_var": 0.03256612602235875, + "learning_rate": 0.0001, + "loss": 23652.8613, + "loss/crossentropy": 2.4818520545959473, + "loss/hidden": 0.439453125, + "loss/logits": 0.10307301580905914, + "loss/reg": 23652.318359375, + "step": 950 + }, + { + "epoch": 0.023775, + "grad_norm": 1.3237601518630981, + "grad_norm_var": 0.03847322984858147, + "learning_rate": 0.0001, + "loss": 23570.2109, + "loss/crossentropy": 3.0349578857421875, + "loss/hidden": 0.470703125, + "loss/logits": 0.11495113372802734, + "loss/reg": 23569.625, + "step": 951 + }, + { + "epoch": 0.0238, + "grad_norm": 0.8153846859931946, + "grad_norm_var": 0.041216976286479565, + "learning_rate": 0.0001, + "loss": 23485.4648, + "loss/crossentropy": 2.4277260303497314, + "loss/hidden": 0.4296875, + "loss/logits": 0.10003092885017395, + "loss/reg": 23484.935546875, + "step": 952 + }, + { + "epoch": 0.023825, + "grad_norm": 1.0136277675628662, + "grad_norm_var": 0.04082246968663174, + "learning_rate": 0.0001, + "loss": 23403.4863, + "loss/crossentropy": 2.6772782802581787, + "loss/hidden": 0.44140625, + "loss/logits": 0.1092321127653122, + "loss/reg": 23402.935546875, + "step": 953 + }, + { + "epoch": 0.02385, + "grad_norm": 0.8215113878250122, + "grad_norm_var": 0.039973231307667695, + "learning_rate": 0.0001, + "loss": 23320.4492, + "loss/crossentropy": 2.495701789855957, + "loss/hidden": 0.423828125, + "loss/logits": 0.09646574407815933, + "loss/reg": 23319.9296875, + "step": 954 + }, + { + "epoch": 0.023875, + "grad_norm": 1.060551404953003, + "grad_norm_var": 0.03999852732846573, + "learning_rate": 0.0001, + "loss": 23237.9766, + "loss/crossentropy": 2.664013624191284, + "loss/hidden": 0.44921875, + "loss/logits": 0.11422315239906311, + "loss/reg": 23237.4140625, + "step": 955 + }, + { + "epoch": 0.0239, + "grad_norm": 0.9129847288131714, + "grad_norm_var": 0.04076586557442507, + "learning_rate": 0.0001, + "loss": 23155.3926, + "loss/crossentropy": 2.5501043796539307, + "loss/hidden": 0.435546875, + "loss/logits": 0.10178236663341522, + "loss/reg": 23154.85546875, + "step": 956 + }, + { + "epoch": 0.023925, + "grad_norm": 1.0732991695404053, + "grad_norm_var": 0.03999608608578099, + "learning_rate": 0.0001, + "loss": 23073.875, + "loss/crossentropy": 2.3961002826690674, + "loss/hidden": 0.4453125, + "loss/logits": 0.10903213918209076, + "loss/reg": 23073.3203125, + "step": 957 + }, + { + "epoch": 0.02395, + "grad_norm": 1.1402229070663452, + "grad_norm_var": 0.04071453073050209, + "learning_rate": 0.0001, + "loss": 22992.459, + "loss/crossentropy": 2.8703441619873047, + "loss/hidden": 0.4453125, + "loss/logits": 0.12756890058517456, + "loss/reg": 22991.88671875, + "step": 958 + }, + { + "epoch": 0.023975, + "grad_norm": 1.2905657291412354, + "grad_norm_var": 0.04464461561989617, + "learning_rate": 0.0001, + "loss": 22911.1367, + "loss/crossentropy": 2.1613481044769287, + "loss/hidden": 0.4296875, + "loss/logits": 0.10078255087137222, + "loss/reg": 22910.60546875, + "step": 959 + }, + { + "epoch": 0.024, + "grad_norm": 0.9392194151878357, + "grad_norm_var": 0.04464594223829518, + "learning_rate": 0.0001, + "loss": 22831.9512, + "loss/crossentropy": 2.4932808876037598, + "loss/hidden": 0.423828125, + "loss/logits": 0.08269266784191132, + "loss/reg": 22831.4453125, + "step": 960 + }, + { + "epoch": 0.024025, + "grad_norm": 1.8595917224884033, + "grad_norm_var": 0.08354169265646237, + "learning_rate": 0.0001, + "loss": 22752.7676, + "loss/crossentropy": 2.7113559246063232, + "loss/hidden": 0.490234375, + "loss/logits": 0.11296282708644867, + "loss/reg": 22752.1640625, + "step": 961 + }, + { + "epoch": 0.02405, + "grad_norm": 1.1560962200164795, + "grad_norm_var": 0.08106413461811185, + "learning_rate": 0.0001, + "loss": 22672.8867, + "loss/crossentropy": 2.613546848297119, + "loss/hidden": 0.4296875, + "loss/logits": 0.09537872672080994, + "loss/reg": 22672.361328125, + "step": 962 + }, + { + "epoch": 0.024075, + "grad_norm": 0.9497840404510498, + "grad_norm_var": 0.07660671350333838, + "learning_rate": 0.0001, + "loss": 22593.0859, + "loss/crossentropy": 2.448693037033081, + "loss/hidden": 0.4296875, + "loss/logits": 0.08571556955575943, + "loss/reg": 22592.5703125, + "step": 963 + }, + { + "epoch": 0.0241, + "grad_norm": 1.1213970184326172, + "grad_norm_var": 0.0763505458177983, + "learning_rate": 0.0001, + "loss": 22512.8164, + "loss/crossentropy": 2.826176166534424, + "loss/hidden": 0.46875, + "loss/logits": 0.10593421757221222, + "loss/reg": 22512.2421875, + "step": 964 + }, + { + "epoch": 0.024125, + "grad_norm": 1.1012262105941772, + "grad_norm_var": 0.07641674904413306, + "learning_rate": 0.0001, + "loss": 22433.457, + "loss/crossentropy": 2.9246420860290527, + "loss/hidden": 0.435546875, + "loss/logits": 0.09543861448764801, + "loss/reg": 22432.92578125, + "step": 965 + }, + { + "epoch": 0.02415, + "grad_norm": 1.0128264427185059, + "grad_norm_var": 0.06185758565790037, + "learning_rate": 0.0001, + "loss": 22355.3984, + "loss/crossentropy": 2.4664463996887207, + "loss/hidden": 0.44140625, + "loss/logits": 0.10364780575037003, + "loss/reg": 22354.853515625, + "step": 966 + }, + { + "epoch": 0.024175, + "grad_norm": 1.354954481124878, + "grad_norm_var": 0.06285114347466039, + "learning_rate": 0.0001, + "loss": 22278.1406, + "loss/crossentropy": 2.9408609867095947, + "loss/hidden": 0.453125, + "loss/logits": 0.11183148622512817, + "loss/reg": 22277.576171875, + "step": 967 + }, + { + "epoch": 0.0242, + "grad_norm": 1.3423038721084595, + "grad_norm_var": 0.06010591884042792, + "learning_rate": 0.0001, + "loss": 22200.9824, + "loss/crossentropy": 2.842505931854248, + "loss/hidden": 0.462890625, + "loss/logits": 0.09240270406007767, + "loss/reg": 22200.427734375, + "step": 968 + }, + { + "epoch": 0.024225, + "grad_norm": 0.9177575707435608, + "grad_norm_var": 0.062223966885823205, + "learning_rate": 0.0001, + "loss": 22115.6211, + "loss/crossentropy": 2.5726301670074463, + "loss/hidden": 0.421875, + "loss/logits": 0.08873818814754486, + "loss/reg": 22115.111328125, + "step": 969 + }, + { + "epoch": 0.02425, + "grad_norm": 0.8709549903869629, + "grad_norm_var": 0.06035364597454086, + "learning_rate": 0.0001, + "loss": 22038.9238, + "loss/crossentropy": 2.6931722164154053, + "loss/hidden": 0.43359375, + "loss/logits": 0.1035221666097641, + "loss/reg": 22038.38671875, + "step": 970 + }, + { + "epoch": 0.024275, + "grad_norm": 1.5222187042236328, + "grad_norm_var": 0.0693084192185554, + "learning_rate": 0.0001, + "loss": 21961.6309, + "loss/crossentropy": 2.6958839893341064, + "loss/hidden": 0.466796875, + "loss/logits": 0.11546257138252258, + "loss/reg": 21961.048828125, + "step": 971 + }, + { + "epoch": 0.0243, + "grad_norm": 1.1752046346664429, + "grad_norm_var": 0.06495775773170204, + "learning_rate": 0.0001, + "loss": 21884.3262, + "loss/crossentropy": 2.49680233001709, + "loss/hidden": 0.46484375, + "loss/logits": 0.11024495959281921, + "loss/reg": 21883.751953125, + "step": 972 + }, + { + "epoch": 0.024325, + "grad_norm": 0.93658846616745, + "grad_norm_var": 0.06801115334468061, + "learning_rate": 0.0001, + "loss": 21807.9648, + "loss/crossentropy": 2.5653347969055176, + "loss/hidden": 0.427734375, + "loss/logits": 0.08618831634521484, + "loss/reg": 21807.451171875, + "step": 973 + }, + { + "epoch": 0.02435, + "grad_norm": 1.0110540390014648, + "grad_norm_var": 0.06953546700071542, + "learning_rate": 0.0001, + "loss": 21731.2305, + "loss/crossentropy": 2.6715872287750244, + "loss/hidden": 0.43359375, + "loss/logits": 0.10801969468593597, + "loss/reg": 21730.689453125, + "step": 974 + }, + { + "epoch": 0.024375, + "grad_norm": 0.7632473111152649, + "grad_norm_var": 0.07774221154281466, + "learning_rate": 0.0001, + "loss": 21654.6543, + "loss/crossentropy": 2.3023362159729004, + "loss/hidden": 0.39453125, + "loss/logits": 0.08346745371818542, + "loss/reg": 21654.17578125, + "step": 975 + }, + { + "epoch": 0.0244, + "grad_norm": 0.892348051071167, + "grad_norm_var": 0.07905400420977847, + "learning_rate": 0.0001, + "loss": 21579.6055, + "loss/crossentropy": 2.7380852699279785, + "loss/hidden": 0.423828125, + "loss/logits": 0.09384720027446747, + "loss/reg": 21579.087890625, + "step": 976 + }, + { + "epoch": 0.024425, + "grad_norm": 1.179625153541565, + "grad_norm_var": 0.04128092908533317, + "learning_rate": 0.0001, + "loss": 21505.2305, + "loss/crossentropy": 2.5506160259246826, + "loss/hidden": 0.43359375, + "loss/logits": 0.11979290843009949, + "loss/reg": 21504.677734375, + "step": 977 + }, + { + "epoch": 0.02445, + "grad_norm": 0.9429466128349304, + "grad_norm_var": 0.042006827373164884, + "learning_rate": 0.0001, + "loss": 21429.9922, + "loss/crossentropy": 2.449470281600952, + "loss/hidden": 0.443359375, + "loss/logits": 0.11196134239435196, + "loss/reg": 21429.4375, + "step": 978 + }, + { + "epoch": 0.024475, + "grad_norm": 0.9729964733123779, + "grad_norm_var": 0.04167338083207805, + "learning_rate": 0.0001, + "loss": 21356.1621, + "loss/crossentropy": 2.6006290912628174, + "loss/hidden": 0.435546875, + "loss/logits": 0.09295792877674103, + "loss/reg": 21355.6328125, + "step": 979 + }, + { + "epoch": 0.0245, + "grad_norm": 1.66546630859375, + "grad_norm_var": 0.06391321931942666, + "learning_rate": 0.0001, + "loss": 21283.2324, + "loss/crossentropy": 2.4464926719665527, + "loss/hidden": 0.49609375, + "loss/logits": 0.15045657753944397, + "loss/reg": 21282.5859375, + "step": 980 + }, + { + "epoch": 0.024525, + "grad_norm": 1.159827709197998, + "grad_norm_var": 0.06410729346084636, + "learning_rate": 0.0001, + "loss": 21208.8262, + "loss/crossentropy": 2.473205089569092, + "loss/hidden": 0.451171875, + "loss/logits": 0.12011029571294785, + "loss/reg": 21208.255859375, + "step": 981 + }, + { + "epoch": 0.02455, + "grad_norm": 1.228529930114746, + "grad_norm_var": 0.06429186090149078, + "learning_rate": 0.0001, + "loss": 21135.3398, + "loss/crossentropy": 2.5950937271118164, + "loss/hidden": 0.462890625, + "loss/logits": 0.10239904373884201, + "loss/reg": 21134.775390625, + "step": 982 + }, + { + "epoch": 0.024575, + "grad_norm": 1.1297028064727783, + "grad_norm_var": 0.06043656618327204, + "learning_rate": 0.0001, + "loss": 21061.5859, + "loss/crossentropy": 2.5663630962371826, + "loss/hidden": 0.44921875, + "loss/logits": 0.11027559638023376, + "loss/reg": 21061.02734375, + "step": 983 + }, + { + "epoch": 0.0246, + "grad_norm": 0.8357163667678833, + "grad_norm_var": 0.06057721585537804, + "learning_rate": 0.0001, + "loss": 20987.8066, + "loss/crossentropy": 2.509295701980591, + "loss/hidden": 0.435546875, + "loss/logits": 0.0971388965845108, + "loss/reg": 20987.2734375, + "step": 984 + }, + { + "epoch": 0.024625, + "grad_norm": 0.9624331593513489, + "grad_norm_var": 0.05976374893259659, + "learning_rate": 0.0001, + "loss": 20915.5469, + "loss/crossentropy": 2.6645829677581787, + "loss/hidden": 0.4296875, + "loss/logits": 0.09732533991336823, + "loss/reg": 20915.01953125, + "step": 985 + }, + { + "epoch": 0.02465, + "grad_norm": 0.921421468257904, + "grad_norm_var": 0.058529388388714056, + "learning_rate": 0.0001, + "loss": 20843.3809, + "loss/crossentropy": 2.5907278060913086, + "loss/hidden": 0.423828125, + "loss/logits": 0.08527834713459015, + "loss/reg": 20842.87109375, + "step": 986 + }, + { + "epoch": 0.024675, + "grad_norm": 0.936118483543396, + "grad_norm_var": 0.045535446772327605, + "learning_rate": 0.0001, + "loss": 20768.4512, + "loss/crossentropy": 2.6672019958496094, + "loss/hidden": 0.427734375, + "loss/logits": 0.10828252136707306, + "loss/reg": 20767.916015625, + "step": 987 + }, + { + "epoch": 0.0247, + "grad_norm": 0.9817948341369629, + "grad_norm_var": 0.04450477613690406, + "learning_rate": 0.0001, + "loss": 20696.8027, + "loss/crossentropy": 2.604332447052002, + "loss/hidden": 0.435546875, + "loss/logits": 0.09101539850234985, + "loss/reg": 20696.275390625, + "step": 988 + }, + { + "epoch": 0.024725, + "grad_norm": 0.8676700592041016, + "grad_norm_var": 0.045682872994934995, + "learning_rate": 0.0001, + "loss": 20624.293, + "loss/crossentropy": 2.532348394393921, + "loss/hidden": 0.421875, + "loss/logits": 0.07284273207187653, + "loss/reg": 20623.798828125, + "step": 989 + }, + { + "epoch": 0.02475, + "grad_norm": 0.9078096151351929, + "grad_norm_var": 0.04658485699583158, + "learning_rate": 0.0001, + "loss": 20553.4355, + "loss/crossentropy": 2.8214199542999268, + "loss/hidden": 0.4296875, + "loss/logits": 0.09346893429756165, + "loss/reg": 20552.912109375, + "step": 990 + }, + { + "epoch": 0.024775, + "grad_norm": 0.9056034088134766, + "grad_norm_var": 0.042945254504367, + "learning_rate": 0.0001, + "loss": 20482.8789, + "loss/crossentropy": 2.4334371089935303, + "loss/hidden": 0.44921875, + "loss/logits": 0.0925212949514389, + "loss/reg": 20482.337890625, + "step": 991 + }, + { + "epoch": 0.0248, + "grad_norm": 1.03048837184906, + "grad_norm_var": 0.041591031305633054, + "learning_rate": 0.0001, + "loss": 20411.1191, + "loss/crossentropy": 2.5874595642089844, + "loss/hidden": 0.466796875, + "loss/logits": 0.08578906953334808, + "loss/reg": 20410.56640625, + "step": 992 + }, + { + "epoch": 0.024825, + "grad_norm": 0.9951749444007874, + "grad_norm_var": 0.04026533365937889, + "learning_rate": 0.0001, + "loss": 20341.043, + "loss/crossentropy": 2.9441099166870117, + "loss/hidden": 0.431640625, + "loss/logits": 0.10317258536815643, + "loss/reg": 20340.5078125, + "step": 993 + }, + { + "epoch": 0.02485, + "grad_norm": 0.7870285511016846, + "grad_norm_var": 0.04354733117220501, + "learning_rate": 0.0001, + "loss": 20270.502, + "loss/crossentropy": 2.565255641937256, + "loss/hidden": 0.431640625, + "loss/logits": 0.10767176747322083, + "loss/reg": 20269.962890625, + "step": 994 + }, + { + "epoch": 0.024875, + "grad_norm": 1.132638692855835, + "grad_norm_var": 0.044182544542965836, + "learning_rate": 0.0001, + "loss": 20200.2891, + "loss/crossentropy": 2.9740242958068848, + "loss/hidden": 0.4453125, + "loss/logits": 0.10422468930482864, + "loss/reg": 20199.740234375, + "step": 995 + }, + { + "epoch": 0.0249, + "grad_norm": 0.9809736013412476, + "grad_norm_var": 0.015283599422464636, + "learning_rate": 0.0001, + "loss": 20130.6875, + "loss/crossentropy": 2.6761510372161865, + "loss/hidden": 0.443359375, + "loss/logits": 0.11210547387599945, + "loss/reg": 20130.1328125, + "step": 996 + }, + { + "epoch": 0.024925, + "grad_norm": 1.2750529050827026, + "grad_norm_var": 0.018796527883273944, + "learning_rate": 0.0001, + "loss": 20061.7246, + "loss/crossentropy": 2.713996171951294, + "loss/hidden": 0.4765625, + "loss/logits": 0.12018194794654846, + "loss/reg": 20061.126953125, + "step": 997 + }, + { + "epoch": 0.02495, + "grad_norm": 1.0360372066497803, + "grad_norm_var": 0.015051539828577971, + "learning_rate": 0.0001, + "loss": 19993.0957, + "loss/crossentropy": 2.730985641479492, + "loss/hidden": 0.431640625, + "loss/logits": 0.08067913353443146, + "loss/reg": 19992.583984375, + "step": 998 + }, + { + "epoch": 0.024975, + "grad_norm": 0.8395718336105347, + "grad_norm_var": 0.01453511121293259, + "learning_rate": 0.0001, + "loss": 19919.8027, + "loss/crossentropy": 2.7646329402923584, + "loss/hidden": 0.4296875, + "loss/logits": 0.09436003118753433, + "loss/reg": 19919.279296875, + "step": 999 + }, + { + "epoch": 0.025, + "grad_norm": 1.1801493167877197, + "grad_norm_var": 0.016140098413421084, + "learning_rate": 0.0001, + "loss": 19851.1016, + "loss/crossentropy": 3.0086069107055664, + "loss/hidden": 0.47265625, + "loss/logits": 0.10071013867855072, + "loss/reg": 19850.52734375, + "step": 1000 + }, + { + "epoch": 0.025025, + "grad_norm": 0.9698167443275452, + "grad_norm_var": 0.016122521849981183, + "learning_rate": 0.0001, + "loss": 19781.9707, + "loss/crossentropy": 2.813955068588257, + "loss/hidden": 0.427734375, + "loss/logits": 0.0990939736366272, + "loss/reg": 19781.443359375, + "step": 1001 + }, + { + "epoch": 0.02505, + "grad_norm": 0.8948052525520325, + "grad_norm_var": 0.016389621821141617, + "learning_rate": 0.0001, + "loss": 19714.3203, + "loss/crossentropy": 2.843393325805664, + "loss/hidden": 0.427734375, + "loss/logits": 0.09265361726284027, + "loss/reg": 19713.80078125, + "step": 1002 + }, + { + "epoch": 0.025075, + "grad_norm": 0.9143559336662292, + "grad_norm_var": 0.016553939447787987, + "learning_rate": 0.0001, + "loss": 19647.4551, + "loss/crossentropy": 2.903536558151245, + "loss/hidden": 0.419921875, + "loss/logits": 0.08771992474794388, + "loss/reg": 19646.947265625, + "step": 1003 + }, + { + "epoch": 0.0251, + "grad_norm": 0.8601119518280029, + "grad_norm_var": 0.017469476934411195, + "learning_rate": 0.0001, + "loss": 19580.8223, + "loss/crossentropy": 2.6393449306488037, + "loss/hidden": 0.42578125, + "loss/logits": 0.08560352772474289, + "loss/reg": 19580.310546875, + "step": 1004 + }, + { + "epoch": 0.025125, + "grad_norm": 1.1371523141860962, + "grad_norm_var": 0.01820280433152129, + "learning_rate": 0.0001, + "loss": 19514.7129, + "loss/crossentropy": 2.5199880599975586, + "loss/hidden": 0.43359375, + "loss/logits": 0.08653436601161957, + "loss/reg": 19514.193359375, + "step": 1005 + }, + { + "epoch": 0.02515, + "grad_norm": 1.3092695474624634, + "grad_norm_var": 0.023853796794785314, + "learning_rate": 0.0001, + "loss": 19447.8926, + "loss/crossentropy": 2.913530111312866, + "loss/hidden": 0.4296875, + "loss/logits": 0.09430601447820663, + "loss/reg": 19447.369140625, + "step": 1006 + }, + { + "epoch": 0.025175, + "grad_norm": 1.0356576442718506, + "grad_norm_var": 0.023005009456557406, + "learning_rate": 0.0001, + "loss": 19380.7793, + "loss/crossentropy": 2.6900110244750977, + "loss/hidden": 0.43359375, + "loss/logits": 0.07939346134662628, + "loss/reg": 19380.265625, + "step": 1007 + }, + { + "epoch": 0.0252, + "grad_norm": 0.8629568219184875, + "grad_norm_var": 0.024606272502219525, + "learning_rate": 0.0001, + "loss": 19315.0293, + "loss/crossentropy": 3.0058045387268066, + "loss/hidden": 0.419921875, + "loss/logits": 0.08857347071170807, + "loss/reg": 19314.521484375, + "step": 1008 + }, + { + "epoch": 0.025225, + "grad_norm": 1.0065983533859253, + "grad_norm_var": 0.024587016577246778, + "learning_rate": 0.0001, + "loss": 19249.4434, + "loss/crossentropy": 2.8189473152160645, + "loss/hidden": 0.46875, + "loss/logits": 0.1279560625553131, + "loss/reg": 19248.845703125, + "step": 1009 + }, + { + "epoch": 0.02525, + "grad_norm": 0.8587822318077087, + "grad_norm_var": 0.022738422167231778, + "learning_rate": 0.0001, + "loss": 19183.877, + "loss/crossentropy": 2.836906671524048, + "loss/hidden": 0.42578125, + "loss/logits": 0.08942188322544098, + "loss/reg": 19183.361328125, + "step": 1010 + }, + { + "epoch": 0.025275, + "grad_norm": 0.9735855460166931, + "grad_norm_var": 0.021896248833306705, + "learning_rate": 0.0001, + "loss": 19119.3652, + "loss/crossentropy": 2.576018810272217, + "loss/hidden": 0.44921875, + "loss/logits": 0.09336192905902863, + "loss/reg": 19118.822265625, + "step": 1011 + }, + { + "epoch": 0.0253, + "grad_norm": 0.8476600646972656, + "grad_norm_var": 0.02349506820003923, + "learning_rate": 0.0001, + "loss": 19054.9141, + "loss/crossentropy": 2.7196743488311768, + "loss/hidden": 0.43359375, + "loss/logits": 0.08736908435821533, + "loss/reg": 19054.392578125, + "step": 1012 + }, + { + "epoch": 0.025325, + "grad_norm": 1.0779842138290405, + "grad_norm_var": 0.018697648296842277, + "learning_rate": 0.0001, + "loss": 18990.8809, + "loss/crossentropy": 2.90828537940979, + "loss/hidden": 0.435546875, + "loss/logits": 0.09901103377342224, + "loss/reg": 18990.345703125, + "step": 1013 + }, + { + "epoch": 0.02535, + "grad_norm": 1.0291345119476318, + "grad_norm_var": 0.01865621313936613, + "learning_rate": 0.0001, + "loss": 18926.7422, + "loss/crossentropy": 2.5710670948028564, + "loss/hidden": 0.435546875, + "loss/logits": 0.09601636230945587, + "loss/reg": 18926.2109375, + "step": 1014 + }, + { + "epoch": 0.025375, + "grad_norm": 1.037187933921814, + "grad_norm_var": 0.017203204215706987, + "learning_rate": 0.0001, + "loss": 18863.2285, + "loss/crossentropy": 2.9131431579589844, + "loss/hidden": 0.435546875, + "loss/logits": 0.09500589966773987, + "loss/reg": 18862.697265625, + "step": 1015 + }, + { + "epoch": 0.0254, + "grad_norm": 0.9352390766143799, + "grad_norm_var": 0.015059517364305879, + "learning_rate": 0.0001, + "loss": 18796.5547, + "loss/crossentropy": 2.563702344894409, + "loss/hidden": 0.4296875, + "loss/logits": 0.09276743978261948, + "loss/reg": 18796.033203125, + "step": 1016 + }, + { + "epoch": 0.025425, + "grad_norm": 1.01382577419281, + "grad_norm_var": 0.015095031731177247, + "learning_rate": 0.0001, + "loss": 18733.0645, + "loss/crossentropy": 2.8074276447296143, + "loss/hidden": 0.4296875, + "loss/logits": 0.09178042411804199, + "loss/reg": 18732.54296875, + "step": 1017 + }, + { + "epoch": 0.02545, + "grad_norm": 0.8776202201843262, + "grad_norm_var": 0.01532506926756169, + "learning_rate": 0.0001, + "loss": 18668.4941, + "loss/crossentropy": 2.913670063018799, + "loss/hidden": 0.421875, + "loss/logits": 0.0906159058213234, + "loss/reg": 18667.982421875, + "step": 1018 + }, + { + "epoch": 0.025475, + "grad_norm": 0.9331912994384766, + "grad_norm_var": 0.015167140696689533, + "learning_rate": 0.0001, + "loss": 18606.2109, + "loss/crossentropy": 3.2162771224975586, + "loss/hidden": 0.443359375, + "loss/logits": 0.09538498520851135, + "loss/reg": 18605.671875, + "step": 1019 + }, + { + "epoch": 0.0255, + "grad_norm": 0.7805931568145752, + "grad_norm_var": 0.01691029687888581, + "learning_rate": 0.0001, + "loss": 18543.0703, + "loss/crossentropy": 2.379256248474121, + "loss/hidden": 0.41796875, + "loss/logits": 0.08216649293899536, + "loss/reg": 18542.5703125, + "step": 1020 + }, + { + "epoch": 0.025525, + "grad_norm": 1.035799264907837, + "grad_norm_var": 0.015459385370742717, + "learning_rate": 0.0001, + "loss": 18481.0234, + "loss/crossentropy": 2.7980172634124756, + "loss/hidden": 0.4296875, + "loss/logits": 0.09035004675388336, + "loss/reg": 18480.50390625, + "step": 1021 + }, + { + "epoch": 0.02555, + "grad_norm": 1.1896371841430664, + "grad_norm_var": 0.011036991252401155, + "learning_rate": 0.0001, + "loss": 18417.9668, + "loss/crossentropy": 2.533511161804199, + "loss/hidden": 0.5, + "loss/logits": 0.10565692186355591, + "loss/reg": 18417.361328125, + "step": 1022 + }, + { + "epoch": 0.025575, + "grad_norm": 1.1064552068710327, + "grad_norm_var": 0.011984528561376573, + "learning_rate": 0.0001, + "loss": 18354.793, + "loss/crossentropy": 2.736083507537842, + "loss/hidden": 0.435546875, + "loss/logits": 0.08818010985851288, + "loss/reg": 18354.26953125, + "step": 1023 + }, + { + "epoch": 0.0256, + "grad_norm": 0.9937368631362915, + "grad_norm_var": 0.011136538349077263, + "learning_rate": 0.0001, + "loss": 18292.7598, + "loss/crossentropy": 2.8839433193206787, + "loss/hidden": 0.43359375, + "loss/logits": 0.08257324993610382, + "loss/reg": 18292.244140625, + "step": 1024 + }, + { + "epoch": 0.025625, + "grad_norm": 1.1220980882644653, + "grad_norm_var": 0.012363521642834444, + "learning_rate": 0.0001, + "loss": 18230.7578, + "loss/crossentropy": 2.7472383975982666, + "loss/hidden": 0.466796875, + "loss/logits": 0.09016968309879303, + "loss/reg": 18230.201171875, + "step": 1025 + }, + { + "epoch": 0.02565, + "grad_norm": 0.7697837948799133, + "grad_norm_var": 0.014395284169400214, + "learning_rate": 0.0001, + "loss": 18169.1855, + "loss/crossentropy": 2.7969915866851807, + "loss/hidden": 0.423828125, + "loss/logits": 0.0929093211889267, + "loss/reg": 18168.66796875, + "step": 1026 + }, + { + "epoch": 0.025675, + "grad_norm": 0.9489984512329102, + "grad_norm_var": 0.014463014787380013, + "learning_rate": 0.0001, + "loss": 18107.4141, + "loss/crossentropy": 2.691138505935669, + "loss/hidden": 0.427734375, + "loss/logits": 0.07996124029159546, + "loss/reg": 18106.90625, + "step": 1027 + }, + { + "epoch": 0.0257, + "grad_norm": 0.977716863155365, + "grad_norm_var": 0.013204760693864634, + "learning_rate": 0.0001, + "loss": 18045.3945, + "loss/crossentropy": 2.789412260055542, + "loss/hidden": 0.4296875, + "loss/logits": 0.08714571595191956, + "loss/reg": 18044.876953125, + "step": 1028 + }, + { + "epoch": 0.025725, + "grad_norm": 1.0606443881988525, + "grad_norm_var": 0.013018545875161122, + "learning_rate": 0.0001, + "loss": 17983.6035, + "loss/crossentropy": 2.911621570587158, + "loss/hidden": 0.43359375, + "loss/logits": 0.08721598237752914, + "loss/reg": 17983.08203125, + "step": 1029 + }, + { + "epoch": 0.02575, + "grad_norm": 1.029701590538025, + "grad_norm_var": 0.013021658868217875, + "learning_rate": 0.0001, + "loss": 17921.9004, + "loss/crossentropy": 2.683985948562622, + "loss/hidden": 0.451171875, + "loss/logits": 0.10340292751789093, + "loss/reg": 17921.345703125, + "step": 1030 + }, + { + "epoch": 0.025775, + "grad_norm": 0.9352401494979858, + "grad_norm_var": 0.013006222781651882, + "learning_rate": 0.0001, + "loss": 17860.1621, + "loss/crossentropy": 2.820894718170166, + "loss/hidden": 0.423828125, + "loss/logits": 0.09438331425189972, + "loss/reg": 17859.64453125, + "step": 1031 + }, + { + "epoch": 0.0258, + "grad_norm": 1.0464032888412476, + "grad_norm_var": 0.013087071999497308, + "learning_rate": 0.0001, + "loss": 17799.5801, + "loss/crossentropy": 2.778580665588379, + "loss/hidden": 0.431640625, + "loss/logits": 0.09593655169010162, + "loss/reg": 17799.052734375, + "step": 1032 + }, + { + "epoch": 0.025825, + "grad_norm": 0.9822232723236084, + "grad_norm_var": 0.013044211622835651, + "learning_rate": 0.0001, + "loss": 17738.7188, + "loss/crossentropy": 3.0002071857452393, + "loss/hidden": 0.43359375, + "loss/logits": 0.08786912262439728, + "loss/reg": 17738.197265625, + "step": 1033 + }, + { + "epoch": 0.02585, + "grad_norm": 0.7570947408676147, + "grad_norm_var": 0.015707684759895278, + "learning_rate": 0.0001, + "loss": 17678.4766, + "loss/crossentropy": 2.6046817302703857, + "loss/hidden": 0.4140625, + "loss/logits": 0.07064966857433319, + "loss/reg": 17677.9921875, + "step": 1034 + }, + { + "epoch": 0.025875, + "grad_norm": 1.0941604375839233, + "grad_norm_var": 0.01633682153482544, + "learning_rate": 0.0001, + "loss": 17619.3457, + "loss/crossentropy": 2.923370838165283, + "loss/hidden": 0.427734375, + "loss/logits": 0.0829104632139206, + "loss/reg": 17618.8359375, + "step": 1035 + }, + { + "epoch": 0.0259, + "grad_norm": 0.908995509147644, + "grad_norm_var": 0.0137925518327591, + "learning_rate": 0.0001, + "loss": 17559.7441, + "loss/crossentropy": 2.863892078399658, + "loss/hidden": 0.427734375, + "loss/logits": 0.09001794457435608, + "loss/reg": 17559.2265625, + "step": 1036 + }, + { + "epoch": 0.025925, + "grad_norm": 0.9213618040084839, + "grad_norm_var": 0.014025414720603162, + "learning_rate": 0.0001, + "loss": 17500.082, + "loss/crossentropy": 2.89579176902771, + "loss/hidden": 0.423828125, + "loss/logits": 0.08792084455490112, + "loss/reg": 17499.5703125, + "step": 1037 + }, + { + "epoch": 0.02595, + "grad_norm": 0.781911313533783, + "grad_norm_var": 0.013576918695598754, + "learning_rate": 0.0001, + "loss": 17440.498, + "loss/crossentropy": 2.9025070667266846, + "loss/hidden": 0.41796875, + "loss/logits": 0.08638165146112442, + "loss/reg": 17439.994140625, + "step": 1038 + }, + { + "epoch": 0.025975, + "grad_norm": 1.1047182083129883, + "grad_norm_var": 0.013544295980080993, + "learning_rate": 0.0001, + "loss": 17382.502, + "loss/crossentropy": 2.6538867950439453, + "loss/hidden": 0.447265625, + "loss/logits": 0.08192922174930573, + "loss/reg": 17381.97265625, + "step": 1039 + }, + { + "epoch": 0.026, + "grad_norm": 0.8481978178024292, + "grad_norm_var": 0.014304181767292675, + "learning_rate": 0.0001, + "loss": 17323.1992, + "loss/crossentropy": 2.7071163654327393, + "loss/hidden": 0.44140625, + "loss/logits": 0.08912265300750732, + "loss/reg": 17322.66796875, + "step": 1040 + }, + { + "epoch": 0.026025, + "grad_norm": 0.892241358757019, + "grad_norm_var": 0.012502882263243704, + "learning_rate": 0.0001, + "loss": 17265.3945, + "loss/crossentropy": 3.038763999938965, + "loss/hidden": 0.431640625, + "loss/logits": 0.08531604707241058, + "loss/reg": 17264.876953125, + "step": 1041 + }, + { + "epoch": 0.02605, + "grad_norm": 1.0308339595794678, + "grad_norm_var": 0.010795231235618511, + "learning_rate": 0.0001, + "loss": 17199.1367, + "loss/crossentropy": 2.9324772357940674, + "loss/hidden": 0.4296875, + "loss/logits": 0.08472186326980591, + "loss/reg": 17198.623046875, + "step": 1042 + }, + { + "epoch": 0.026075, + "grad_norm": 0.9274694919586182, + "grad_norm_var": 0.010848683179298983, + "learning_rate": 0.0001, + "loss": 17139.7949, + "loss/crossentropy": 2.8589069843292236, + "loss/hidden": 0.4375, + "loss/logits": 0.08558094501495361, + "loss/reg": 17139.271484375, + "step": 1043 + }, + { + "epoch": 0.0261, + "grad_norm": 1.3131765127182007, + "grad_norm_var": 0.01884521092897805, + "learning_rate": 0.0001, + "loss": 17080.8555, + "loss/crossentropy": 2.664062023162842, + "loss/hidden": 0.427734375, + "loss/logits": 0.08755569905042648, + "loss/reg": 17080.33984375, + "step": 1044 + }, + { + "epoch": 0.026125, + "grad_norm": 1.1463737487792969, + "grad_norm_var": 0.020258964221098318, + "learning_rate": 0.0001, + "loss": 17022.043, + "loss/crossentropy": 2.744938373565674, + "loss/hidden": 0.439453125, + "loss/logits": 0.09745976328849792, + "loss/reg": 17021.505859375, + "step": 1045 + }, + { + "epoch": 0.02615, + "grad_norm": 1.1437360048294067, + "grad_norm_var": 0.02178928738922002, + "learning_rate": 0.0001, + "loss": 16964.625, + "loss/crossentropy": 3.15686297416687, + "loss/hidden": 0.43359375, + "loss/logits": 0.10271066427230835, + "loss/reg": 16964.087890625, + "step": 1046 + }, + { + "epoch": 0.026175, + "grad_norm": 1.1650415658950806, + "grad_norm_var": 0.023423204944099264, + "learning_rate": 0.0001, + "loss": 16906.3867, + "loss/crossentropy": 2.66678786277771, + "loss/hidden": 0.443359375, + "loss/logits": 0.11273590475320816, + "loss/reg": 16905.830078125, + "step": 1047 + }, + { + "epoch": 0.0262, + "grad_norm": 1.231393814086914, + "grad_norm_var": 0.026608036552323223, + "learning_rate": 0.0001, + "loss": 16848.3691, + "loss/crossentropy": 2.6733105182647705, + "loss/hidden": 0.466796875, + "loss/logits": 0.09650056064128876, + "loss/reg": 16847.806640625, + "step": 1048 + }, + { + "epoch": 0.026225, + "grad_norm": 0.9987756609916687, + "grad_norm_var": 0.026551590936127904, + "learning_rate": 0.0001, + "loss": 16789.7051, + "loss/crossentropy": 2.7861099243164062, + "loss/hidden": 0.455078125, + "loss/logits": 0.08518000692129135, + "loss/reg": 16789.1640625, + "step": 1049 + }, + { + "epoch": 0.02625, + "grad_norm": 0.8940253853797913, + "grad_norm_var": 0.022985704412560048, + "learning_rate": 0.0001, + "loss": 16732.5352, + "loss/crossentropy": 2.9168479442596436, + "loss/hidden": 0.4296875, + "loss/logits": 0.08574089407920837, + "loss/reg": 16732.01953125, + "step": 1050 + }, + { + "epoch": 0.026275, + "grad_norm": 0.9486713409423828, + "grad_norm_var": 0.022969959845412412, + "learning_rate": 0.0001, + "loss": 16675.8574, + "loss/crossentropy": 2.9892466068267822, + "loss/hidden": 0.435546875, + "loss/logits": 0.09711939096450806, + "loss/reg": 16675.32421875, + "step": 1051 + }, + { + "epoch": 0.0263, + "grad_norm": 1.0788503885269165, + "grad_norm_var": 0.02234845552669828, + "learning_rate": 0.0001, + "loss": 16619.1816, + "loss/crossentropy": 3.05783748626709, + "loss/hidden": 0.4609375, + "loss/logits": 0.08326160907745361, + "loss/reg": 16618.63671875, + "step": 1052 + }, + { + "epoch": 0.026325, + "grad_norm": 0.9396947622299194, + "grad_norm_var": 0.022112037925466396, + "learning_rate": 0.0001, + "loss": 16562.8984, + "loss/crossentropy": 2.759321451187134, + "loss/hidden": 0.41015625, + "loss/logits": 0.07942262291908264, + "loss/reg": 16562.408203125, + "step": 1053 + }, + { + "epoch": 0.02635, + "grad_norm": 0.9998372197151184, + "grad_norm_var": 0.017934968504518863, + "learning_rate": 0.0001, + "loss": 16507.1777, + "loss/crossentropy": 2.9365243911743164, + "loss/hidden": 0.43359375, + "loss/logits": 0.09559677541255951, + "loss/reg": 16506.6484375, + "step": 1054 + }, + { + "epoch": 0.026375, + "grad_norm": 0.9944206476211548, + "grad_norm_var": 0.017764722025300238, + "learning_rate": 0.0001, + "loss": 16451.0293, + "loss/crossentropy": 2.7758195400238037, + "loss/hidden": 0.48046875, + "loss/logits": 0.10010755807161331, + "loss/reg": 16450.44921875, + "step": 1055 + }, + { + "epoch": 0.0264, + "grad_norm": 0.9697908163070679, + "grad_norm_var": 0.015667620645066178, + "learning_rate": 0.0001, + "loss": 16395.4805, + "loss/crossentropy": 2.854970693588257, + "loss/hidden": 0.435546875, + "loss/logits": 0.09074226021766663, + "loss/reg": 16394.955078125, + "step": 1056 + }, + { + "epoch": 0.026425, + "grad_norm": 0.9877775311470032, + "grad_norm_var": 0.014328562051209323, + "learning_rate": 0.0001, + "loss": 16340.9678, + "loss/crossentropy": 2.782637357711792, + "loss/hidden": 0.4609375, + "loss/logits": 0.1002429649233818, + "loss/reg": 16340.40625, + "step": 1057 + }, + { + "epoch": 0.02645, + "grad_norm": 0.8147690296173096, + "grad_norm_var": 0.017744210942317786, + "learning_rate": 0.0001, + "loss": 16285.5225, + "loss/crossentropy": 2.6116652488708496, + "loss/hidden": 0.431640625, + "loss/logits": 0.08781687915325165, + "loss/reg": 16285.0029296875, + "step": 1058 + }, + { + "epoch": 0.026475, + "grad_norm": 0.8014287352561951, + "grad_norm_var": 0.020537691856836307, + "learning_rate": 0.0001, + "loss": 16232.4189, + "loss/crossentropy": 2.857954263687134, + "loss/hidden": 0.43359375, + "loss/logits": 0.09286174923181534, + "loss/reg": 16231.892578125, + "step": 1059 + }, + { + "epoch": 0.0265, + "grad_norm": 0.8132032752037048, + "grad_norm_var": 0.01706595369942955, + "learning_rate": 0.0001, + "loss": 16179.2793, + "loss/crossentropy": 2.768078088760376, + "loss/hidden": 0.43359375, + "loss/logits": 0.0950530394911766, + "loss/reg": 16178.7509765625, + "step": 1060 + }, + { + "epoch": 0.026525, + "grad_norm": 0.8404484987258911, + "grad_norm_var": 0.01676066406379461, + "learning_rate": 0.0001, + "loss": 16125.2832, + "loss/crossentropy": 2.6537022590637207, + "loss/hidden": 0.423828125, + "loss/logits": 0.07667912542819977, + "loss/reg": 16124.7822265625, + "step": 1061 + }, + { + "epoch": 0.02655, + "grad_norm": 0.9588916897773743, + "grad_norm_var": 0.014771155375485737, + "learning_rate": 0.0001, + "loss": 16071.2188, + "loss/crossentropy": 2.582611322402954, + "loss/hidden": 0.423828125, + "loss/logits": 0.08356771618127823, + "loss/reg": 16070.7109375, + "step": 1062 + }, + { + "epoch": 0.026575, + "grad_norm": 0.9827629327774048, + "grad_norm_var": 0.011981449334387865, + "learning_rate": 0.0001, + "loss": 16017.4893, + "loss/crossentropy": 2.8175463676452637, + "loss/hidden": 0.435546875, + "loss/logits": 0.0889303982257843, + "loss/reg": 16016.96484375, + "step": 1063 + }, + { + "epoch": 0.0266, + "grad_norm": 0.7705737352371216, + "grad_norm_var": 0.008174274018235516, + "learning_rate": 0.0001, + "loss": 15963.8389, + "loss/crossentropy": 2.9675939083099365, + "loss/hidden": 0.423828125, + "loss/logits": 0.08410842716693878, + "loss/reg": 15963.3310546875, + "step": 1064 + }, + { + "epoch": 0.026625, + "grad_norm": 0.7642571926116943, + "grad_norm_var": 0.009292926304177485, + "learning_rate": 0.0001, + "loss": 15910.2979, + "loss/crossentropy": 2.987457752227783, + "loss/hidden": 0.435546875, + "loss/logits": 0.0909990519285202, + "loss/reg": 15909.771484375, + "step": 1065 + }, + { + "epoch": 0.02665, + "grad_norm": 0.812102198600769, + "grad_norm_var": 0.009886474071495129, + "learning_rate": 0.0001, + "loss": 15856.9854, + "loss/crossentropy": 2.946333885192871, + "loss/hidden": 0.4296875, + "loss/logits": 0.0913858711719513, + "loss/reg": 15856.4638671875, + "step": 1066 + }, + { + "epoch": 0.026675, + "grad_norm": 0.857896625995636, + "grad_norm_var": 0.00987100371605288, + "learning_rate": 0.0001, + "loss": 15803.7031, + "loss/crossentropy": 2.797611951828003, + "loss/hidden": 0.416015625, + "loss/logits": 0.07527288794517517, + "loss/reg": 15803.2119140625, + "step": 1067 + }, + { + "epoch": 0.0267, + "grad_norm": 1.0861632823944092, + "grad_norm_var": 0.010049544829034574, + "learning_rate": 0.0001, + "loss": 15751.3408, + "loss/crossentropy": 2.6403746604919434, + "loss/hidden": 0.46875, + "loss/logits": 0.08439454436302185, + "loss/reg": 15750.7880859375, + "step": 1068 + }, + { + "epoch": 0.026725, + "grad_norm": 0.9127049446105957, + "grad_norm_var": 0.009950880314127098, + "learning_rate": 0.0001, + "loss": 15698.7949, + "loss/crossentropy": 2.682400703430176, + "loss/hidden": 0.42578125, + "loss/logits": 0.08712255954742432, + "loss/reg": 15698.2822265625, + "step": 1069 + }, + { + "epoch": 0.02675, + "grad_norm": 0.9740332365036011, + "grad_norm_var": 0.009641912604132451, + "learning_rate": 0.0001, + "loss": 15646.957, + "loss/crossentropy": 2.686271905899048, + "loss/hidden": 0.44921875, + "loss/logits": 0.08891947567462921, + "loss/reg": 15646.4189453125, + "step": 1070 + }, + { + "epoch": 0.026775, + "grad_norm": 1.1772500276565552, + "grad_norm_var": 0.01412233905048681, + "learning_rate": 0.0001, + "loss": 15595.2686, + "loss/crossentropy": 2.958514928817749, + "loss/hidden": 0.43359375, + "loss/logits": 0.08424527198076248, + "loss/reg": 15594.7509765625, + "step": 1071 + }, + { + "epoch": 0.0268, + "grad_norm": 1.1505613327026367, + "grad_norm_var": 0.017659985188438164, + "learning_rate": 0.0001, + "loss": 15543.4844, + "loss/crossentropy": 2.8109195232391357, + "loss/hidden": 0.458984375, + "loss/logits": 0.10685601830482483, + "loss/reg": 15542.9189453125, + "step": 1072 + }, + { + "epoch": 0.026825, + "grad_norm": 0.9034889936447144, + "grad_norm_var": 0.017331644670570214, + "learning_rate": 0.0001, + "loss": 15491.6006, + "loss/crossentropy": 2.8358302116394043, + "loss/hidden": 0.427734375, + "loss/logits": 0.086671844124794, + "loss/reg": 15491.0859375, + "step": 1073 + }, + { + "epoch": 0.02685, + "grad_norm": 0.9926300048828125, + "grad_norm_var": 0.01696069494076884, + "learning_rate": 0.0001, + "loss": 15440.167, + "loss/crossentropy": 2.7901105880737305, + "loss/hidden": 0.419921875, + "loss/logits": 0.08056454360485077, + "loss/reg": 15439.6669921875, + "step": 1074 + }, + { + "epoch": 0.026875, + "grad_norm": 0.8253371715545654, + "grad_norm_var": 0.016602820780788348, + "learning_rate": 0.0001, + "loss": 15389.0439, + "loss/crossentropy": 2.8071136474609375, + "loss/hidden": 0.42578125, + "loss/logits": 0.08567114174365997, + "loss/reg": 15388.5322265625, + "step": 1075 + }, + { + "epoch": 0.0269, + "grad_norm": 0.8556614518165588, + "grad_norm_var": 0.016074706019418968, + "learning_rate": 0.0001, + "loss": 15338.8643, + "loss/crossentropy": 2.960524797439575, + "loss/hidden": 0.421875, + "loss/logits": 0.0796973779797554, + "loss/reg": 15338.3623046875, + "step": 1076 + }, + { + "epoch": 0.026925, + "grad_norm": 0.8220919370651245, + "grad_norm_var": 0.016312616474285932, + "learning_rate": 0.0001, + "loss": 15288.1484, + "loss/crossentropy": 2.902122735977173, + "loss/hidden": 0.419921875, + "loss/logits": 0.08749879151582718, + "loss/reg": 15287.640625, + "step": 1077 + }, + { + "epoch": 0.02695, + "grad_norm": 0.9153377413749695, + "grad_norm_var": 0.016251203350715674, + "learning_rate": 0.0001, + "loss": 15239.1123, + "loss/crossentropy": 2.5998146533966064, + "loss/hidden": 0.455078125, + "loss/logits": 0.07559297233819962, + "loss/reg": 15238.58203125, + "step": 1078 + }, + { + "epoch": 0.026975, + "grad_norm": 1.333935260772705, + "grad_norm_var": 0.026655112509867, + "learning_rate": 0.0001, + "loss": 15190.1279, + "loss/crossentropy": 2.974548101425171, + "loss/hidden": 0.48046875, + "loss/logits": 0.17960259318351746, + "loss/reg": 15189.4677734375, + "step": 1079 + }, + { + "epoch": 0.027, + "grad_norm": 0.8889738321304321, + "grad_norm_var": 0.024744090894334634, + "learning_rate": 0.0001, + "loss": 15138.1201, + "loss/crossentropy": 2.7455484867095947, + "loss/hidden": 0.439453125, + "loss/logits": 0.09583721309900284, + "loss/reg": 15137.5849609375, + "step": 1080 + }, + { + "epoch": 0.027025, + "grad_norm": 0.7190319895744324, + "grad_norm_var": 0.026019252870833044, + "learning_rate": 0.0001, + "loss": 15087.3291, + "loss/crossentropy": 2.798892021179199, + "loss/hidden": 0.42578125, + "loss/logits": 0.0866219699382782, + "loss/reg": 15086.81640625, + "step": 1081 + }, + { + "epoch": 0.02705, + "grad_norm": 0.7696980834007263, + "grad_norm_var": 0.026920904193799312, + "learning_rate": 0.0001, + "loss": 15037.7148, + "loss/crossentropy": 2.596310615539551, + "loss/hidden": 0.423828125, + "loss/logits": 0.07713499665260315, + "loss/reg": 15037.2138671875, + "step": 1082 + }, + { + "epoch": 0.027075, + "grad_norm": 0.9159994721412659, + "grad_norm_var": 0.026425733071442047, + "learning_rate": 0.0001, + "loss": 14988.4238, + "loss/crossentropy": 2.9121432304382324, + "loss/hidden": 0.43359375, + "loss/logits": 0.08162932097911835, + "loss/reg": 14987.908203125, + "step": 1083 + }, + { + "epoch": 0.0271, + "grad_norm": 0.910456657409668, + "grad_norm_var": 0.025228125400286894, + "learning_rate": 0.0001, + "loss": 14938.9277, + "loss/crossentropy": 2.826249837875366, + "loss/hidden": 0.4296875, + "loss/logits": 0.08516315370798111, + "loss/reg": 14938.4130859375, + "step": 1084 + }, + { + "epoch": 0.027125, + "grad_norm": 1.0477243661880493, + "grad_norm_var": 0.025845538428886157, + "learning_rate": 0.0001, + "loss": 14889.8086, + "loss/crossentropy": 2.8449409008026123, + "loss/hidden": 0.462890625, + "loss/logits": 0.10157535970211029, + "loss/reg": 14889.244140625, + "step": 1085 + }, + { + "epoch": 0.02715, + "grad_norm": 2.7801320552825928, + "grad_norm_var": 0.23547433160522477, + "learning_rate": 0.0001, + "loss": 14840.0693, + "loss/crossentropy": 3.2276058197021484, + "loss/hidden": 0.470703125, + "loss/logits": 0.08429836481809616, + "loss/reg": 14839.5146484375, + "step": 1086 + }, + { + "epoch": 0.027175, + "grad_norm": 1.2570152282714844, + "grad_norm_var": 0.2370868709294562, + "learning_rate": 0.0001, + "loss": 14790.6309, + "loss/crossentropy": 2.885909080505371, + "loss/hidden": 0.455078125, + "loss/logits": 0.10632868111133575, + "loss/reg": 14790.0693359375, + "step": 1087 + }, + { + "epoch": 0.0272, + "grad_norm": 0.9222579598426819, + "grad_norm_var": 0.23783146250132223, + "learning_rate": 0.0001, + "loss": 14741.3857, + "loss/crossentropy": 2.917914390563965, + "loss/hidden": 0.435546875, + "loss/logits": 0.08871741592884064, + "loss/reg": 14740.861328125, + "step": 1088 + }, + { + "epoch": 0.027225, + "grad_norm": 1.1597723960876465, + "grad_norm_var": 0.23680243597076794, + "learning_rate": 0.0001, + "loss": 14691.9111, + "loss/crossentropy": 3.21909499168396, + "loss/hidden": 0.466796875, + "loss/logits": 0.08862360566854477, + "loss/reg": 14691.35546875, + "step": 1089 + }, + { + "epoch": 0.02725, + "grad_norm": 1.7188750505447388, + "grad_norm_var": 0.262298855826787, + "learning_rate": 0.0001, + "loss": 14641.9336, + "loss/crossentropy": 2.998014450073242, + "loss/hidden": 0.4453125, + "loss/logits": 0.08976680040359497, + "loss/reg": 14641.3984375, + "step": 1090 + }, + { + "epoch": 0.027275, + "grad_norm": 0.9482136964797974, + "grad_norm_var": 0.2584944634799588, + "learning_rate": 0.0001, + "loss": 14591.6689, + "loss/crossentropy": 2.530045986175537, + "loss/hidden": 0.431640625, + "loss/logits": 0.07906611263751984, + "loss/reg": 14591.158203125, + "step": 1091 + }, + { + "epoch": 0.0273, + "grad_norm": 0.7725463509559631, + "grad_norm_var": 0.26188691520636426, + "learning_rate": 0.0001, + "loss": 14542.3398, + "loss/crossentropy": 2.439523935317993, + "loss/hidden": 0.435546875, + "loss/logits": 0.09834860265254974, + "loss/reg": 14541.8056640625, + "step": 1092 + }, + { + "epoch": 0.027325, + "grad_norm": 0.824009358882904, + "grad_norm_var": 0.2618115891237431, + "learning_rate": 0.0001, + "loss": 14493.7295, + "loss/crossentropy": 2.9190967082977295, + "loss/hidden": 0.435546875, + "loss/logits": 0.08553117513656616, + "loss/reg": 14493.2080078125, + "step": 1093 + }, + { + "epoch": 0.02735, + "grad_norm": 0.9745771288871765, + "grad_norm_var": 0.26043216037856104, + "learning_rate": 0.0001, + "loss": 14444.9678, + "loss/crossentropy": 2.7135884761810303, + "loss/hidden": 0.45703125, + "loss/logits": 0.12357691675424576, + "loss/reg": 14444.38671875, + "step": 1094 + }, + { + "epoch": 0.027375, + "grad_norm": 0.9248599410057068, + "grad_norm_var": 0.25930147487457583, + "learning_rate": 0.0001, + "loss": 14395.5244, + "loss/crossentropy": 2.629840850830078, + "loss/hidden": 0.4375, + "loss/logits": 0.09772805869579315, + "loss/reg": 14394.9892578125, + "step": 1095 + }, + { + "epoch": 0.0274, + "grad_norm": 0.9113340973854065, + "grad_norm_var": 0.25871584829829997, + "learning_rate": 0.0001, + "loss": 14347.1514, + "loss/crossentropy": 2.892179250717163, + "loss/hidden": 0.435546875, + "loss/logits": 0.0860610157251358, + "loss/reg": 14346.6298828125, + "step": 1096 + }, + { + "epoch": 0.027425, + "grad_norm": 1.135790467262268, + "grad_norm_var": 0.24855283365572234, + "learning_rate": 0.0001, + "loss": 14299.7197, + "loss/crossentropy": 2.764049768447876, + "loss/hidden": 0.435546875, + "loss/logits": 0.0889464020729065, + "loss/reg": 14299.1953125, + "step": 1097 + }, + { + "epoch": 0.02745, + "grad_norm": 1.0498467683792114, + "grad_norm_var": 0.24024881145662946, + "learning_rate": 0.0001, + "loss": 14252.5498, + "loss/crossentropy": 2.697889804840088, + "loss/hidden": 0.4375, + "loss/logits": 0.10231277346611023, + "loss/reg": 14252.009765625, + "step": 1098 + }, + { + "epoch": 0.027475, + "grad_norm": 0.903654158115387, + "grad_norm_var": 0.24062843081926943, + "learning_rate": 0.0001, + "loss": 14204.5557, + "loss/crossentropy": 2.7106425762176514, + "loss/hidden": 0.43359375, + "loss/logits": 0.09402255713939667, + "loss/reg": 14204.0283203125, + "step": 1099 + }, + { + "epoch": 0.0275, + "grad_norm": 0.7978147864341736, + "grad_norm_var": 0.24486993530923176, + "learning_rate": 0.0001, + "loss": 14156.9307, + "loss/crossentropy": 2.6107664108276367, + "loss/hidden": 0.423828125, + "loss/logits": 0.07774809002876282, + "loss/reg": 14156.4287109375, + "step": 1100 + }, + { + "epoch": 0.027525, + "grad_norm": 1.0636903047561646, + "grad_norm_var": 0.2447042768004019, + "learning_rate": 0.0001, + "loss": 14110.3281, + "loss/crossentropy": 2.850802421569824, + "loss/hidden": 0.482421875, + "loss/logits": 0.1095501109957695, + "loss/reg": 14109.736328125, + "step": 1101 + }, + { + "epoch": 0.02755, + "grad_norm": 0.854430615901947, + "grad_norm_var": 0.05381972596157554, + "learning_rate": 0.0001, + "loss": 14061.6553, + "loss/crossentropy": 2.9119491577148438, + "loss/hidden": 0.44140625, + "loss/logits": 0.09522178769111633, + "loss/reg": 14061.1181640625, + "step": 1102 + }, + { + "epoch": 0.027575, + "grad_norm": 1.118687391281128, + "grad_norm_var": 0.05052741212680799, + "learning_rate": 0.0001, + "loss": 14015.6572, + "loss/crossentropy": 2.852381944656372, + "loss/hidden": 0.4765625, + "loss/logits": 0.1372184008359909, + "loss/reg": 14015.04296875, + "step": 1103 + }, + { + "epoch": 0.0276, + "grad_norm": 0.9183930158615112, + "grad_norm_var": 0.05057099646219327, + "learning_rate": 0.0001, + "loss": 13969.3428, + "loss/crossentropy": 2.8287909030914307, + "loss/hidden": 0.439453125, + "loss/logits": 0.09352678060531616, + "loss/reg": 13968.8095703125, + "step": 1104 + }, + { + "epoch": 0.027625, + "grad_norm": 0.7984148859977722, + "grad_norm_var": 0.05126455750925533, + "learning_rate": 0.0001, + "loss": 13924.2598, + "loss/crossentropy": 2.8634836673736572, + "loss/hidden": 0.435546875, + "loss/logits": 0.08804089576005936, + "loss/reg": 13923.736328125, + "step": 1105 + }, + { + "epoch": 0.02765, + "grad_norm": 0.9173059463500977, + "grad_norm_var": 0.012688413073267653, + "learning_rate": 0.0001, + "loss": 13879.0557, + "loss/crossentropy": 2.5765674114227295, + "loss/hidden": 0.419921875, + "loss/logits": 0.074608713388443, + "loss/reg": 13878.5615234375, + "step": 1106 + }, + { + "epoch": 0.027675, + "grad_norm": 1.2418100833892822, + "grad_norm_var": 0.01870670627212404, + "learning_rate": 0.0001, + "loss": 13833.5557, + "loss/crossentropy": 2.7832295894622803, + "loss/hidden": 0.435546875, + "loss/logits": 0.09539151191711426, + "loss/reg": 13833.0244140625, + "step": 1107 + }, + { + "epoch": 0.0277, + "grad_norm": 1.3345333337783813, + "grad_norm_var": 0.025115599668681852, + "learning_rate": 0.0001, + "loss": 13788.3779, + "loss/crossentropy": 3.0659708976745605, + "loss/hidden": 0.455078125, + "loss/logits": 0.09437006711959839, + "loss/reg": 13787.828125, + "step": 1108 + }, + { + "epoch": 0.027725, + "grad_norm": 1.1107778549194336, + "grad_norm_var": 0.024077882887884157, + "learning_rate": 0.0001, + "loss": 13743.627, + "loss/crossentropy": 3.3273847103118896, + "loss/hidden": 0.4375, + "loss/logits": 0.09910184144973755, + "loss/reg": 13743.0908203125, + "step": 1109 + }, + { + "epoch": 0.02775, + "grad_norm": 0.7995766997337341, + "grad_norm_var": 0.02666670789335961, + "learning_rate": 0.0001, + "loss": 13698.3145, + "loss/crossentropy": 2.670424699783325, + "loss/hidden": 0.431640625, + "loss/logits": 0.08939287066459656, + "loss/reg": 13697.79296875, + "step": 1110 + }, + { + "epoch": 0.027775, + "grad_norm": 0.998966634273529, + "grad_norm_var": 0.026341033002484915, + "learning_rate": 0.0001, + "loss": 13653.4668, + "loss/crossentropy": 2.8975906372070312, + "loss/hidden": 0.439453125, + "loss/logits": 0.08185222744941711, + "loss/reg": 13652.9453125, + "step": 1111 + }, + { + "epoch": 0.0278, + "grad_norm": 0.9062837958335876, + "grad_norm_var": 0.026400439646917667, + "learning_rate": 0.0001, + "loss": 13608.8438, + "loss/crossentropy": 2.8450193405151367, + "loss/hidden": 0.478515625, + "loss/logits": 0.09909355640411377, + "loss/reg": 13608.2666015625, + "step": 1112 + }, + { + "epoch": 0.027825, + "grad_norm": 0.9913869500160217, + "grad_norm_var": 0.025029034203488957, + "learning_rate": 0.0001, + "loss": 13564.2236, + "loss/crossentropy": 2.94085955619812, + "loss/hidden": 0.4765625, + "loss/logits": 0.09758688509464264, + "loss/reg": 13563.6494140625, + "step": 1113 + }, + { + "epoch": 0.02785, + "grad_norm": 0.9357085824012756, + "grad_norm_var": 0.024899735922832737, + "learning_rate": 0.0001, + "loss": 13519.4248, + "loss/crossentropy": 2.7248375415802, + "loss/hidden": 0.451171875, + "loss/logits": 0.07659360766410828, + "loss/reg": 13518.8974609375, + "step": 1114 + }, + { + "epoch": 0.027875, + "grad_norm": 0.965599536895752, + "grad_norm_var": 0.02450309028379388, + "learning_rate": 0.0001, + "loss": 13475.3721, + "loss/crossentropy": 2.5047667026519775, + "loss/hidden": 0.466796875, + "loss/logits": 0.11319025605916977, + "loss/reg": 13474.7919921875, + "step": 1115 + }, + { + "epoch": 0.0279, + "grad_norm": 0.8854598999023438, + "grad_norm_var": 0.02280057994939105, + "learning_rate": 0.0001, + "loss": 13431.3467, + "loss/crossentropy": 2.8435001373291016, + "loss/hidden": 0.421875, + "loss/logits": 0.07650317251682281, + "loss/reg": 13430.8486328125, + "step": 1116 + }, + { + "epoch": 0.027925, + "grad_norm": 0.8546820878982544, + "grad_norm_var": 0.0234790594300057, + "learning_rate": 0.0001, + "loss": 13388.2959, + "loss/crossentropy": 2.8461828231811523, + "loss/hidden": 0.43359375, + "loss/logits": 0.0916643738746643, + "loss/reg": 13387.7705078125, + "step": 1117 + }, + { + "epoch": 0.02795, + "grad_norm": 0.9899805188179016, + "grad_norm_var": 0.022412165134545116, + "learning_rate": 0.0001, + "loss": 13344.9209, + "loss/crossentropy": 3.1090798377990723, + "loss/hidden": 0.455078125, + "loss/logits": 0.08351921290159225, + "loss/reg": 13344.3818359375, + "step": 1118 + }, + { + "epoch": 0.027975, + "grad_norm": 0.7054470777511597, + "grad_norm_var": 0.025745193825458406, + "learning_rate": 0.0001, + "loss": 13301.3359, + "loss/crossentropy": 2.9630494117736816, + "loss/hidden": 0.421875, + "loss/logits": 0.07988356798887253, + "loss/reg": 13300.833984375, + "step": 1119 + }, + { + "epoch": 0.028, + "grad_norm": 1.0503507852554321, + "grad_norm_var": 0.026107686898817883, + "learning_rate": 0.0001, + "loss": 13259.3818, + "loss/crossentropy": 2.9457664489746094, + "loss/hidden": 0.48828125, + "loss/logits": 0.10067403316497803, + "loss/reg": 13258.79296875, + "step": 1120 + }, + { + "epoch": 0.028025, + "grad_norm": 1.0407966375350952, + "grad_norm_var": 0.024302380428266622, + "learning_rate": 0.0001, + "loss": 13216.4863, + "loss/crossentropy": 2.856452703475952, + "loss/hidden": 0.47265625, + "loss/logits": 0.10159683227539062, + "loss/reg": 13215.912109375, + "step": 1121 + }, + { + "epoch": 0.02805, + "grad_norm": 0.8844307661056519, + "grad_norm_var": 0.024658072108102166, + "learning_rate": 0.0001, + "loss": 13173.7305, + "loss/crossentropy": 2.7508833408355713, + "loss/hidden": 0.470703125, + "loss/logits": 0.10203750431537628, + "loss/reg": 13173.158203125, + "step": 1122 + }, + { + "epoch": 0.028075, + "grad_norm": 0.8127800226211548, + "grad_norm_var": 0.02124211819669591, + "learning_rate": 0.0001, + "loss": 13131.751, + "loss/crossentropy": 2.946770668029785, + "loss/hidden": 0.43359375, + "loss/logits": 0.08815675228834152, + "loss/reg": 13131.2294921875, + "step": 1123 + }, + { + "epoch": 0.0281, + "grad_norm": 0.8686839938163757, + "grad_norm_var": 0.011180152481760535, + "learning_rate": 0.0001, + "loss": 13087.5605, + "loss/crossentropy": 2.7543065547943115, + "loss/hidden": 0.44921875, + "loss/logits": 0.10058388859033585, + "loss/reg": 13087.0107421875, + "step": 1124 + }, + { + "epoch": 0.028125, + "grad_norm": 0.895906388759613, + "grad_norm_var": 0.008744946401899689, + "learning_rate": 0.0001, + "loss": 13045.1602, + "loss/crossentropy": 3.002467155456543, + "loss/hidden": 0.4609375, + "loss/logits": 0.09118813276290894, + "loss/reg": 13044.6083984375, + "step": 1125 + }, + { + "epoch": 0.02815, + "grad_norm": 1.1829336881637573, + "grad_norm_var": 0.012202712169516027, + "learning_rate": 0.0001, + "loss": 13002.5703, + "loss/crossentropy": 3.0006842613220215, + "loss/hidden": 0.435546875, + "loss/logits": 0.09035210311412811, + "loss/reg": 13002.0439453125, + "step": 1126 + }, + { + "epoch": 0.028175, + "grad_norm": 1.4346967935562134, + "grad_norm_var": 0.02775118000169859, + "learning_rate": 0.0001, + "loss": 12960.9385, + "loss/crossentropy": 2.900606393814087, + "loss/hidden": 0.484375, + "loss/logits": 0.13488101959228516, + "loss/reg": 12960.3193359375, + "step": 1127 + }, + { + "epoch": 0.0282, + "grad_norm": 1.0146552324295044, + "grad_norm_var": 0.02766827493378264, + "learning_rate": 0.0001, + "loss": 12919.1104, + "loss/crossentropy": 3.1415045261383057, + "loss/hidden": 0.4375, + "loss/logits": 0.08919603377580643, + "loss/reg": 12918.583984375, + "step": 1128 + }, + { + "epoch": 0.028225, + "grad_norm": 1.0106333494186401, + "grad_norm_var": 0.027747352010968028, + "learning_rate": 0.0001, + "loss": 12876.6816, + "loss/crossentropy": 2.976088285446167, + "loss/hidden": 0.458984375, + "loss/logits": 0.09923793375492096, + "loss/reg": 12876.123046875, + "step": 1129 + }, + { + "epoch": 0.02825, + "grad_norm": 0.9112340211868286, + "grad_norm_var": 0.02789929156606991, + "learning_rate": 0.0001, + "loss": 12834.8223, + "loss/crossentropy": 2.673325538635254, + "loss/hidden": 0.44921875, + "loss/logits": 0.11222439259290695, + "loss/reg": 12834.2607421875, + "step": 1130 + }, + { + "epoch": 0.028275, + "grad_norm": 0.920089602470398, + "grad_norm_var": 0.028050992378277077, + "learning_rate": 0.0001, + "loss": 12793.7109, + "loss/crossentropy": 2.7545506954193115, + "loss/hidden": 0.435546875, + "loss/logits": 0.10355065017938614, + "loss/reg": 12793.171875, + "step": 1131 + }, + { + "epoch": 0.0283, + "grad_norm": 0.8590668439865112, + "grad_norm_var": 0.028379443047076913, + "learning_rate": 0.0001, + "loss": 12752.7422, + "loss/crossentropy": 2.784660577774048, + "loss/hidden": 0.4296875, + "loss/logits": 0.09297633171081543, + "loss/reg": 12752.2197265625, + "step": 1132 + }, + { + "epoch": 0.028325, + "grad_norm": 0.6936010718345642, + "grad_norm_var": 0.03236561028099718, + "learning_rate": 0.0001, + "loss": 12712.0752, + "loss/crossentropy": 2.6045994758605957, + "loss/hidden": 0.4296875, + "loss/logits": 0.07505609840154648, + "loss/reg": 12711.5703125, + "step": 1133 + }, + { + "epoch": 0.02835, + "grad_norm": 0.8183262944221497, + "grad_norm_var": 0.033399834386414716, + "learning_rate": 0.0001, + "loss": 12671.4336, + "loss/crossentropy": 2.6184515953063965, + "loss/hidden": 0.421875, + "loss/logits": 0.07417555153369904, + "loss/reg": 12670.9375, + "step": 1134 + }, + { + "epoch": 0.028375, + "grad_norm": 0.8885838985443115, + "grad_norm_var": 0.02967154541324343, + "learning_rate": 0.0001, + "loss": 12630.6895, + "loss/crossentropy": 3.2083587646484375, + "loss/hidden": 0.419921875, + "loss/logits": 0.0850653350353241, + "loss/reg": 12630.1845703125, + "step": 1135 + }, + { + "epoch": 0.0284, + "grad_norm": 0.8322352170944214, + "grad_norm_var": 0.02988425096504533, + "learning_rate": 0.0001, + "loss": 12590.1465, + "loss/crossentropy": 2.6865787506103516, + "loss/hidden": 0.4296875, + "loss/logits": 0.08578802645206451, + "loss/reg": 12589.630859375, + "step": 1136 + }, + { + "epoch": 0.028425, + "grad_norm": 0.8828990459442139, + "grad_norm_var": 0.02935810929242463, + "learning_rate": 0.0001, + "loss": 12549.917, + "loss/crossentropy": 2.778714179992676, + "loss/hidden": 0.4375, + "loss/logits": 0.09445519745349884, + "loss/reg": 12549.384765625, + "step": 1137 + }, + { + "epoch": 0.02845, + "grad_norm": 1.2958890199661255, + "grad_norm_var": 0.037333791847929315, + "learning_rate": 0.0001, + "loss": 12509.4854, + "loss/crossentropy": 3.0362274646759033, + "loss/hidden": 0.486328125, + "loss/logits": 0.11208359897136688, + "loss/reg": 12508.88671875, + "step": 1138 + }, + { + "epoch": 0.028475, + "grad_norm": 1.0555758476257324, + "grad_norm_var": 0.036328687068246525, + "learning_rate": 0.0001, + "loss": 12469.209, + "loss/crossentropy": 2.772933006286621, + "loss/hidden": 0.47265625, + "loss/logits": 0.0948249101638794, + "loss/reg": 12468.6416015625, + "step": 1139 + }, + { + "epoch": 0.0285, + "grad_norm": 0.9909307956695557, + "grad_norm_var": 0.0355654438528526, + "learning_rate": 0.0001, + "loss": 12429.7412, + "loss/crossentropy": 2.6208302974700928, + "loss/hidden": 0.478515625, + "loss/logits": 0.09450601041316986, + "loss/reg": 12429.16796875, + "step": 1140 + }, + { + "epoch": 0.028525, + "grad_norm": 0.8380599021911621, + "grad_norm_var": 0.03642668333360503, + "learning_rate": 0.0001, + "loss": 12390.4238, + "loss/crossentropy": 2.7336008548736572, + "loss/hidden": 0.451171875, + "loss/logits": 0.08721227943897247, + "loss/reg": 12389.8857421875, + "step": 1141 + }, + { + "epoch": 0.02855, + "grad_norm": 1.08302640914917, + "grad_norm_var": 0.03430513245419273, + "learning_rate": 0.0001, + "loss": 12350.8652, + "loss/crossentropy": 2.8139443397521973, + "loss/hidden": 0.47265625, + "loss/logits": 0.10447725653648376, + "loss/reg": 12350.2880859375, + "step": 1142 + }, + { + "epoch": 0.028575, + "grad_norm": 1.0056302547454834, + "grad_norm_var": 0.01926046379658614, + "learning_rate": 0.0001, + "loss": 12311.4238, + "loss/crossentropy": 2.7299814224243164, + "loss/hidden": 0.443359375, + "loss/logits": 0.09301269799470901, + "loss/reg": 12310.8876953125, + "step": 1143 + }, + { + "epoch": 0.0286, + "grad_norm": 1.0221171379089355, + "grad_norm_var": 0.019334461722572863, + "learning_rate": 0.0001, + "loss": 12272.2256, + "loss/crossentropy": 2.738447427749634, + "loss/hidden": 0.48046875, + "loss/logits": 0.12227018177509308, + "loss/reg": 12271.623046875, + "step": 1144 + }, + { + "epoch": 0.028625, + "grad_norm": 1.2503671646118164, + "grad_norm_var": 0.025048594107295645, + "learning_rate": 0.0001, + "loss": 12233.4121, + "loss/crossentropy": 2.5988683700561523, + "loss/hidden": 0.474609375, + "loss/logits": 0.09158191084861755, + "loss/reg": 12232.845703125, + "step": 1145 + }, + { + "epoch": 0.02865, + "grad_norm": 0.8764640688896179, + "grad_norm_var": 0.025346648753542058, + "learning_rate": 0.0001, + "loss": 12194.0625, + "loss/crossentropy": 2.9124915599823, + "loss/hidden": 0.44921875, + "loss/logits": 0.09028612077236176, + "loss/reg": 12193.5234375, + "step": 1146 + }, + { + "epoch": 0.028675, + "grad_norm": 0.9417552947998047, + "grad_norm_var": 0.025269205414616976, + "learning_rate": 0.0001, + "loss": 12155.0537, + "loss/crossentropy": 2.856606960296631, + "loss/hidden": 0.451171875, + "loss/logits": 0.11057109385728836, + "loss/reg": 12154.4921875, + "step": 1147 + }, + { + "epoch": 0.0287, + "grad_norm": 1.073207974433899, + "grad_norm_var": 0.025298827761974948, + "learning_rate": 0.0001, + "loss": 12116.6191, + "loss/crossentropy": 2.9602468013763428, + "loss/hidden": 0.4921875, + "loss/logits": 0.11190488934516907, + "loss/reg": 12116.0146484375, + "step": 1148 + }, + { + "epoch": 0.028725, + "grad_norm": 0.8884559273719788, + "grad_norm_var": 0.02044427749182618, + "learning_rate": 0.0001, + "loss": 12078.3057, + "loss/crossentropy": 2.954441547393799, + "loss/hidden": 0.45703125, + "loss/logits": 0.0968749150633812, + "loss/reg": 12077.751953125, + "step": 1149 + }, + { + "epoch": 0.02875, + "grad_norm": 1.098034381866455, + "grad_norm_var": 0.01915647141942376, + "learning_rate": 0.0001, + "loss": 12040.9893, + "loss/crossentropy": 2.5061094760894775, + "loss/hidden": 0.4609375, + "loss/logits": 0.11192715167999268, + "loss/reg": 12040.416015625, + "step": 1150 + }, + { + "epoch": 0.028775, + "grad_norm": 0.8926265835762024, + "grad_norm_var": 0.01909665417308942, + "learning_rate": 0.0001, + "loss": 12003.5557, + "loss/crossentropy": 2.9123997688293457, + "loss/hidden": 0.466796875, + "loss/logits": 0.09902822971343994, + "loss/reg": 12002.990234375, + "step": 1151 + }, + { + "epoch": 0.0288, + "grad_norm": 0.8516673445701599, + "grad_norm_var": 0.01868116766225493, + "learning_rate": 0.0001, + "loss": 11965.3525, + "loss/crossentropy": 2.524635076522827, + "loss/hidden": 0.4609375, + "loss/logits": 0.08091796934604645, + "loss/reg": 11964.810546875, + "step": 1152 + }, + { + "epoch": 0.028825, + "grad_norm": 0.9828234314918518, + "grad_norm_var": 0.017706164191503015, + "learning_rate": 0.0001, + "loss": 11927.0225, + "loss/crossentropy": 2.964435577392578, + "loss/hidden": 0.48046875, + "loss/logits": 0.09457586705684662, + "loss/reg": 11926.447265625, + "step": 1153 + }, + { + "epoch": 0.02885, + "grad_norm": 1.2419341802597046, + "grad_norm_var": 0.015825419211028195, + "learning_rate": 0.0001, + "loss": 11888.6523, + "loss/crossentropy": 3.0043070316314697, + "loss/hidden": 0.474609375, + "loss/logits": 0.09112387895584106, + "loss/reg": 11888.0869140625, + "step": 1154 + }, + { + "epoch": 0.028875, + "grad_norm": 1.0305931568145752, + "grad_norm_var": 0.015698597353836267, + "learning_rate": 0.0001, + "loss": 11850.6299, + "loss/crossentropy": 3.150991678237915, + "loss/hidden": 0.455078125, + "loss/logits": 0.0933547168970108, + "loss/reg": 11850.0810546875, + "step": 1155 + }, + { + "epoch": 0.0289, + "grad_norm": 1.0774022340774536, + "grad_norm_var": 0.016012585602837352, + "learning_rate": 0.0001, + "loss": 11812.8965, + "loss/crossentropy": 2.512453079223633, + "loss/hidden": 0.484375, + "loss/logits": 0.11678382754325867, + "loss/reg": 11812.294921875, + "step": 1156 + }, + { + "epoch": 0.028925, + "grad_norm": 1.0668998956680298, + "grad_norm_var": 0.014050459894159777, + "learning_rate": 0.0001, + "loss": 11775.5527, + "loss/crossentropy": 2.8800909519195557, + "loss/hidden": 0.435546875, + "loss/logits": 0.08428835868835449, + "loss/reg": 11775.033203125, + "step": 1157 + }, + { + "epoch": 0.02895, + "grad_norm": 1.095428705215454, + "grad_norm_var": 0.014157784643863858, + "learning_rate": 0.0001, + "loss": 11738.0811, + "loss/crossentropy": 2.7751150131225586, + "loss/hidden": 0.4609375, + "loss/logits": 0.099217489361763, + "loss/reg": 11737.5205078125, + "step": 1158 + }, + { + "epoch": 0.028975, + "grad_norm": 0.9712222218513489, + "grad_norm_var": 0.014319325743920726, + "learning_rate": 0.0001, + "loss": 11700.4678, + "loss/crossentropy": 2.9733681678771973, + "loss/hidden": 0.447265625, + "loss/logits": 0.08700287342071533, + "loss/reg": 11699.93359375, + "step": 1159 + }, + { + "epoch": 0.029, + "grad_norm": 1.1365258693695068, + "grad_norm_var": 0.015130617116427854, + "learning_rate": 0.0001, + "loss": 11663.209, + "loss/crossentropy": 2.8745062351226807, + "loss/hidden": 0.478515625, + "loss/logits": 0.11189639568328857, + "loss/reg": 11662.6181640625, + "step": 1160 + }, + { + "epoch": 0.029025, + "grad_norm": 1.149517297744751, + "grad_norm_var": 0.01279922687653959, + "learning_rate": 0.0001, + "loss": 11626.1377, + "loss/crossentropy": 3.0209760665893555, + "loss/hidden": 0.462890625, + "loss/logits": 0.10469117760658264, + "loss/reg": 11625.5703125, + "step": 1161 + }, + { + "epoch": 0.02905, + "grad_norm": 1.017334222793579, + "grad_norm_var": 0.011279464116419632, + "learning_rate": 0.0001, + "loss": 11589.6016, + "loss/crossentropy": 2.7759485244750977, + "loss/hidden": 0.453125, + "loss/logits": 0.09751708805561066, + "loss/reg": 11589.05078125, + "step": 1162 + }, + { + "epoch": 0.029075, + "grad_norm": 0.8124439120292664, + "grad_norm_var": 0.013884204178922666, + "learning_rate": 0.0001, + "loss": 11553.4912, + "loss/crossentropy": 2.4102635383605957, + "loss/hidden": 0.439453125, + "loss/logits": 0.08739569783210754, + "loss/reg": 11552.96484375, + "step": 1163 + }, + { + "epoch": 0.0291, + "grad_norm": 0.8549541234970093, + "grad_norm_var": 0.015433249300702733, + "learning_rate": 0.0001, + "loss": 11517.8145, + "loss/crossentropy": 2.915776014328003, + "loss/hidden": 0.43359375, + "loss/logits": 0.08613419532775879, + "loss/reg": 11517.294921875, + "step": 1164 + }, + { + "epoch": 0.029125, + "grad_norm": 0.9901089668273926, + "grad_norm_var": 0.014425045260586921, + "learning_rate": 0.0001, + "loss": 11482.0186, + "loss/crossentropy": 2.914726495742798, + "loss/hidden": 0.470703125, + "loss/logits": 0.0901867225766182, + "loss/reg": 11481.4580078125, + "step": 1165 + }, + { + "epoch": 0.02915, + "grad_norm": 1.008752703666687, + "grad_norm_var": 0.013956747237325576, + "learning_rate": 0.0001, + "loss": 11446.793, + "loss/crossentropy": 2.808004379272461, + "loss/hidden": 0.4609375, + "loss/logits": 0.08883035182952881, + "loss/reg": 11446.2431640625, + "step": 1166 + }, + { + "epoch": 0.029175, + "grad_norm": 1.0150833129882812, + "grad_norm_var": 0.012956904326744182, + "learning_rate": 0.0001, + "loss": 11411.3867, + "loss/crossentropy": 2.716193437576294, + "loss/hidden": 0.466796875, + "loss/logits": 0.09643800556659698, + "loss/reg": 11410.8232421875, + "step": 1167 + }, + { + "epoch": 0.0292, + "grad_norm": 1.141662359237671, + "grad_norm_var": 0.011746050791282636, + "learning_rate": 0.0001, + "loss": 11375.5488, + "loss/crossentropy": 2.971235990524292, + "loss/hidden": 0.482421875, + "loss/logits": 0.12297721207141876, + "loss/reg": 11374.943359375, + "step": 1168 + }, + { + "epoch": 0.029225, + "grad_norm": 1.0099797248840332, + "grad_norm_var": 0.011595822291827495, + "learning_rate": 0.0001, + "loss": 11339.9443, + "loss/crossentropy": 2.673962354660034, + "loss/hidden": 0.48046875, + "loss/logits": 0.09458102285861969, + "loss/reg": 11339.369140625, + "step": 1169 + }, + { + "epoch": 0.02925, + "grad_norm": 1.2093685865402222, + "grad_norm_var": 0.010779820227608105, + "learning_rate": 0.0001, + "loss": 11305.1221, + "loss/crossentropy": 2.8429079055786133, + "loss/hidden": 0.5, + "loss/logits": 0.08760304749011993, + "loss/reg": 11304.5341796875, + "step": 1170 + }, + { + "epoch": 0.029275, + "grad_norm": 1.0845022201538086, + "grad_norm_var": 0.01091752697247627, + "learning_rate": 0.0001, + "loss": 11270.4902, + "loss/crossentropy": 2.9612159729003906, + "loss/hidden": 0.482421875, + "loss/logits": 0.11011095345020294, + "loss/reg": 11269.8974609375, + "step": 1171 + }, + { + "epoch": 0.0293, + "grad_norm": 1.1352198123931885, + "grad_norm_var": 0.011414219105993812, + "learning_rate": 0.0001, + "loss": 11235.9561, + "loss/crossentropy": 2.685256004333496, + "loss/hidden": 0.482421875, + "loss/logits": 0.09084700793027878, + "loss/reg": 11235.3828125, + "step": 1172 + }, + { + "epoch": 0.029325, + "grad_norm": 0.9860196113586426, + "grad_norm_var": 0.011572749677654256, + "learning_rate": 0.0001, + "loss": 11201.4805, + "loss/crossentropy": 2.589221715927124, + "loss/hidden": 0.466796875, + "loss/logits": 0.08546493202447891, + "loss/reg": 11200.927734375, + "step": 1173 + }, + { + "epoch": 0.02935, + "grad_norm": 1.1775130033493042, + "grad_norm_var": 0.012615471899458204, + "learning_rate": 0.0001, + "loss": 11167.1963, + "loss/crossentropy": 3.0070106983184814, + "loss/hidden": 0.466796875, + "loss/logits": 0.10018382966518402, + "loss/reg": 11166.62890625, + "step": 1174 + }, + { + "epoch": 0.029375, + "grad_norm": 1.0073573589324951, + "grad_norm_var": 0.012347578378169391, + "learning_rate": 0.0001, + "loss": 11132.5576, + "loss/crossentropy": 2.6534533500671387, + "loss/hidden": 0.48046875, + "loss/logits": 0.08900558203458786, + "loss/reg": 11131.98828125, + "step": 1175 + }, + { + "epoch": 0.0294, + "grad_norm": 1.043900489807129, + "grad_norm_var": 0.01176606043919694, + "learning_rate": 0.0001, + "loss": 11098.457, + "loss/crossentropy": 2.8767192363739014, + "loss/hidden": 0.4765625, + "loss/logits": 0.10700362920761108, + "loss/reg": 11097.873046875, + "step": 1176 + }, + { + "epoch": 0.029425, + "grad_norm": 0.8921750783920288, + "grad_norm_var": 0.012155306817358028, + "learning_rate": 0.0001, + "loss": 11064.6582, + "loss/crossentropy": 2.9935896396636963, + "loss/hidden": 0.46484375, + "loss/logits": 0.09252595901489258, + "loss/reg": 11064.1005859375, + "step": 1177 + }, + { + "epoch": 0.02945, + "grad_norm": 1.0026613473892212, + "grad_norm_var": 0.012182093922875467, + "learning_rate": 0.0001, + "loss": 11031.0293, + "loss/crossentropy": 3.0917484760284424, + "loss/hidden": 0.48828125, + "loss/logits": 0.11424778401851654, + "loss/reg": 11030.4267578125, + "step": 1178 + }, + { + "epoch": 0.029475, + "grad_norm": 1.1867682933807373, + "grad_norm_var": 0.010419128530427695, + "learning_rate": 0.0001, + "loss": 10992.6582, + "loss/crossentropy": 3.0319387912750244, + "loss/hidden": 0.5, + "loss/logits": 0.12374146282672882, + "loss/reg": 10992.0341796875, + "step": 1179 + }, + { + "epoch": 0.0295, + "grad_norm": 1.1099034547805786, + "grad_norm_var": 0.007966003871787801, + "learning_rate": 0.0001, + "loss": 10958.374, + "loss/crossentropy": 2.9356632232666016, + "loss/hidden": 0.48046875, + "loss/logits": 0.09477466344833374, + "loss/reg": 10957.798828125, + "step": 1180 + }, + { + "epoch": 0.029525, + "grad_norm": 1.1255443096160889, + "grad_norm_var": 0.0078040821343260324, + "learning_rate": 0.0001, + "loss": 10924.3008, + "loss/crossentropy": 2.931514263153076, + "loss/hidden": 0.46484375, + "loss/logits": 0.09181191027164459, + "loss/reg": 10923.744140625, + "step": 1181 + }, + { + "epoch": 0.02955, + "grad_norm": 0.9785094857215881, + "grad_norm_var": 0.008112359456496524, + "learning_rate": 0.0001, + "loss": 10890.1377, + "loss/crossentropy": 2.845010280609131, + "loss/hidden": 0.43359375, + "loss/logits": 0.08981429040431976, + "loss/reg": 10889.6142578125, + "step": 1182 + }, + { + "epoch": 0.029575, + "grad_norm": 1.1371616125106812, + "grad_norm_var": 0.008163990240574733, + "learning_rate": 0.0001, + "loss": 10856.3428, + "loss/crossentropy": 3.007722854614258, + "loss/hidden": 0.46484375, + "loss/logits": 0.08930703997612, + "loss/reg": 10855.7890625, + "step": 1183 + }, + { + "epoch": 0.0296, + "grad_norm": 1.1515138149261475, + "grad_norm_var": 0.008255299846433894, + "learning_rate": 0.0001, + "loss": 10822.2334, + "loss/crossentropy": 2.758664131164551, + "loss/hidden": 0.47265625, + "loss/logits": 0.1009654551744461, + "loss/reg": 10821.66015625, + "step": 1184 + }, + { + "epoch": 0.029625, + "grad_norm": 0.8978103399276733, + "grad_norm_var": 0.010049723032669282, + "learning_rate": 0.0001, + "loss": 10788.4775, + "loss/crossentropy": 2.641123056411743, + "loss/hidden": 0.4296875, + "loss/logits": 0.07992805540561676, + "loss/reg": 10787.9677734375, + "step": 1185 + }, + { + "epoch": 0.02965, + "grad_norm": 1.1538819074630737, + "grad_norm_var": 0.009213807670944367, + "learning_rate": 0.0001, + "loss": 10755.3135, + "loss/crossentropy": 2.8010590076446533, + "loss/hidden": 0.462890625, + "loss/logits": 0.07945588231086731, + "loss/reg": 10754.771484375, + "step": 1186 + }, + { + "epoch": 0.029675, + "grad_norm": 0.839072585105896, + "grad_norm_var": 0.012402611272060786, + "learning_rate": 0.0001, + "loss": 10722.417, + "loss/crossentropy": 2.5748178958892822, + "loss/hidden": 0.470703125, + "loss/logits": 0.08163893967866898, + "loss/reg": 10721.8642578125, + "step": 1187 + }, + { + "epoch": 0.0297, + "grad_norm": 0.8767160177230835, + "grad_norm_var": 0.013695720424199923, + "learning_rate": 0.0001, + "loss": 10688.084, + "loss/crossentropy": 2.836094856262207, + "loss/hidden": 0.431640625, + "loss/logits": 0.08492501080036163, + "loss/reg": 10687.5673828125, + "step": 1188 + }, + { + "epoch": 0.029725, + "grad_norm": 1.5030205249786377, + "grad_norm_var": 0.026996912799878325, + "learning_rate": 0.0001, + "loss": 10654.8174, + "loss/crossentropy": 2.943218469619751, + "loss/hidden": 0.46484375, + "loss/logits": 0.09647072851657867, + "loss/reg": 10654.255859375, + "step": 1189 + }, + { + "epoch": 0.02975, + "grad_norm": 1.4945253133773804, + "grad_norm_var": 0.03791875498742254, + "learning_rate": 0.0001, + "loss": 10621.7188, + "loss/crossentropy": 2.9707281589508057, + "loss/hidden": 0.478515625, + "loss/logits": 0.11330445110797882, + "loss/reg": 10621.126953125, + "step": 1190 + }, + { + "epoch": 0.029775, + "grad_norm": 1.161048173904419, + "grad_norm_var": 0.03775209897049773, + "learning_rate": 0.0001, + "loss": 10588.3164, + "loss/crossentropy": 2.8618388175964355, + "loss/hidden": 0.486328125, + "loss/logits": 0.09845025092363358, + "loss/reg": 10587.7314453125, + "step": 1191 + }, + { + "epoch": 0.0298, + "grad_norm": 0.9410629868507385, + "grad_norm_var": 0.039143050056103114, + "learning_rate": 0.0001, + "loss": 10555.1143, + "loss/crossentropy": 2.684262990951538, + "loss/hidden": 0.439453125, + "loss/logits": 0.08633062243461609, + "loss/reg": 10554.5888671875, + "step": 1192 + }, + { + "epoch": 0.029825, + "grad_norm": 0.8235244750976562, + "grad_norm_var": 0.04125488764726738, + "learning_rate": 0.0001, + "loss": 10522.4512, + "loss/crossentropy": 2.9447689056396484, + "loss/hidden": 0.43359375, + "loss/logits": 0.09423172473907471, + "loss/reg": 10521.923828125, + "step": 1193 + }, + { + "epoch": 0.02985, + "grad_norm": 0.8602933287620544, + "grad_norm_var": 0.044111624406263754, + "learning_rate": 0.0001, + "loss": 10490.2285, + "loss/crossentropy": 2.9620957374572754, + "loss/hidden": 0.45703125, + "loss/logits": 0.08669282495975494, + "loss/reg": 10489.6845703125, + "step": 1194 + }, + { + "epoch": 0.029875, + "grad_norm": 0.8690145015716553, + "grad_norm_var": 0.04579364713540136, + "learning_rate": 0.0001, + "loss": 10457.7422, + "loss/crossentropy": 2.8131606578826904, + "loss/hidden": 0.4375, + "loss/logits": 0.09218825399875641, + "loss/reg": 10457.212890625, + "step": 1195 + }, + { + "epoch": 0.0299, + "grad_norm": 0.7750914692878723, + "grad_norm_var": 0.050467726933814974, + "learning_rate": 0.0001, + "loss": 10425.543, + "loss/crossentropy": 2.4100165367126465, + "loss/hidden": 0.443359375, + "loss/logits": 0.0891672819852829, + "loss/reg": 10425.0107421875, + "step": 1196 + }, + { + "epoch": 0.029925, + "grad_norm": 0.6748509407043457, + "grad_norm_var": 0.05782635591090936, + "learning_rate": 0.0001, + "loss": 10393.8721, + "loss/crossentropy": 2.8050386905670166, + "loss/hidden": 0.427734375, + "loss/logits": 0.08326968550682068, + "loss/reg": 10393.361328125, + "step": 1197 + }, + { + "epoch": 0.02995, + "grad_norm": 0.836169958114624, + "grad_norm_var": 0.05966311960294477, + "learning_rate": 0.0001, + "loss": 10362.6973, + "loss/crossentropy": 2.8260862827301025, + "loss/hidden": 0.43359375, + "loss/logits": 0.08343534171581268, + "loss/reg": 10362.1806640625, + "step": 1198 + }, + { + "epoch": 0.029975, + "grad_norm": 0.932468831539154, + "grad_norm_var": 0.058529408178915256, + "learning_rate": 0.0001, + "loss": 10329.6719, + "loss/crossentropy": 2.977475881576538, + "loss/hidden": 0.474609375, + "loss/logits": 0.08931249380111694, + "loss/reg": 10329.1083984375, + "step": 1199 + }, + { + "epoch": 0.03, + "grad_norm": 0.7377482652664185, + "grad_norm_var": 0.060146837549021136, + "learning_rate": 0.0001, + "loss": 10297.377, + "loss/crossentropy": 2.5743801593780518, + "loss/hidden": 0.4453125, + "loss/logits": 0.08330844342708588, + "loss/reg": 10296.8486328125, + "step": 1200 + }, + { + "epoch": 0.030025, + "grad_norm": 1.0377100706100464, + "grad_norm_var": 0.06019103863493148, + "learning_rate": 0.0001, + "loss": 10265.8555, + "loss/crossentropy": 3.201678514480591, + "loss/hidden": 0.46484375, + "loss/logits": 0.09200893342494965, + "loss/reg": 10265.298828125, + "step": 1201 + }, + { + "epoch": 0.03005, + "grad_norm": 1.1051257848739624, + "grad_norm_var": 0.05914268438559394, + "learning_rate": 0.0001, + "loss": 10234.3828, + "loss/crossentropy": 2.675462484359741, + "loss/hidden": 0.48046875, + "loss/logits": 0.095950648188591, + "loss/reg": 10233.806640625, + "step": 1202 + }, + { + "epoch": 0.030075, + "grad_norm": 0.9691970944404602, + "grad_norm_var": 0.057986367583310626, + "learning_rate": 0.0001, + "loss": 10202.7168, + "loss/crossentropy": 2.9942128658294678, + "loss/hidden": 0.4765625, + "loss/logits": 0.09277678281068802, + "loss/reg": 10202.1474609375, + "step": 1203 + }, + { + "epoch": 0.0301, + "grad_norm": 2.018515110015869, + "grad_norm_var": 0.12452834192716093, + "learning_rate": 0.0001, + "loss": 10171.2373, + "loss/crossentropy": 2.7582156658172607, + "loss/hidden": 0.5390625, + "loss/logits": 0.20079275965690613, + "loss/reg": 10170.4970703125, + "step": 1204 + }, + { + "epoch": 0.030125, + "grad_norm": 0.8988450765609741, + "grad_norm_var": 0.1105434640884032, + "learning_rate": 0.0001, + "loss": 10139.9619, + "loss/crossentropy": 3.1079561710357666, + "loss/hidden": 0.478515625, + "loss/logits": 0.09983308613300323, + "loss/reg": 10139.3837890625, + "step": 1205 + }, + { + "epoch": 0.03015, + "grad_norm": 0.8246884346008301, + "grad_norm_var": 0.09517385013607554, + "learning_rate": 0.0001, + "loss": 10108.6289, + "loss/crossentropy": 2.796762704849243, + "loss/hidden": 0.474609375, + "loss/logits": 0.0915188193321228, + "loss/reg": 10108.0625, + "step": 1206 + }, + { + "epoch": 0.030175, + "grad_norm": 0.8127716183662415, + "grad_norm_var": 0.09372460720769069, + "learning_rate": 0.0001, + "loss": 10077.668, + "loss/crossentropy": 2.611097574234009, + "loss/hidden": 0.43359375, + "loss/logits": 0.08079911768436432, + "loss/reg": 10077.1533203125, + "step": 1207 + }, + { + "epoch": 0.0302, + "grad_norm": 0.9361163377761841, + "grad_norm_var": 0.0937286127578139, + "learning_rate": 0.0001, + "loss": 10047.2109, + "loss/crossentropy": 3.0536181926727295, + "loss/hidden": 0.486328125, + "loss/logits": 0.09539620578289032, + "loss/reg": 10046.62890625, + "step": 1208 + }, + { + "epoch": 0.030225, + "grad_norm": 0.7071399688720703, + "grad_norm_var": 0.09645261491092669, + "learning_rate": 0.0001, + "loss": 10016.3311, + "loss/crossentropy": 2.8262569904327393, + "loss/hidden": 0.4375, + "loss/logits": 0.08987779915332794, + "loss/reg": 10015.8037109375, + "step": 1209 + }, + { + "epoch": 0.03025, + "grad_norm": 0.8685421347618103, + "grad_norm_var": 0.09637224490467025, + "learning_rate": 0.0001, + "loss": 9985.8564, + "loss/crossentropy": 2.7836618423461914, + "loss/hidden": 0.470703125, + "loss/logits": 0.08982132375240326, + "loss/reg": 9985.2958984375, + "step": 1210 + }, + { + "epoch": 0.030275, + "grad_norm": 0.916743814945221, + "grad_norm_var": 0.09607720069623867, + "learning_rate": 0.0001, + "loss": 9955.1826, + "loss/crossentropy": 2.9358649253845215, + "loss/hidden": 0.431640625, + "loss/logits": 0.08366554230451584, + "loss/reg": 9954.6669921875, + "step": 1211 + }, + { + "epoch": 0.0303, + "grad_norm": 1.0723576545715332, + "grad_norm_var": 0.09503487394484282, + "learning_rate": 0.0001, + "loss": 9925.3447, + "loss/crossentropy": 2.8664326667785645, + "loss/hidden": 0.474609375, + "loss/logits": 0.09154078364372253, + "loss/reg": 9924.7783203125, + "step": 1212 + }, + { + "epoch": 0.030325, + "grad_norm": 0.9004722237586975, + "grad_norm_var": 0.08965903996462453, + "learning_rate": 0.0001, + "loss": 9895.2139, + "loss/crossentropy": 2.6698296070098877, + "loss/hidden": 0.46875, + "loss/logits": 0.09159491211175919, + "loss/reg": 9894.6533203125, + "step": 1213 + }, + { + "epoch": 0.03035, + "grad_norm": 0.7564451098442078, + "grad_norm_var": 0.09151518684718109, + "learning_rate": 0.0001, + "loss": 9865.2354, + "loss/crossentropy": 2.6375725269317627, + "loss/hidden": 0.4296875, + "loss/logits": 0.08477196842432022, + "loss/reg": 9864.720703125, + "step": 1214 + }, + { + "epoch": 0.030375, + "grad_norm": 1.297078013420105, + "grad_norm_var": 0.09807566864007426, + "learning_rate": 0.0001, + "loss": 9835.8916, + "loss/crossentropy": 3.296292781829834, + "loss/hidden": 0.51171875, + "loss/logits": 0.12085559964179993, + "loss/reg": 9835.2587890625, + "step": 1215 + }, + { + "epoch": 0.0304, + "grad_norm": 0.9563354849815369, + "grad_norm_var": 0.09367456011200681, + "learning_rate": 0.0001, + "loss": 9805.2402, + "loss/crossentropy": 2.7964541912078857, + "loss/hidden": 0.47265625, + "loss/logits": 0.08087512105703354, + "loss/reg": 9804.6865234375, + "step": 1216 + }, + { + "epoch": 0.030425, + "grad_norm": 0.8623160123825073, + "grad_norm_var": 0.09482949817968937, + "learning_rate": 0.0001, + "loss": 9775.4941, + "loss/crossentropy": 2.7524194717407227, + "loss/hidden": 0.439453125, + "loss/logits": 0.08655818551778793, + "loss/reg": 9774.9677734375, + "step": 1217 + }, + { + "epoch": 0.03045, + "grad_norm": 0.9165598154067993, + "grad_norm_var": 0.09425582092502614, + "learning_rate": 0.0001, + "loss": 9746.5801, + "loss/crossentropy": 2.9415946006774902, + "loss/hidden": 0.453125, + "loss/logits": 0.09110292047262192, + "loss/reg": 9746.0361328125, + "step": 1218 + }, + { + "epoch": 0.030475, + "grad_norm": 0.921715497970581, + "grad_norm_var": 0.09447862133950806, + "learning_rate": 0.0001, + "loss": 9717.4014, + "loss/crossentropy": 2.724311590194702, + "loss/hidden": 0.447265625, + "loss/logits": 0.09495562314987183, + "loss/reg": 9716.859375, + "step": 1219 + }, + { + "epoch": 0.0305, + "grad_norm": 1.1677136421203613, + "grad_norm_var": 0.02181600391456436, + "learning_rate": 0.0001, + "loss": 9688.4238, + "loss/crossentropy": 2.642845630645752, + "loss/hidden": 0.4765625, + "loss/logits": 0.10420182347297668, + "loss/reg": 9687.8427734375, + "step": 1220 + }, + { + "epoch": 0.030525, + "grad_norm": 0.9965676069259644, + "grad_norm_var": 0.022059169625035166, + "learning_rate": 0.0001, + "loss": 9659.293, + "loss/crossentropy": 2.785884141921997, + "loss/hidden": 0.451171875, + "loss/logits": 0.07995257526636124, + "loss/reg": 9658.76171875, + "step": 1221 + }, + { + "epoch": 0.03055, + "grad_norm": 1.2723093032836914, + "grad_norm_var": 0.028171462793960166, + "learning_rate": 0.0001, + "loss": 9630.8027, + "loss/crossentropy": 2.9275527000427246, + "loss/hidden": 0.4453125, + "loss/logits": 0.08214107900857925, + "loss/reg": 9630.275390625, + "step": 1222 + }, + { + "epoch": 0.030575, + "grad_norm": 1.1707496643066406, + "grad_norm_var": 0.0291499271671719, + "learning_rate": 0.0001, + "loss": 9602.373, + "loss/crossentropy": 2.870009660720825, + "loss/hidden": 0.51171875, + "loss/logits": 0.09101098030805588, + "loss/reg": 9601.7705078125, + "step": 1223 + }, + { + "epoch": 0.0306, + "grad_norm": 1.0315990447998047, + "grad_norm_var": 0.029129891252388622, + "learning_rate": 0.0001, + "loss": 9572.2627, + "loss/crossentropy": 2.8040213584899902, + "loss/hidden": 0.466796875, + "loss/logits": 0.08623090386390686, + "loss/reg": 9571.7099609375, + "step": 1224 + }, + { + "epoch": 0.030625, + "grad_norm": 1.137269139289856, + "grad_norm_var": 0.02456178787075095, + "learning_rate": 0.0001, + "loss": 9543.2119, + "loss/crossentropy": 2.747441053390503, + "loss/hidden": 0.45703125, + "loss/logits": 0.09086060523986816, + "loss/reg": 9542.6640625, + "step": 1225 + }, + { + "epoch": 0.03065, + "grad_norm": 1.2014390230178833, + "grad_norm_var": 0.024974099129706436, + "learning_rate": 0.0001, + "loss": 9514.4326, + "loss/crossentropy": 3.3663926124572754, + "loss/hidden": 0.4765625, + "loss/logits": 0.09788084030151367, + "loss/reg": 9513.8583984375, + "step": 1226 + }, + { + "epoch": 0.030675, + "grad_norm": 1.1044578552246094, + "grad_norm_var": 0.024188962145927903, + "learning_rate": 0.0001, + "loss": 9485.6719, + "loss/crossentropy": 3.089447498321533, + "loss/hidden": 0.48828125, + "loss/logits": 0.10295175760984421, + "loss/reg": 9485.0810546875, + "step": 1227 + }, + { + "epoch": 0.0307, + "grad_norm": 0.9288240075111389, + "grad_norm_var": 0.025007301258384965, + "learning_rate": 0.0001, + "loss": 9456.8525, + "loss/crossentropy": 2.601111888885498, + "loss/hidden": 0.50390625, + "loss/logits": 0.09696957468986511, + "loss/reg": 9456.251953125, + "step": 1228 + }, + { + "epoch": 0.030725, + "grad_norm": 0.8527458310127258, + "grad_norm_var": 0.02603033391088611, + "learning_rate": 0.0001, + "loss": 9428.3262, + "loss/crossentropy": 2.9268016815185547, + "loss/hidden": 0.453125, + "loss/logits": 0.0778612419962883, + "loss/reg": 9427.794921875, + "step": 1229 + }, + { + "epoch": 0.03075, + "grad_norm": 1.004744291305542, + "grad_norm_var": 0.02063239364545984, + "learning_rate": 0.0001, + "loss": 9400.5186, + "loss/crossentropy": 3.0767099857330322, + "loss/hidden": 0.48046875, + "loss/logits": 0.09042792022228241, + "loss/reg": 9399.947265625, + "step": 1230 + }, + { + "epoch": 0.030775, + "grad_norm": 0.9882547855377197, + "grad_norm_var": 0.01647704902306407, + "learning_rate": 0.0001, + "loss": 9372.2334, + "loss/crossentropy": 2.9301977157592773, + "loss/hidden": 0.478515625, + "loss/logits": 0.09938155114650726, + "loss/reg": 9371.6552734375, + "step": 1231 + }, + { + "epoch": 0.0308, + "grad_norm": 1.0480315685272217, + "grad_norm_var": 0.01607625108212716, + "learning_rate": 0.0001, + "loss": 9344.4482, + "loss/crossentropy": 2.779447555541992, + "loss/hidden": 0.478515625, + "loss/logits": 0.09339262545108795, + "loss/reg": 9343.8759765625, + "step": 1232 + }, + { + "epoch": 0.030825, + "grad_norm": 0.8750410676002502, + "grad_norm_var": 0.015788579684884126, + "learning_rate": 0.0001, + "loss": 9317.4863, + "loss/crossentropy": 3.0507824420928955, + "loss/hidden": 0.4765625, + "loss/logits": 0.0992969423532486, + "loss/reg": 9316.91015625, + "step": 1233 + }, + { + "epoch": 0.03085, + "grad_norm": 0.9422901272773743, + "grad_norm_var": 0.015411182960378215, + "learning_rate": 0.0001, + "loss": 9289.8379, + "loss/crossentropy": 2.902963638305664, + "loss/hidden": 0.435546875, + "loss/logits": 0.08303774148225784, + "loss/reg": 9289.3193359375, + "step": 1234 + }, + { + "epoch": 0.030875, + "grad_norm": 0.9532501697540283, + "grad_norm_var": 0.01497500735035633, + "learning_rate": 0.0001, + "loss": 9260.5039, + "loss/crossentropy": 2.6641650199890137, + "loss/hidden": 0.443359375, + "loss/logits": 0.08857601881027222, + "loss/reg": 9259.9716796875, + "step": 1235 + }, + { + "epoch": 0.0309, + "grad_norm": 0.924162745475769, + "grad_norm_var": 0.014606637793103753, + "learning_rate": 0.0001, + "loss": 9233.3857, + "loss/crossentropy": 2.730973958969116, + "loss/hidden": 0.486328125, + "loss/logits": 0.10253769159317017, + "loss/reg": 9232.796875, + "step": 1236 + }, + { + "epoch": 0.030925, + "grad_norm": 0.9403824806213379, + "grad_norm_var": 0.015031792033177697, + "learning_rate": 0.0001, + "loss": 9205.9912, + "loss/crossentropy": 2.6175835132598877, + "loss/hidden": 0.453125, + "loss/logits": 0.09433528780937195, + "loss/reg": 9205.443359375, + "step": 1237 + }, + { + "epoch": 0.03095, + "grad_norm": 1.058070182800293, + "grad_norm_var": 0.01079234914789892, + "learning_rate": 0.0001, + "loss": 9179.0371, + "loss/crossentropy": 2.9107890129089355, + "loss/hidden": 0.474609375, + "loss/logits": 0.10070379078388214, + "loss/reg": 9178.4619140625, + "step": 1238 + }, + { + "epoch": 0.030975, + "grad_norm": 0.8924942016601562, + "grad_norm_var": 0.009670593148200565, + "learning_rate": 0.0001, + "loss": 9152.2129, + "loss/crossentropy": 2.9251017570495605, + "loss/hidden": 0.443359375, + "loss/logits": 0.08694691210985184, + "loss/reg": 9151.6826171875, + "step": 1239 + }, + { + "epoch": 0.031, + "grad_norm": 1.0834685564041138, + "grad_norm_var": 0.01010783131661602, + "learning_rate": 0.0001, + "loss": 9125.5234, + "loss/crossentropy": 3.0762522220611572, + "loss/hidden": 0.484375, + "loss/logits": 0.09588149189949036, + "loss/reg": 9124.943359375, + "step": 1240 + }, + { + "epoch": 0.031025, + "grad_norm": 1.2146703004837036, + "grad_norm_var": 0.011940877159930589, + "learning_rate": 0.0001, + "loss": 9098.4736, + "loss/crossentropy": 2.8192780017852783, + "loss/hidden": 0.466796875, + "loss/logits": 0.09233398735523224, + "loss/reg": 9097.9140625, + "step": 1241 + }, + { + "epoch": 0.03105, + "grad_norm": 1.1458591222763062, + "grad_norm_var": 0.010646862243281655, + "learning_rate": 0.0001, + "loss": 9071.7295, + "loss/crossentropy": 2.8693604469299316, + "loss/hidden": 0.48046875, + "loss/logits": 0.09497147798538208, + "loss/reg": 9071.154296875, + "step": 1242 + }, + { + "epoch": 0.031075, + "grad_norm": 0.8777756094932556, + "grad_norm_var": 0.010619544110101591, + "learning_rate": 0.0001, + "loss": 9044.9199, + "loss/crossentropy": 2.9424221515655518, + "loss/hidden": 0.439453125, + "loss/logits": 0.09632548689842224, + "loss/reg": 9044.3837890625, + "step": 1243 + }, + { + "epoch": 0.0311, + "grad_norm": 1.0108309984207153, + "grad_norm_var": 0.010446079850578134, + "learning_rate": 0.0001, + "loss": 9015.875, + "loss/crossentropy": 2.8137118816375732, + "loss/hidden": 0.439453125, + "loss/logits": 0.08419637382030487, + "loss/reg": 9015.3515625, + "step": 1244 + }, + { + "epoch": 0.031125, + "grad_norm": 1.1220258474349976, + "grad_norm_var": 0.01011275916858628, + "learning_rate": 0.0001, + "loss": 8988.9355, + "loss/crossentropy": 2.8730502128601074, + "loss/hidden": 0.49609375, + "loss/logits": 0.11299853026866913, + "loss/reg": 8988.326171875, + "step": 1245 + }, + { + "epoch": 0.03115, + "grad_norm": 0.9927026629447937, + "grad_norm_var": 0.010122367947183146, + "learning_rate": 0.0001, + "loss": 8962.1436, + "loss/crossentropy": 2.9736275672912598, + "loss/hidden": 0.466796875, + "loss/logits": 0.0912986621260643, + "loss/reg": 8961.5859375, + "step": 1246 + }, + { + "epoch": 0.031175, + "grad_norm": 1.0541266202926636, + "grad_norm_var": 0.010252357725601216, + "learning_rate": 0.0001, + "loss": 8935.4277, + "loss/crossentropy": 2.722059726715088, + "loss/hidden": 0.494140625, + "loss/logits": 0.1156279444694519, + "loss/reg": 8934.818359375, + "step": 1247 + }, + { + "epoch": 0.0312, + "grad_norm": 0.9716544151306152, + "grad_norm_var": 0.010213854612195365, + "learning_rate": 0.0001, + "loss": 8909.125, + "loss/crossentropy": 2.8865716457366943, + "loss/hidden": 0.4453125, + "loss/logits": 0.09578832238912582, + "loss/reg": 8908.583984375, + "step": 1248 + }, + { + "epoch": 0.031225, + "grad_norm": 0.9475975632667542, + "grad_norm_var": 0.009298449025630321, + "learning_rate": 0.0001, + "loss": 8883.3818, + "loss/crossentropy": 3.0513219833374023, + "loss/hidden": 0.4765625, + "loss/logits": 0.10030034184455872, + "loss/reg": 8882.8046875, + "step": 1249 + }, + { + "epoch": 0.03125, + "grad_norm": 0.8895021677017212, + "grad_norm_var": 0.009936580350242237, + "learning_rate": 0.0001, + "loss": 8857.7197, + "loss/crossentropy": 2.9448611736297607, + "loss/hidden": 0.484375, + "loss/logits": 0.11295387148857117, + "loss/reg": 8857.1220703125, + "step": 1250 + }, + { + "epoch": 0.031275, + "grad_norm": 0.937999427318573, + "grad_norm_var": 0.010056165424536268, + "learning_rate": 0.0001, + "loss": 8831.6973, + "loss/crossentropy": 3.092855930328369, + "loss/hidden": 0.474609375, + "loss/logits": 0.09617440402507782, + "loss/reg": 8831.126953125, + "step": 1251 + }, + { + "epoch": 0.0313, + "grad_norm": 0.9391719698905945, + "grad_norm_var": 0.009910557212322724, + "learning_rate": 0.0001, + "loss": 8805.9775, + "loss/crossentropy": 2.961047410964966, + "loss/hidden": 0.490234375, + "loss/logits": 0.11525814980268478, + "loss/reg": 8805.3720703125, + "step": 1252 + }, + { + "epoch": 0.031325, + "grad_norm": 1.1222453117370605, + "grad_norm_var": 0.010413345496494827, + "learning_rate": 0.0001, + "loss": 8780.6943, + "loss/crossentropy": 2.7763471603393555, + "loss/hidden": 0.486328125, + "loss/logits": 0.11867141723632812, + "loss/reg": 8780.0888671875, + "step": 1253 + }, + { + "epoch": 0.03135, + "grad_norm": 1.1006278991699219, + "grad_norm_var": 0.010763776670529154, + "learning_rate": 0.0001, + "loss": 8755.541, + "loss/crossentropy": 3.163494348526001, + "loss/hidden": 0.478515625, + "loss/logits": 0.08899696916341782, + "loss/reg": 8754.9736328125, + "step": 1254 + }, + { + "epoch": 0.031375, + "grad_norm": 0.8858732581138611, + "grad_norm_var": 0.010878126022268416, + "learning_rate": 0.0001, + "loss": 8730.6084, + "loss/crossentropy": 2.7622194290161133, + "loss/hidden": 0.455078125, + "loss/logits": 0.0905875563621521, + "loss/reg": 8730.0625, + "step": 1255 + }, + { + "epoch": 0.0314, + "grad_norm": 1.098836898803711, + "grad_norm_var": 0.011025998642754093, + "learning_rate": 0.0001, + "loss": 8705.5156, + "loss/crossentropy": 3.0155763626098633, + "loss/hidden": 0.484375, + "loss/logits": 0.11086156964302063, + "loss/reg": 8704.919921875, + "step": 1256 + }, + { + "epoch": 0.031425, + "grad_norm": 1.2236031293869019, + "grad_norm_var": 0.011263479458820077, + "learning_rate": 0.0001, + "loss": 8680.752, + "loss/crossentropy": 2.602417230606079, + "loss/hidden": 0.48828125, + "loss/logits": 0.09238439798355103, + "loss/reg": 8680.1708984375, + "step": 1257 + }, + { + "epoch": 0.03145, + "grad_norm": 0.8990491032600403, + "grad_norm_var": 0.010929796606747866, + "learning_rate": 0.0001, + "loss": 8656.1475, + "loss/crossentropy": 2.729997396469116, + "loss/hidden": 0.435546875, + "loss/logits": 0.09076844155788422, + "loss/reg": 8655.62109375, + "step": 1258 + }, + { + "epoch": 0.031475, + "grad_norm": 0.8509149551391602, + "grad_norm_var": 0.01142910663300564, + "learning_rate": 0.0001, + "loss": 8631.0742, + "loss/crossentropy": 2.835317611694336, + "loss/hidden": 0.462890625, + "loss/logits": 0.07381744682788849, + "loss/reg": 8630.537109375, + "step": 1259 + }, + { + "epoch": 0.0315, + "grad_norm": 1.0317010879516602, + "grad_norm_var": 0.011478335594675985, + "learning_rate": 0.0001, + "loss": 8605.9824, + "loss/crossentropy": 2.855336904525757, + "loss/hidden": 0.48046875, + "loss/logits": 0.10006128251552582, + "loss/reg": 8605.40234375, + "step": 1260 + }, + { + "epoch": 0.031525, + "grad_norm": 1.0000284910202026, + "grad_norm_var": 0.01049239253982955, + "learning_rate": 0.0001, + "loss": 8580.8525, + "loss/crossentropy": 2.6447980403900146, + "loss/hidden": 0.484375, + "loss/logits": 0.08910873532295227, + "loss/reg": 8580.279296875, + "step": 1261 + }, + { + "epoch": 0.03155, + "grad_norm": 0.962989330291748, + "grad_norm_var": 0.010563021713161888, + "learning_rate": 0.0001, + "loss": 8555.9385, + "loss/crossentropy": 2.884161949157715, + "loss/hidden": 0.46484375, + "loss/logits": 0.08687566220760345, + "loss/reg": 8555.38671875, + "step": 1262 + }, + { + "epoch": 0.031575, + "grad_norm": 0.9393940567970276, + "grad_norm_var": 0.010477345104453603, + "learning_rate": 0.0001, + "loss": 8531.585, + "loss/crossentropy": 2.914567708969116, + "loss/hidden": 0.482421875, + "loss/logits": 0.10453340411186218, + "loss/reg": 8530.998046875, + "step": 1263 + }, + { + "epoch": 0.0316, + "grad_norm": 0.8663077354431152, + "grad_norm_var": 0.011394579787916293, + "learning_rate": 0.0001, + "loss": 8506.8096, + "loss/crossentropy": 2.7986392974853516, + "loss/hidden": 0.46484375, + "loss/logits": 0.08872459828853607, + "loss/reg": 8506.255859375, + "step": 1264 + }, + { + "epoch": 0.031625, + "grad_norm": 1.0881963968276978, + "grad_norm_var": 0.012004087305830646, + "learning_rate": 0.0001, + "loss": 8482.6777, + "loss/crossentropy": 2.84379506111145, + "loss/hidden": 0.482421875, + "loss/logits": 0.11023124307394028, + "loss/reg": 8482.0849609375, + "step": 1265 + }, + { + "epoch": 0.03165, + "grad_norm": 0.8993434906005859, + "grad_norm_var": 0.01187856150995447, + "learning_rate": 0.0001, + "loss": 8458.5137, + "loss/crossentropy": 2.8666434288024902, + "loss/hidden": 0.470703125, + "loss/logits": 0.0925627276301384, + "loss/reg": 8457.9501953125, + "step": 1266 + }, + { + "epoch": 0.031675, + "grad_norm": 0.8524366021156311, + "grad_norm_var": 0.012933845547619175, + "learning_rate": 0.0001, + "loss": 8434.5479, + "loss/crossentropy": 3.014401912689209, + "loss/hidden": 0.42578125, + "loss/logits": 0.08417293429374695, + "loss/reg": 8434.0380859375, + "step": 1267 + }, + { + "epoch": 0.0317, + "grad_norm": 1.3166847229003906, + "grad_norm_var": 0.019532068339086807, + "learning_rate": 0.0001, + "loss": 8410.9355, + "loss/crossentropy": 3.2959179878234863, + "loss/hidden": 0.490234375, + "loss/logits": 0.11012903600931168, + "loss/reg": 8410.3349609375, + "step": 1268 + }, + { + "epoch": 0.031725, + "grad_norm": 0.8100549578666687, + "grad_norm_var": 0.02089461048390171, + "learning_rate": 0.0001, + "loss": 8386.4502, + "loss/crossentropy": 2.7742624282836914, + "loss/hidden": 0.45703125, + "loss/logits": 0.09910713881254196, + "loss/reg": 8385.89453125, + "step": 1269 + }, + { + "epoch": 0.03175, + "grad_norm": 0.8693830966949463, + "grad_norm_var": 0.02079889777364117, + "learning_rate": 0.0001, + "loss": 8362.4961, + "loss/crossentropy": 2.7391388416290283, + "loss/hidden": 0.43359375, + "loss/logits": 0.08556564152240753, + "loss/reg": 8361.9765625, + "step": 1270 + }, + { + "epoch": 0.031775, + "grad_norm": 1.057019829750061, + "grad_norm_var": 0.02060318325980237, + "learning_rate": 0.0001, + "loss": 8339.002, + "loss/crossentropy": 2.7165987491607666, + "loss/hidden": 0.48046875, + "loss/logits": 0.09790123999118805, + "loss/reg": 8338.423828125, + "step": 1271 + }, + { + "epoch": 0.0318, + "grad_norm": 0.9491914510726929, + "grad_norm_var": 0.019738849063740106, + "learning_rate": 0.0001, + "loss": 8315.4844, + "loss/crossentropy": 2.8127589225769043, + "loss/hidden": 0.48828125, + "loss/logits": 0.08863645792007446, + "loss/reg": 8314.9072265625, + "step": 1272 + }, + { + "epoch": 0.031825, + "grad_norm": 1.124371886253357, + "grad_norm_var": 0.017078527814280875, + "learning_rate": 0.0001, + "loss": 8292.4492, + "loss/crossentropy": 2.9964218139648438, + "loss/hidden": 0.486328125, + "loss/logits": 0.1079523116350174, + "loss/reg": 8291.8544921875, + "step": 1273 + }, + { + "epoch": 0.03185, + "grad_norm": 0.9936317205429077, + "grad_norm_var": 0.016745193518145487, + "learning_rate": 0.0001, + "loss": 8269.1445, + "loss/crossentropy": 2.620728015899658, + "loss/hidden": 0.484375, + "loss/logits": 0.11555637419223785, + "loss/reg": 8268.544921875, + "step": 1274 + }, + { + "epoch": 0.031875, + "grad_norm": 1.2509433031082153, + "grad_norm_var": 0.020089437443413738, + "learning_rate": 0.0001, + "loss": 8245.5898, + "loss/crossentropy": 2.6017544269561768, + "loss/hidden": 0.46875, + "loss/logits": 0.09242773801088333, + "loss/reg": 8245.0283203125, + "step": 1275 + }, + { + "epoch": 0.0319, + "grad_norm": 1.5470917224884033, + "grad_norm_var": 0.03881945897441915, + "learning_rate": 0.0001, + "loss": 8221.9727, + "loss/crossentropy": 2.775578260421753, + "loss/hidden": 0.5078125, + "loss/logits": 0.10969553142786026, + "loss/reg": 8221.35546875, + "step": 1276 + }, + { + "epoch": 0.031925, + "grad_norm": 0.9281179308891296, + "grad_norm_var": 0.03945822977643634, + "learning_rate": 0.0001, + "loss": 8198.8486, + "loss/crossentropy": 2.7830638885498047, + "loss/hidden": 0.482421875, + "loss/logits": 0.09233975410461426, + "loss/reg": 8198.2734375, + "step": 1277 + }, + { + "epoch": 0.03195, + "grad_norm": 0.7995396256446838, + "grad_norm_var": 0.042554514376413964, + "learning_rate": 0.0001, + "loss": 8175.2583, + "loss/crossentropy": 2.6517460346221924, + "loss/hidden": 0.43359375, + "loss/logits": 0.0899507999420166, + "loss/reg": 8174.73486328125, + "step": 1278 + }, + { + "epoch": 0.031975, + "grad_norm": 0.8659801483154297, + "grad_norm_var": 0.04366306924828033, + "learning_rate": 0.0001, + "loss": 8152.5845, + "loss/crossentropy": 2.704268455505371, + "loss/hidden": 0.4765625, + "loss/logits": 0.09852954745292664, + "loss/reg": 8152.00927734375, + "step": 1279 + }, + { + "epoch": 0.032, + "grad_norm": 0.8435584902763367, + "grad_norm_var": 0.044142318153788655, + "learning_rate": 0.0001, + "loss": 8129.3618, + "loss/crossentropy": 2.626718044281006, + "loss/hidden": 0.482421875, + "loss/logits": 0.08609582483768463, + "loss/reg": 8128.79345703125, + "step": 1280 + }, + { + "epoch": 0.032025, + "grad_norm": 1.0621079206466675, + "grad_norm_var": 0.04392058058480234, + "learning_rate": 0.0001, + "loss": 8106.6753, + "loss/crossentropy": 2.4879398345947266, + "loss/hidden": 0.51171875, + "loss/logits": 0.11241374909877777, + "loss/reg": 8106.05126953125, + "step": 1281 + }, + { + "epoch": 0.03205, + "grad_norm": 1.3785101175308228, + "grad_norm_var": 0.05116313883933944, + "learning_rate": 0.0001, + "loss": 8083.8066, + "loss/crossentropy": 2.842101573944092, + "loss/hidden": 0.5625, + "loss/logits": 0.12760193645954132, + "loss/reg": 8083.11669921875, + "step": 1282 + }, + { + "epoch": 0.032075, + "grad_norm": 0.8804663419723511, + "grad_norm_var": 0.05050924825759508, + "learning_rate": 0.0001, + "loss": 8060.4771, + "loss/crossentropy": 2.930004358291626, + "loss/hidden": 0.474609375, + "loss/logits": 0.10016739368438721, + "loss/reg": 8059.90234375, + "step": 1283 + }, + { + "epoch": 0.0321, + "grad_norm": 0.8530605435371399, + "grad_norm_var": 0.04698137364486722, + "learning_rate": 0.0001, + "loss": 8038.0005, + "loss/crossentropy": 2.982766628265381, + "loss/hidden": 0.462890625, + "loss/logits": 0.09573029726743698, + "loss/reg": 8037.44189453125, + "step": 1284 + }, + { + "epoch": 0.032125, + "grad_norm": 1.142184853553772, + "grad_norm_var": 0.04487463069720589, + "learning_rate": 0.0001, + "loss": 8015.4233, + "loss/crossentropy": 2.594120502471924, + "loss/hidden": 0.51171875, + "loss/logits": 0.0928168073296547, + "loss/reg": 8014.81884765625, + "step": 1285 + }, + { + "epoch": 0.03215, + "grad_norm": 0.9862678050994873, + "grad_norm_var": 0.043161885828666806, + "learning_rate": 0.0001, + "loss": 7992.7622, + "loss/crossentropy": 2.7711265087127686, + "loss/hidden": 0.470703125, + "loss/logits": 0.092730812728405, + "loss/reg": 7992.19873046875, + "step": 1286 + }, + { + "epoch": 0.032175, + "grad_norm": 2.5009279251098633, + "grad_norm_var": 0.1764777313681615, + "learning_rate": 0.0001, + "loss": 7970.334, + "loss/crossentropy": 2.942647695541382, + "loss/hidden": 0.5546875, + "loss/logits": 0.12212435156106949, + "loss/reg": 7969.6572265625, + "step": 1287 + }, + { + "epoch": 0.0322, + "grad_norm": 1.21425461769104, + "grad_norm_var": 0.17442147111949607, + "learning_rate": 0.0001, + "loss": 7946.9565, + "loss/crossentropy": 2.6966969966888428, + "loss/hidden": 0.494140625, + "loss/logits": 0.10218788683414459, + "loss/reg": 7946.3603515625, + "step": 1288 + }, + { + "epoch": 0.032225, + "grad_norm": 1.1258981227874756, + "grad_norm_var": 0.1744167700820533, + "learning_rate": 0.0001, + "loss": 7924.2075, + "loss/crossentropy": 2.7866528034210205, + "loss/hidden": 0.474609375, + "loss/logits": 0.09944634884595871, + "loss/reg": 7923.63330078125, + "step": 1289 + }, + { + "epoch": 0.03225, + "grad_norm": 1.285189151763916, + "grad_norm_var": 0.1737176325484464, + "learning_rate": 0.0001, + "loss": 7901.8428, + "loss/crossentropy": 2.994377851486206, + "loss/hidden": 0.490234375, + "loss/logits": 0.09650782495737076, + "loss/reg": 7901.255859375, + "step": 1290 + }, + { + "epoch": 0.032275, + "grad_norm": 1.1900367736816406, + "grad_norm_var": 0.1732637797998457, + "learning_rate": 0.0001, + "loss": 7879.4111, + "loss/crossentropy": 2.7183420658111572, + "loss/hidden": 0.486328125, + "loss/logits": 0.11161264032125473, + "loss/reg": 7878.81298828125, + "step": 1291 + }, + { + "epoch": 0.0323, + "grad_norm": 1.0578922033309937, + "grad_norm_var": 0.16314844204604045, + "learning_rate": 0.0001, + "loss": 7857.1812, + "loss/crossentropy": 2.464585781097412, + "loss/hidden": 0.4921875, + "loss/logits": 0.07893523573875427, + "loss/reg": 7856.60986328125, + "step": 1292 + }, + { + "epoch": 0.032325, + "grad_norm": 0.8201889395713806, + "grad_norm_var": 0.16681224735512307, + "learning_rate": 0.0001, + "loss": 7835.291, + "loss/crossentropy": 2.7440521717071533, + "loss/hidden": 0.478515625, + "loss/logits": 0.0926779955625534, + "loss/reg": 7834.7197265625, + "step": 1293 + }, + { + "epoch": 0.03235, + "grad_norm": 0.8110074400901794, + "grad_norm_var": 0.16632224475366872, + "learning_rate": 0.0001, + "loss": 7813.4004, + "loss/crossentropy": 2.948300361633301, + "loss/hidden": 0.482421875, + "loss/logits": 0.0882749855518341, + "loss/reg": 7812.82958984375, + "step": 1294 + }, + { + "epoch": 0.032375, + "grad_norm": 0.9979842901229858, + "grad_norm_var": 0.16283313552398174, + "learning_rate": 0.0001, + "loss": 7791.646, + "loss/crossentropy": 2.79728102684021, + "loss/hidden": 0.48046875, + "loss/logits": 0.09832129627466202, + "loss/reg": 7791.0673828125, + "step": 1295 + }, + { + "epoch": 0.0324, + "grad_norm": 0.9378174543380737, + "grad_norm_var": 0.15973385564981984, + "learning_rate": 0.0001, + "loss": 7769.6074, + "loss/crossentropy": 2.986835241317749, + "loss/hidden": 0.443359375, + "loss/logits": 0.08560170233249664, + "loss/reg": 7769.07861328125, + "step": 1296 + }, + { + "epoch": 0.032425, + "grad_norm": 1.0474334955215454, + "grad_norm_var": 0.15990018120436067, + "learning_rate": 0.0001, + "loss": 7748.0244, + "loss/crossentropy": 2.919323682785034, + "loss/hidden": 0.486328125, + "loss/logits": 0.11888301372528076, + "loss/reg": 7747.41943359375, + "step": 1297 + }, + { + "epoch": 0.03245, + "grad_norm": 1.028988242149353, + "grad_norm_var": 0.15638857063354974, + "learning_rate": 0.0001, + "loss": 7726.6182, + "loss/crossentropy": 2.753113031387329, + "loss/hidden": 0.48828125, + "loss/logits": 0.08861382305622101, + "loss/reg": 7726.04150390625, + "step": 1298 + }, + { + "epoch": 0.032475, + "grad_norm": 0.9262399077415466, + "grad_norm_var": 0.15507302500825274, + "learning_rate": 0.0001, + "loss": 7705.2729, + "loss/crossentropy": 2.8557252883911133, + "loss/hidden": 0.48828125, + "loss/logits": 0.08935841172933578, + "loss/reg": 7704.6953125, + "step": 1299 + }, + { + "epoch": 0.0325, + "grad_norm": 1.2030805349349976, + "grad_norm_var": 0.15025659494356405, + "learning_rate": 0.0001, + "loss": 7684.001, + "loss/crossentropy": 2.6697678565979004, + "loss/hidden": 0.5078125, + "loss/logits": 0.11960919201374054, + "loss/reg": 7683.37353515625, + "step": 1300 + }, + { + "epoch": 0.032525, + "grad_norm": 1.3810847997665405, + "grad_norm_var": 0.15382280474053756, + "learning_rate": 0.0001, + "loss": 7662.5615, + "loss/crossentropy": 2.75545072555542, + "loss/hidden": 0.5078125, + "loss/logits": 0.11684276163578033, + "loss/reg": 7661.93701171875, + "step": 1301 + }, + { + "epoch": 0.03255, + "grad_norm": 1.2311424016952515, + "grad_norm_var": 0.15199145426370816, + "learning_rate": 0.0001, + "loss": 7641.6509, + "loss/crossentropy": 3.2293875217437744, + "loss/hidden": 0.5, + "loss/logits": 0.133016437292099, + "loss/reg": 7641.01806640625, + "step": 1302 + }, + { + "epoch": 0.032575, + "grad_norm": 1.601858377456665, + "grad_norm_var": 0.04325904037096214, + "learning_rate": 0.0001, + "loss": 7620.6187, + "loss/crossentropy": 3.1844873428344727, + "loss/hidden": 0.52734375, + "loss/logits": 0.10423330962657928, + "loss/reg": 7619.9873046875, + "step": 1303 + }, + { + "epoch": 0.0326, + "grad_norm": 0.9981883764266968, + "grad_norm_var": 0.04335360478470348, + "learning_rate": 0.0001, + "loss": 7599.3457, + "loss/crossentropy": 2.9040017127990723, + "loss/hidden": 0.4921875, + "loss/logits": 0.10085226595401764, + "loss/reg": 7598.75244140625, + "step": 1304 + }, + { + "epoch": 0.032625, + "grad_norm": 1.127403974533081, + "grad_norm_var": 0.04335839381164678, + "learning_rate": 0.0001, + "loss": 7578.4678, + "loss/crossentropy": 3.017641544342041, + "loss/hidden": 0.50390625, + "loss/logits": 0.1118115782737732, + "loss/reg": 7577.85205078125, + "step": 1305 + }, + { + "epoch": 0.03265, + "grad_norm": 1.0478172302246094, + "grad_norm_var": 0.04110889794239984, + "learning_rate": 0.0001, + "loss": 7557.6021, + "loss/crossentropy": 3.1238770484924316, + "loss/hidden": 0.48828125, + "loss/logits": 0.09554004669189453, + "loss/reg": 7557.01806640625, + "step": 1306 + }, + { + "epoch": 0.032675, + "grad_norm": 1.0185546875, + "grad_norm_var": 0.040614017374103625, + "learning_rate": 0.0001, + "loss": 7536.9092, + "loss/crossentropy": 3.1412482261657715, + "loss/hidden": 0.4921875, + "loss/logits": 0.10232207924127579, + "loss/reg": 7536.314453125, + "step": 1307 + }, + { + "epoch": 0.0327, + "grad_norm": 1.301594614982605, + "grad_norm_var": 0.04369555513625844, + "learning_rate": 0.0001, + "loss": 7515.9883, + "loss/crossentropy": 3.0319020748138428, + "loss/hidden": 0.515625, + "loss/logits": 0.09871071577072144, + "loss/reg": 7515.3740234375, + "step": 1308 + }, + { + "epoch": 0.032725, + "grad_norm": 0.8953453302383423, + "grad_norm_var": 0.04131955548779486, + "learning_rate": 0.0001, + "loss": 7495.7622, + "loss/crossentropy": 2.6120543479919434, + "loss/hidden": 0.48828125, + "loss/logits": 0.09244601428508759, + "loss/reg": 7495.181640625, + "step": 1309 + }, + { + "epoch": 0.03275, + "grad_norm": 0.9810869097709656, + "grad_norm_var": 0.0366369472383818, + "learning_rate": 0.0001, + "loss": 7474.7329, + "loss/crossentropy": 3.0907046794891357, + "loss/hidden": 0.484375, + "loss/logits": 0.11530554294586182, + "loss/reg": 7474.13330078125, + "step": 1310 + }, + { + "epoch": 0.032775, + "grad_norm": 1.078172206878662, + "grad_norm_var": 0.03586416117602245, + "learning_rate": 0.0001, + "loss": 7453.2261, + "loss/crossentropy": 3.0059869289398193, + "loss/hidden": 0.51953125, + "loss/logits": 0.11619419604539871, + "loss/reg": 7452.59033203125, + "step": 1311 + }, + { + "epoch": 0.0328, + "grad_norm": 0.8973978161811829, + "grad_norm_var": 0.03690964090499163, + "learning_rate": 0.0001, + "loss": 7432.4702, + "loss/crossentropy": 2.6508216857910156, + "loss/hidden": 0.478515625, + "loss/logits": 0.08029112219810486, + "loss/reg": 7431.91162109375, + "step": 1312 + }, + { + "epoch": 0.032825, + "grad_norm": 1.2111337184906006, + "grad_norm_var": 0.03721152961874593, + "learning_rate": 0.0001, + "loss": 7412.1855, + "loss/crossentropy": 2.8881707191467285, + "loss/hidden": 0.48828125, + "loss/logits": 0.0881146490573883, + "loss/reg": 7411.609375, + "step": 1313 + }, + { + "epoch": 0.03285, + "grad_norm": 0.8852730393409729, + "grad_norm_var": 0.04025726348707579, + "learning_rate": 0.0001, + "loss": 7392.3027, + "loss/crossentropy": 3.0368635654449463, + "loss/hidden": 0.4609375, + "loss/logits": 0.11413509398698807, + "loss/reg": 7391.7275390625, + "step": 1314 + }, + { + "epoch": 0.032875, + "grad_norm": 0.9257658123970032, + "grad_norm_var": 0.04026899375604476, + "learning_rate": 0.0001, + "loss": 7372.4961, + "loss/crossentropy": 2.818535804748535, + "loss/hidden": 0.48046875, + "loss/logits": 0.09589619934558868, + "loss/reg": 7371.919921875, + "step": 1315 + }, + { + "epoch": 0.0329, + "grad_norm": 0.9861080646514893, + "grad_norm_var": 0.040563542561517354, + "learning_rate": 0.0001, + "loss": 7352.4141, + "loss/crossentropy": 2.960479974746704, + "loss/hidden": 0.486328125, + "loss/logits": 0.09554879367351532, + "loss/reg": 7351.83203125, + "step": 1316 + }, + { + "epoch": 0.032925, + "grad_norm": 1.205779790878296, + "grad_norm_var": 0.03586735238293806, + "learning_rate": 0.0001, + "loss": 7332.7109, + "loss/crossentropy": 2.8014562129974365, + "loss/hidden": 0.53125, + "loss/logits": 0.10308459401130676, + "loss/reg": 7332.07666015625, + "step": 1317 + }, + { + "epoch": 0.03295, + "grad_norm": 1.0991089344024658, + "grad_norm_var": 0.03442003994635622, + "learning_rate": 0.0001, + "loss": 7312.6997, + "loss/crossentropy": 2.7398338317871094, + "loss/hidden": 0.48828125, + "loss/logits": 0.09255098551511765, + "loss/reg": 7312.11865234375, + "step": 1318 + }, + { + "epoch": 0.032975, + "grad_norm": 1.1162376403808594, + "grad_norm_var": 0.01529067172337856, + "learning_rate": 0.0001, + "loss": 7292.959, + "loss/crossentropy": 3.011981964111328, + "loss/hidden": 0.486328125, + "loss/logits": 0.0965576171875, + "loss/reg": 7292.3759765625, + "step": 1319 + }, + { + "epoch": 0.033, + "grad_norm": 1.3372790813446045, + "grad_norm_var": 0.02020530026863927, + "learning_rate": 0.0001, + "loss": 7273.0537, + "loss/crossentropy": 2.8459720611572266, + "loss/hidden": 0.515625, + "loss/logits": 0.11112953722476959, + "loss/reg": 7272.4267578125, + "step": 1320 + }, + { + "epoch": 0.033025, + "grad_norm": 1.3002338409423828, + "grad_norm_var": 0.02340355833699679, + "learning_rate": 0.0001, + "loss": 7252.2334, + "loss/crossentropy": 2.893174171447754, + "loss/hidden": 0.48828125, + "loss/logits": 0.08477090299129486, + "loss/reg": 7251.66015625, + "step": 1321 + }, + { + "epoch": 0.03305, + "grad_norm": 1.2420011758804321, + "grad_norm_var": 0.02491587351695799, + "learning_rate": 0.0001, + "loss": 7232.2285, + "loss/crossentropy": 2.4793474674224854, + "loss/hidden": 0.515625, + "loss/logits": 0.11036118865013123, + "loss/reg": 7231.6025390625, + "step": 1322 + }, + { + "epoch": 0.033075, + "grad_norm": 0.8365013003349304, + "grad_norm_var": 0.02878389821853061, + "learning_rate": 0.0001, + "loss": 7212.4717, + "loss/crossentropy": 2.832367181777954, + "loss/hidden": 0.478515625, + "loss/logits": 0.08904410898685455, + "loss/reg": 7211.904296875, + "step": 1323 + }, + { + "epoch": 0.0331, + "grad_norm": 1.0674488544464111, + "grad_norm_var": 0.025329465113824397, + "learning_rate": 0.0001, + "loss": 7193.1782, + "loss/crossentropy": 2.6910011768341064, + "loss/hidden": 0.5, + "loss/logits": 0.09814359247684479, + "loss/reg": 7192.580078125, + "step": 1324 + }, + { + "epoch": 0.033125, + "grad_norm": 1.0702201128005981, + "grad_norm_var": 0.023248765074305178, + "learning_rate": 0.0001, + "loss": 7173.7251, + "loss/crossentropy": 2.985245943069458, + "loss/hidden": 0.48046875, + "loss/logits": 0.08581198006868362, + "loss/reg": 7173.15869140625, + "step": 1325 + }, + { + "epoch": 0.03315, + "grad_norm": 1.0932273864746094, + "grad_norm_var": 0.022593393543815843, + "learning_rate": 0.0001, + "loss": 7154.1875, + "loss/crossentropy": 3.2283401489257812, + "loss/hidden": 0.494140625, + "loss/logits": 0.11157424747943878, + "loss/reg": 7153.58154296875, + "step": 1326 + }, + { + "epoch": 0.033175, + "grad_norm": 1.1343928575515747, + "grad_norm_var": 0.022743559564361013, + "learning_rate": 0.0001, + "loss": 7135.0659, + "loss/crossentropy": 2.826775074005127, + "loss/hidden": 0.490234375, + "loss/logits": 0.115767702460289, + "loss/reg": 7134.4599609375, + "step": 1327 + }, + { + "epoch": 0.0332, + "grad_norm": 1.005599021911621, + "grad_norm_var": 0.020725394890757073, + "learning_rate": 0.0001, + "loss": 7115.8457, + "loss/crossentropy": 2.8775322437286377, + "loss/hidden": 0.482421875, + "loss/logits": 0.10473422706127167, + "loss/reg": 7115.2587890625, + "step": 1328 + }, + { + "epoch": 0.033225, + "grad_norm": 0.8706186413764954, + "grad_norm_var": 0.022689128924057892, + "learning_rate": 0.0001, + "loss": 7096.7861, + "loss/crossentropy": 2.806633710861206, + "loss/hidden": 0.49609375, + "loss/logits": 0.10028624534606934, + "loss/reg": 7096.18994140625, + "step": 1329 + }, + { + "epoch": 0.03325, + "grad_norm": 0.7180449962615967, + "grad_norm_var": 0.028633580337355472, + "learning_rate": 0.0001, + "loss": 7077.8433, + "loss/crossentropy": 2.5447871685028076, + "loss/hidden": 0.46875, + "loss/logits": 0.08894561231136322, + "loss/reg": 7077.28564453125, + "step": 1330 + }, + { + "epoch": 0.033275, + "grad_norm": 0.8574270009994507, + "grad_norm_var": 0.03017624676980996, + "learning_rate": 0.0001, + "loss": 7059.2168, + "loss/crossentropy": 2.8060553073883057, + "loss/hidden": 0.478515625, + "loss/logits": 0.09654555469751358, + "loss/reg": 7058.6416015625, + "step": 1331 + }, + { + "epoch": 0.0333, + "grad_norm": 0.9025835394859314, + "grad_norm_var": 0.0314214120190319, + "learning_rate": 0.0001, + "loss": 7040.6797, + "loss/crossentropy": 2.8328611850738525, + "loss/hidden": 0.48828125, + "loss/logits": 0.11992849409580231, + "loss/reg": 7040.0712890625, + "step": 1332 + }, + { + "epoch": 0.033325, + "grad_norm": 0.8659266233444214, + "grad_norm_var": 0.031741798051395625, + "learning_rate": 0.0001, + "loss": 7022.3887, + "loss/crossentropy": 2.7338974475860596, + "loss/hidden": 0.48046875, + "loss/logits": 0.09682288765907288, + "loss/reg": 7021.8115234375, + "step": 1333 + }, + { + "epoch": 0.03335, + "grad_norm": 1.019660472869873, + "grad_norm_var": 0.03142861984241999, + "learning_rate": 0.0001, + "loss": 7004.1777, + "loss/crossentropy": 3.012676477432251, + "loss/hidden": 0.48828125, + "loss/logits": 0.09259988367557526, + "loss/reg": 7003.5966796875, + "step": 1334 + }, + { + "epoch": 0.033375, + "grad_norm": 1.0380840301513672, + "grad_norm_var": 0.030883988419247627, + "learning_rate": 0.0001, + "loss": 6985.627, + "loss/crossentropy": 2.9568233489990234, + "loss/hidden": 0.4765625, + "loss/logits": 0.09427786618471146, + "loss/reg": 6985.05615234375, + "step": 1335 + }, + { + "epoch": 0.0334, + "grad_norm": 1.1338788270950317, + "grad_norm_var": 0.02493162497171597, + "learning_rate": 0.0001, + "loss": 6967.625, + "loss/crossentropy": 2.920952558517456, + "loss/hidden": 0.484375, + "loss/logits": 0.0969780683517456, + "loss/reg": 6967.04345703125, + "step": 1336 + }, + { + "epoch": 0.033425, + "grad_norm": 0.9520040154457092, + "grad_norm_var": 0.01902283493770819, + "learning_rate": 0.0001, + "loss": 6949.4731, + "loss/crossentropy": 2.965240240097046, + "loss/hidden": 0.48046875, + "loss/logits": 0.09136784076690674, + "loss/reg": 6948.9013671875, + "step": 1337 + }, + { + "epoch": 0.03345, + "grad_norm": 0.8833340406417847, + "grad_norm_var": 0.014914911312502947, + "learning_rate": 0.0001, + "loss": 6931.3071, + "loss/crossentropy": 2.8117728233337402, + "loss/hidden": 0.47265625, + "loss/logits": 0.07982734590768814, + "loss/reg": 6930.7548828125, + "step": 1338 + }, + { + "epoch": 0.033475, + "grad_norm": 1.3426650762557983, + "grad_norm_var": 0.022217578237794657, + "learning_rate": 0.0001, + "loss": 6913.1899, + "loss/crossentropy": 3.039302110671997, + "loss/hidden": 0.5390625, + "loss/logits": 0.13471244275569916, + "loss/reg": 6912.51611328125, + "step": 1339 + }, + { + "epoch": 0.0335, + "grad_norm": 1.1176189184188843, + "grad_norm_var": 0.022844846877402233, + "learning_rate": 0.0001, + "loss": 6894.9351, + "loss/crossentropy": 2.8302769660949707, + "loss/hidden": 0.482421875, + "loss/logits": 0.09411150962114334, + "loss/reg": 6894.3583984375, + "step": 1340 + }, + { + "epoch": 0.033525, + "grad_norm": 1.0748586654663086, + "grad_norm_var": 0.022889416618989467, + "learning_rate": 0.0001, + "loss": 6876.7759, + "loss/crossentropy": 2.611422538757324, + "loss/hidden": 0.486328125, + "loss/logits": 0.08938396722078323, + "loss/reg": 6876.2001953125, + "step": 1341 + }, + { + "epoch": 0.03355, + "grad_norm": 0.9157936573028564, + "grad_norm_var": 0.0226662033596554, + "learning_rate": 0.0001, + "loss": 6859.0195, + "loss/crossentropy": 2.811737298965454, + "loss/hidden": 0.482421875, + "loss/logits": 0.10107594728469849, + "loss/reg": 6858.43603515625, + "step": 1342 + }, + { + "epoch": 0.033575, + "grad_norm": 0.7973225116729736, + "grad_norm_var": 0.02325672885853861, + "learning_rate": 0.0001, + "loss": 6841.3901, + "loss/crossentropy": 2.892944097518921, + "loss/hidden": 0.486328125, + "loss/logits": 0.09526005387306213, + "loss/reg": 6840.80859375, + "step": 1343 + }, + { + "epoch": 0.0336, + "grad_norm": 0.9367047548294067, + "grad_norm_var": 0.02321225915422347, + "learning_rate": 0.0001, + "loss": 6823.8916, + "loss/crossentropy": 2.7473795413970947, + "loss/hidden": 0.5, + "loss/logits": 0.11404645442962646, + "loss/reg": 6823.27734375, + "step": 1344 + }, + { + "epoch": 0.033625, + "grad_norm": 0.8657424449920654, + "grad_norm_var": 0.02327456064868153, + "learning_rate": 0.0001, + "loss": 6806.124, + "loss/crossentropy": 2.865846872329712, + "loss/hidden": 0.48828125, + "loss/logits": 0.1009342223405838, + "loss/reg": 6805.53466796875, + "step": 1345 + }, + { + "epoch": 0.03365, + "grad_norm": 1.054735779762268, + "grad_norm_var": 0.019324760377264265, + "learning_rate": 0.0001, + "loss": 6788.7681, + "loss/crossentropy": 3.1837844848632812, + "loss/hidden": 0.51953125, + "loss/logits": 0.11502991616725922, + "loss/reg": 6788.13330078125, + "step": 1346 + }, + { + "epoch": 0.033675, + "grad_norm": 0.9969978928565979, + "grad_norm_var": 0.018170129173570717, + "learning_rate": 0.0001, + "loss": 6770.9668, + "loss/crossentropy": 2.617208480834961, + "loss/hidden": 0.484375, + "loss/logits": 0.10487374663352966, + "loss/reg": 6770.37744140625, + "step": 1347 + }, + { + "epoch": 0.0337, + "grad_norm": 0.8903811573982239, + "grad_norm_var": 0.01832754930692624, + "learning_rate": 0.0001, + "loss": 6753.5425, + "loss/crossentropy": 2.6439528465270996, + "loss/hidden": 0.4921875, + "loss/logits": 0.0947711169719696, + "loss/reg": 6752.95556640625, + "step": 1348 + }, + { + "epoch": 0.033725, + "grad_norm": 1.1429519653320312, + "grad_norm_var": 0.018435605742367623, + "learning_rate": 0.0001, + "loss": 6736.2373, + "loss/crossentropy": 3.1083426475524902, + "loss/hidden": 0.5078125, + "loss/logits": 0.10791288316249847, + "loss/reg": 6735.62158203125, + "step": 1349 + }, + { + "epoch": 0.03375, + "grad_norm": 1.1418884992599487, + "grad_norm_var": 0.019523988822685096, + "learning_rate": 0.0001, + "loss": 6718.9263, + "loss/crossentropy": 2.8117763996124268, + "loss/hidden": 0.50390625, + "loss/logits": 0.12460052967071533, + "loss/reg": 6718.2978515625, + "step": 1350 + }, + { + "epoch": 0.033775, + "grad_norm": 0.87659752368927, + "grad_norm_var": 0.020717329095012702, + "learning_rate": 0.0001, + "loss": 6701.9033, + "loss/crossentropy": 2.945575475692749, + "loss/hidden": 0.4765625, + "loss/logits": 0.08381037414073944, + "loss/reg": 6701.3427734375, + "step": 1351 + }, + { + "epoch": 0.0338, + "grad_norm": 1.689099669456482, + "grad_norm_var": 0.049323888443802116, + "learning_rate": 0.0001, + "loss": 6684.8394, + "loss/crossentropy": 2.7382237911224365, + "loss/hidden": 0.54296875, + "loss/logits": 0.1253061294555664, + "loss/reg": 6684.1708984375, + "step": 1352 + }, + { + "epoch": 0.033825, + "grad_norm": 1.2289764881134033, + "grad_norm_var": 0.05077952008831321, + "learning_rate": 0.0001, + "loss": 6667.2241, + "loss/crossentropy": 2.894232988357544, + "loss/hidden": 0.51171875, + "loss/logits": 0.1199125349521637, + "loss/reg": 6666.59228515625, + "step": 1353 + }, + { + "epoch": 0.03385, + "grad_norm": 0.9368696212768555, + "grad_norm_var": 0.04969952561096349, + "learning_rate": 0.0001, + "loss": 6650.0176, + "loss/crossentropy": 2.712751626968384, + "loss/hidden": 0.484375, + "loss/logits": 0.09183096885681152, + "loss/reg": 6649.44140625, + "step": 1354 + }, + { + "epoch": 0.033875, + "grad_norm": 1.061965823173523, + "grad_norm_var": 0.04415994410644473, + "learning_rate": 0.0001, + "loss": 6632.9712, + "loss/crossentropy": 3.0443739891052246, + "loss/hidden": 0.5, + "loss/logits": 0.11184091866016388, + "loss/reg": 6632.359375, + "step": 1355 + }, + { + "epoch": 0.0339, + "grad_norm": 1.4387856721878052, + "grad_norm_var": 0.05369363995281968, + "learning_rate": 0.0001, + "loss": 6615.7505, + "loss/crossentropy": 2.9656739234924316, + "loss/hidden": 0.53515625, + "loss/logits": 0.11242452263832092, + "loss/reg": 6615.10302734375, + "step": 1356 + }, + { + "epoch": 0.033925, + "grad_norm": 0.915932834148407, + "grad_norm_var": 0.05507613206998211, + "learning_rate": 0.0001, + "loss": 6598.4375, + "loss/crossentropy": 2.9624454975128174, + "loss/hidden": 0.494140625, + "loss/logits": 0.10640692710876465, + "loss/reg": 6597.8369140625, + "step": 1357 + }, + { + "epoch": 0.03395, + "grad_norm": 0.7371416091918945, + "grad_norm_var": 0.06040284854185741, + "learning_rate": 0.0001, + "loss": 6581.6763, + "loss/crossentropy": 2.574276924133301, + "loss/hidden": 0.48828125, + "loss/logits": 0.09294277429580688, + "loss/reg": 6581.09521484375, + "step": 1358 + }, + { + "epoch": 0.033975, + "grad_norm": 0.9200770854949951, + "grad_norm_var": 0.05729892266983065, + "learning_rate": 0.0001, + "loss": 6565.1777, + "loss/crossentropy": 2.9009039402008057, + "loss/hidden": 0.5078125, + "loss/logits": 0.09284225851297379, + "loss/reg": 6564.5771484375, + "step": 1359 + }, + { + "epoch": 0.034, + "grad_norm": 0.8759512305259705, + "grad_norm_var": 0.058464997552856585, + "learning_rate": 0.0001, + "loss": 6548.5493, + "loss/crossentropy": 2.875610589981079, + "loss/hidden": 0.48828125, + "loss/logits": 0.09936296939849854, + "loss/reg": 6547.9619140625, + "step": 1360 + }, + { + "epoch": 0.034025, + "grad_norm": 0.8347700238227844, + "grad_norm_var": 0.05927918748393553, + "learning_rate": 0.0001, + "loss": 6532.1455, + "loss/crossentropy": 2.864703893661499, + "loss/hidden": 0.4765625, + "loss/logits": 0.08602680265903473, + "loss/reg": 6531.5830078125, + "step": 1361 + }, + { + "epoch": 0.03405, + "grad_norm": 0.8607924580574036, + "grad_norm_var": 0.06141567570151428, + "learning_rate": 0.0001, + "loss": 6515.8506, + "loss/crossentropy": 2.605031728744507, + "loss/hidden": 0.49609375, + "loss/logits": 0.10415361821651459, + "loss/reg": 6515.25048828125, + "step": 1362 + }, + { + "epoch": 0.034075, + "grad_norm": 0.9597486853599548, + "grad_norm_var": 0.0616877756756257, + "learning_rate": 0.0001, + "loss": 6499.5688, + "loss/crossentropy": 2.887112855911255, + "loss/hidden": 0.490234375, + "loss/logits": 0.09598223119974136, + "loss/reg": 6498.982421875, + "step": 1363 + }, + { + "epoch": 0.0341, + "grad_norm": 2.3581037521362305, + "grad_norm_var": 0.16861243853628496, + "learning_rate": 0.0001, + "loss": 6483.1235, + "loss/crossentropy": 2.8899037837982178, + "loss/hidden": 0.51171875, + "loss/logits": 0.19754047691822052, + "loss/reg": 6482.4140625, + "step": 1364 + }, + { + "epoch": 0.034125, + "grad_norm": 1.3278796672821045, + "grad_norm_var": 0.17122382768664993, + "learning_rate": 0.0001, + "loss": 6465.9492, + "loss/crossentropy": 2.6608476638793945, + "loss/hidden": 0.5390625, + "loss/logits": 0.12474717944860458, + "loss/reg": 6465.28564453125, + "step": 1365 + }, + { + "epoch": 0.03415, + "grad_norm": 0.8733435273170471, + "grad_norm_var": 0.17549470413593488, + "learning_rate": 0.0001, + "loss": 6449.3564, + "loss/crossentropy": 2.6548588275909424, + "loss/hidden": 0.484375, + "loss/logits": 0.09747719019651413, + "loss/reg": 6448.7744140625, + "step": 1366 + }, + { + "epoch": 0.034175, + "grad_norm": 1.191423773765564, + "grad_norm_var": 0.17153503360068617, + "learning_rate": 0.0001, + "loss": 6433.3599, + "loss/crossentropy": 2.8686625957489014, + "loss/hidden": 0.54296875, + "loss/logits": 0.13953447341918945, + "loss/reg": 6432.67724609375, + "step": 1367 + }, + { + "epoch": 0.0342, + "grad_norm": 0.9046757817268372, + "grad_norm_var": 0.15237186112470422, + "learning_rate": 0.0001, + "loss": 6417.104, + "loss/crossentropy": 2.729344129562378, + "loss/hidden": 0.48828125, + "loss/logits": 0.11070854961872101, + "loss/reg": 6416.5048828125, + "step": 1368 + }, + { + "epoch": 0.034225, + "grad_norm": 0.7749925851821899, + "grad_norm_var": 0.1567894782132723, + "learning_rate": 0.0001, + "loss": 6400.7339, + "loss/crossentropy": 2.8414149284362793, + "loss/hidden": 0.4765625, + "loss/logits": 0.08838851749897003, + "loss/reg": 6400.1689453125, + "step": 1369 + }, + { + "epoch": 0.03425, + "grad_norm": 0.8586689233779907, + "grad_norm_var": 0.158463654422662, + "learning_rate": 0.0001, + "loss": 6384.9233, + "loss/crossentropy": 2.941861152648926, + "loss/hidden": 0.484375, + "loss/logits": 0.10005222260951996, + "loss/reg": 6384.3388671875, + "step": 1370 + }, + { + "epoch": 0.034275, + "grad_norm": 0.9387425780296326, + "grad_norm_var": 0.15931284187209346, + "learning_rate": 0.0001, + "loss": 6368.7803, + "loss/crossentropy": 2.8324739933013916, + "loss/hidden": 0.48828125, + "loss/logits": 0.09414509683847427, + "loss/reg": 6368.19775390625, + "step": 1371 + }, + { + "epoch": 0.0343, + "grad_norm": 1.0363467931747437, + "grad_norm_var": 0.14847634007850463, + "learning_rate": 0.0001, + "loss": 6353.0469, + "loss/crossentropy": 2.7621524333953857, + "loss/hidden": 0.5, + "loss/logits": 0.11550111323595047, + "loss/reg": 6352.43115234375, + "step": 1372 + }, + { + "epoch": 0.034325, + "grad_norm": 1.0415022373199463, + "grad_norm_var": 0.14766861966588107, + "learning_rate": 0.0001, + "loss": 6337.2188, + "loss/crossentropy": 2.695035457611084, + "loss/hidden": 0.53515625, + "loss/logits": 0.12410050630569458, + "loss/reg": 6336.5595703125, + "step": 1373 + }, + { + "epoch": 0.03435, + "grad_norm": 0.9406169652938843, + "grad_norm_var": 0.1422869851031559, + "learning_rate": 0.0001, + "loss": 6321.0205, + "loss/crossentropy": 2.7569470405578613, + "loss/hidden": 0.494140625, + "loss/logits": 0.09523436427116394, + "loss/reg": 6320.43115234375, + "step": 1374 + }, + { + "epoch": 0.034375, + "grad_norm": 0.8832371830940247, + "grad_norm_var": 0.1429785627751386, + "learning_rate": 0.0001, + "loss": 6305.4341, + "loss/crossentropy": 3.0882391929626465, + "loss/hidden": 0.486328125, + "loss/logits": 0.11143780499696732, + "loss/reg": 6304.83642578125, + "step": 1375 + }, + { + "epoch": 0.0344, + "grad_norm": 1.429364800453186, + "grad_norm_var": 0.14991940796402817, + "learning_rate": 0.0001, + "loss": 6290.0093, + "loss/crossentropy": 3.073742151260376, + "loss/hidden": 0.5078125, + "loss/logits": 0.1092057153582573, + "loss/reg": 6289.39208984375, + "step": 1376 + }, + { + "epoch": 0.034425, + "grad_norm": 1.6629365682601929, + "grad_norm_var": 0.16616085625258492, + "learning_rate": 0.0001, + "loss": 6274.0537, + "loss/crossentropy": 3.041329860687256, + "loss/hidden": 0.4921875, + "loss/logits": 0.10428580641746521, + "loss/reg": 6273.45703125, + "step": 1377 + }, + { + "epoch": 0.03445, + "grad_norm": 1.0548546314239502, + "grad_norm_var": 0.16160972528239753, + "learning_rate": 0.0001, + "loss": 6258.5503, + "loss/crossentropy": 2.8612847328186035, + "loss/hidden": 0.498046875, + "loss/logits": 0.09525103121995926, + "loss/reg": 6257.95703125, + "step": 1378 + }, + { + "epoch": 0.034475, + "grad_norm": 0.9255495667457581, + "grad_norm_var": 0.16250373381180752, + "learning_rate": 0.0001, + "loss": 6242.9575, + "loss/crossentropy": 2.9928011894226074, + "loss/hidden": 0.49609375, + "loss/logits": 0.11399185657501221, + "loss/reg": 6242.34765625, + "step": 1379 + }, + { + "epoch": 0.0345, + "grad_norm": 0.872288703918457, + "grad_norm_var": 0.058697168420043376, + "learning_rate": 0.0001, + "loss": 6227.2896, + "loss/crossentropy": 2.8623993396759033, + "loss/hidden": 0.486328125, + "loss/logits": 0.09496334940195084, + "loss/reg": 6226.70849609375, + "step": 1380 + }, + { + "epoch": 0.034525, + "grad_norm": 1.1161308288574219, + "grad_norm_var": 0.05350661484273282, + "learning_rate": 0.0001, + "loss": 6211.978, + "loss/crossentropy": 3.1282906532287598, + "loss/hidden": 0.48828125, + "loss/logits": 0.09676229953765869, + "loss/reg": 6211.39306640625, + "step": 1381 + }, + { + "epoch": 0.03455, + "grad_norm": 0.9747671484947205, + "grad_norm_var": 0.05201019226660599, + "learning_rate": 0.0001, + "loss": 6196.6685, + "loss/crossentropy": 2.552155017852783, + "loss/hidden": 0.53125, + "loss/logits": 0.12433020770549774, + "loss/reg": 6196.0126953125, + "step": 1382 + }, + { + "epoch": 0.034575, + "grad_norm": 1.1094785928726196, + "grad_norm_var": 0.05075227060493869, + "learning_rate": 0.0001, + "loss": 6181.2075, + "loss/crossentropy": 2.9492762088775635, + "loss/hidden": 0.48046875, + "loss/logits": 0.09517276287078857, + "loss/reg": 6180.6318359375, + "step": 1383 + }, + { + "epoch": 0.0346, + "grad_norm": 1.1116600036621094, + "grad_norm_var": 0.04989508043688596, + "learning_rate": 0.0001, + "loss": 6165.6851, + "loss/crossentropy": 3.004523754119873, + "loss/hidden": 0.4921875, + "loss/logits": 0.10090765357017517, + "loss/reg": 6165.091796875, + "step": 1384 + }, + { + "epoch": 0.034625, + "grad_norm": 0.8511428833007812, + "grad_norm_var": 0.04750895557643346, + "learning_rate": 0.0001, + "loss": 6150.2388, + "loss/crossentropy": 2.8703434467315674, + "loss/hidden": 0.486328125, + "loss/logits": 0.09198765456676483, + "loss/reg": 6149.66064453125, + "step": 1385 + }, + { + "epoch": 0.03465, + "grad_norm": 1.1000410318374634, + "grad_norm_var": 0.0449779782934615, + "learning_rate": 0.0001, + "loss": 6134.9727, + "loss/crossentropy": 2.912747383117676, + "loss/hidden": 0.49609375, + "loss/logits": 0.09684903919696808, + "loss/reg": 6134.3798828125, + "step": 1386 + }, + { + "epoch": 0.034675, + "grad_norm": 1.5239450931549072, + "grad_norm_var": 0.05648812952174939, + "learning_rate": 0.0001, + "loss": 6119.5884, + "loss/crossentropy": 2.942798376083374, + "loss/hidden": 0.515625, + "loss/logits": 0.1038217693567276, + "loss/reg": 6118.96875, + "step": 1387 + }, + { + "epoch": 0.0347, + "grad_norm": 1.1341832876205444, + "grad_norm_var": 0.056228422031379906, + "learning_rate": 0.0001, + "loss": 6104.0986, + "loss/crossentropy": 3.102213144302368, + "loss/hidden": 0.52734375, + "loss/logits": 0.11665617674589157, + "loss/reg": 6103.45458984375, + "step": 1388 + }, + { + "epoch": 0.034725, + "grad_norm": 1.142909288406372, + "grad_norm_var": 0.05596889532093123, + "learning_rate": 0.0001, + "loss": 6088.6807, + "loss/crossentropy": 2.795085906982422, + "loss/hidden": 0.494140625, + "loss/logits": 0.09639351069927216, + "loss/reg": 6088.09033203125, + "step": 1389 + }, + { + "epoch": 0.03475, + "grad_norm": 1.1043593883514404, + "grad_norm_var": 0.05384683860217643, + "learning_rate": 0.0001, + "loss": 6073.5322, + "loss/crossentropy": 2.617596387863159, + "loss/hidden": 0.48828125, + "loss/logits": 0.09738233685493469, + "loss/reg": 6072.94677734375, + "step": 1390 + }, + { + "epoch": 0.034775, + "grad_norm": 1.5261905193328857, + "grad_norm_var": 0.05897490555796399, + "learning_rate": 0.0001, + "loss": 6058.2266, + "loss/crossentropy": 2.8050098419189453, + "loss/hidden": 0.52734375, + "loss/logits": 0.09992828965187073, + "loss/reg": 6057.59912109375, + "step": 1391 + }, + { + "epoch": 0.0348, + "grad_norm": 1.1945099830627441, + "grad_norm_var": 0.054143504860030826, + "learning_rate": 0.0001, + "loss": 6042.606, + "loss/crossentropy": 2.8568167686462402, + "loss/hidden": 0.5, + "loss/logits": 0.09500862658023834, + "loss/reg": 6042.0107421875, + "step": 1392 + }, + { + "epoch": 0.034825, + "grad_norm": 1.2398542165756226, + "grad_norm_var": 0.03641311092145567, + "learning_rate": 0.0001, + "loss": 6027.4819, + "loss/crossentropy": 3.028388261795044, + "loss/hidden": 0.490234375, + "loss/logits": 0.10643279552459717, + "loss/reg": 6026.88525390625, + "step": 1393 + }, + { + "epoch": 0.03485, + "grad_norm": 0.9621533751487732, + "grad_norm_var": 0.03780320526095952, + "learning_rate": 0.0001, + "loss": 6012.3086, + "loss/crossentropy": 2.7710087299346924, + "loss/hidden": 0.48828125, + "loss/logits": 0.0932699665427208, + "loss/reg": 6011.72705078125, + "step": 1394 + }, + { + "epoch": 0.034875, + "grad_norm": 0.8675263524055481, + "grad_norm_var": 0.03950306541272881, + "learning_rate": 0.0001, + "loss": 5997.2275, + "loss/crossentropy": 2.6906023025512695, + "loss/hidden": 0.48828125, + "loss/logits": 0.08551843464374542, + "loss/reg": 5996.65380859375, + "step": 1395 + }, + { + "epoch": 0.0349, + "grad_norm": 0.8454170823097229, + "grad_norm_var": 0.040415817948201746, + "learning_rate": 0.0001, + "loss": 5982.5356, + "loss/crossentropy": 3.0521061420440674, + "loss/hidden": 0.494140625, + "loss/logits": 0.09865646809339523, + "loss/reg": 5981.94287109375, + "step": 1396 + }, + { + "epoch": 0.034925, + "grad_norm": 0.9366073608398438, + "grad_norm_var": 0.04234958763299872, + "learning_rate": 0.0001, + "loss": 5967.7051, + "loss/crossentropy": 3.2539052963256836, + "loss/hidden": 0.48828125, + "loss/logits": 0.09871365129947662, + "loss/reg": 5967.1181640625, + "step": 1397 + }, + { + "epoch": 0.03495, + "grad_norm": 1.0396111011505127, + "grad_norm_var": 0.041516266332779835, + "learning_rate": 0.0001, + "loss": 5953.0903, + "loss/crossentropy": 2.9550793170928955, + "loss/hidden": 0.4921875, + "loss/logits": 0.10799872875213623, + "loss/reg": 5952.490234375, + "step": 1398 + }, + { + "epoch": 0.034975, + "grad_norm": 1.357835292816162, + "grad_norm_var": 0.0454997901936771, + "learning_rate": 0.0001, + "loss": 5938.5894, + "loss/crossentropy": 2.8965654373168945, + "loss/hidden": 0.494140625, + "loss/logits": 0.09642442315816879, + "loss/reg": 5937.9990234375, + "step": 1399 + }, + { + "epoch": 0.035, + "grad_norm": 0.9682173728942871, + "grad_norm_var": 0.04696673725534451, + "learning_rate": 0.0001, + "loss": 5924.3062, + "loss/crossentropy": 2.8100740909576416, + "loss/hidden": 0.515625, + "loss/logits": 0.10012298077344894, + "loss/reg": 5923.6904296875, + "step": 1400 + }, + { + "epoch": 0.035025, + "grad_norm": 0.8568408489227295, + "grad_norm_var": 0.0467704669000735, + "learning_rate": 0.0001, + "loss": 5910.2954, + "loss/crossentropy": 2.8342316150665283, + "loss/hidden": 0.482421875, + "loss/logits": 0.09393075108528137, + "loss/reg": 5909.71923828125, + "step": 1401 + }, + { + "epoch": 0.03505, + "grad_norm": 0.9594826698303223, + "grad_norm_var": 0.048238988806995294, + "learning_rate": 0.0001, + "loss": 5896.4443, + "loss/crossentropy": 2.7887749671936035, + "loss/hidden": 0.4921875, + "loss/logits": 0.09894032776355743, + "loss/reg": 5895.85302734375, + "step": 1402 + }, + { + "epoch": 0.035075, + "grad_norm": 1.1903307437896729, + "grad_norm_var": 0.03650307365173426, + "learning_rate": 0.0001, + "loss": 5882.7915, + "loss/crossentropy": 2.9541616439819336, + "loss/hidden": 0.52734375, + "loss/logits": 0.10406450927257538, + "loss/reg": 5882.16015625, + "step": 1403 + }, + { + "epoch": 0.0351, + "grad_norm": 0.9279814958572388, + "grad_norm_var": 0.037749923854517545, + "learning_rate": 0.0001, + "loss": 5868.8516, + "loss/crossentropy": 3.004645824432373, + "loss/hidden": 0.48828125, + "loss/logits": 0.09477942436933517, + "loss/reg": 5868.2685546875, + "step": 1404 + }, + { + "epoch": 0.035125, + "grad_norm": 0.9412584900856018, + "grad_norm_var": 0.038330778079378014, + "learning_rate": 0.0001, + "loss": 5855.0098, + "loss/crossentropy": 2.9767656326293945, + "loss/hidden": 0.490234375, + "loss/logits": 0.10276122391223907, + "loss/reg": 5854.4169921875, + "step": 1405 + }, + { + "epoch": 0.03515, + "grad_norm": 0.8473858833312988, + "grad_norm_var": 0.0408485346006211, + "learning_rate": 0.0001, + "loss": 5841.4292, + "loss/crossentropy": 2.7796714305877686, + "loss/hidden": 0.4921875, + "loss/logits": 0.10826782882213593, + "loss/reg": 5840.82861328125, + "step": 1406 + }, + { + "epoch": 0.035175, + "grad_norm": 1.0374072790145874, + "grad_norm_var": 0.024181143338040556, + "learning_rate": 0.0001, + "loss": 5827.9946, + "loss/crossentropy": 3.059513807296753, + "loss/hidden": 0.49609375, + "loss/logits": 0.09310650080442429, + "loss/reg": 5827.4052734375, + "step": 1407 + }, + { + "epoch": 0.0352, + "grad_norm": 1.9238723516464233, + "grad_norm_var": 0.07529703372726804, + "learning_rate": 0.0001, + "loss": 5814.2061, + "loss/crossentropy": 2.676701068878174, + "loss/hidden": 0.5234375, + "loss/logits": 0.09868060797452927, + "loss/reg": 5813.583984375, + "step": 1408 + }, + { + "epoch": 0.035225, + "grad_norm": 0.9219335317611694, + "grad_norm_var": 0.07383597514061453, + "learning_rate": 0.0001, + "loss": 5800.8066, + "loss/crossentropy": 3.179941177368164, + "loss/hidden": 0.494140625, + "loss/logits": 0.10882144421339035, + "loss/reg": 5800.20361328125, + "step": 1409 + }, + { + "epoch": 0.03525, + "grad_norm": 1.7151775360107422, + "grad_norm_var": 0.10181254197773316, + "learning_rate": 0.0001, + "loss": 5787.3242, + "loss/crossentropy": 2.928461790084839, + "loss/hidden": 0.5234375, + "loss/logits": 0.10902885347604752, + "loss/reg": 5786.69189453125, + "step": 1410 + }, + { + "epoch": 0.035275, + "grad_norm": 0.9305047988891602, + "grad_norm_var": 0.10024641212645134, + "learning_rate": 0.0001, + "loss": 5773.6714, + "loss/crossentropy": 2.8883421421051025, + "loss/hidden": 0.4765625, + "loss/logits": 0.08832409977912903, + "loss/reg": 5773.1064453125, + "step": 1411 + }, + { + "epoch": 0.0353, + "grad_norm": 0.9236626029014587, + "grad_norm_var": 0.09810356131340967, + "learning_rate": 0.0001, + "loss": 5760.1519, + "loss/crossentropy": 2.5950002670288086, + "loss/hidden": 0.494140625, + "loss/logits": 0.09704364836215973, + "loss/reg": 5759.560546875, + "step": 1412 + }, + { + "epoch": 0.035325, + "grad_norm": 1.4456361532211304, + "grad_norm_var": 0.10372546345283613, + "learning_rate": 0.0001, + "loss": 5746.7002, + "loss/crossentropy": 2.9748454093933105, + "loss/hidden": 0.52734375, + "loss/logits": 0.1023864597082138, + "loss/reg": 5746.0703125, + "step": 1413 + }, + { + "epoch": 0.03535, + "grad_norm": 1.0488073825836182, + "grad_norm_var": 0.10362703347444431, + "learning_rate": 0.0001, + "loss": 5733.1187, + "loss/crossentropy": 2.902679204940796, + "loss/hidden": 0.53125, + "loss/logits": 0.10343801975250244, + "loss/reg": 5732.48388671875, + "step": 1414 + }, + { + "epoch": 0.035375, + "grad_norm": 0.9133599400520325, + "grad_norm_var": 0.1021622460378283, + "learning_rate": 0.0001, + "loss": 5719.3115, + "loss/crossentropy": 2.8861844539642334, + "loss/hidden": 0.484375, + "loss/logits": 0.09216949343681335, + "loss/reg": 5718.73486328125, + "step": 1415 + }, + { + "epoch": 0.0354, + "grad_norm": 0.8939177393913269, + "grad_norm_var": 0.10378298627301626, + "learning_rate": 0.0001, + "loss": 5706.0918, + "loss/crossentropy": 2.7581043243408203, + "loss/hidden": 0.50390625, + "loss/logits": 0.11326596885919571, + "loss/reg": 5705.474609375, + "step": 1416 + }, + { + "epoch": 0.035425, + "grad_norm": 0.9192634224891663, + "grad_norm_var": 0.1020663984992288, + "learning_rate": 0.0001, + "loss": 5692.8921, + "loss/crossentropy": 2.9199843406677246, + "loss/hidden": 0.49609375, + "loss/logits": 0.09658630192279816, + "loss/reg": 5692.29931640625, + "step": 1417 + }, + { + "epoch": 0.03545, + "grad_norm": 1.1331703662872314, + "grad_norm_var": 0.10078458101228187, + "learning_rate": 0.0001, + "loss": 5679.9668, + "loss/crossentropy": 3.0797760486602783, + "loss/hidden": 0.498046875, + "loss/logits": 0.1090155616402626, + "loss/reg": 5679.35986328125, + "step": 1418 + }, + { + "epoch": 0.035475, + "grad_norm": 1.0098109245300293, + "grad_norm_var": 0.10081809220174791, + "learning_rate": 0.0001, + "loss": 5666.6006, + "loss/crossentropy": 2.6272778511047363, + "loss/hidden": 0.48828125, + "loss/logits": 0.09299825131893158, + "loss/reg": 5666.01953125, + "step": 1419 + }, + { + "epoch": 0.0355, + "grad_norm": 1.2260712385177612, + "grad_norm_var": 0.09970081981556332, + "learning_rate": 0.0001, + "loss": 5653.4111, + "loss/crossentropy": 2.8777990341186523, + "loss/hidden": 0.5078125, + "loss/logits": 0.09825815260410309, + "loss/reg": 5652.80517578125, + "step": 1420 + }, + { + "epoch": 0.035525, + "grad_norm": 1.435912847518921, + "grad_norm_var": 0.10357068173817315, + "learning_rate": 0.0001, + "loss": 5640.1152, + "loss/crossentropy": 2.592386245727539, + "loss/hidden": 0.5234375, + "loss/logits": 0.0932084172964096, + "loss/reg": 5639.49853515625, + "step": 1421 + }, + { + "epoch": 0.03555, + "grad_norm": 1.0495048761367798, + "grad_norm_var": 0.09809354771595694, + "learning_rate": 0.0001, + "loss": 5627.1245, + "loss/crossentropy": 2.9958133697509766, + "loss/hidden": 0.4921875, + "loss/logits": 0.09580527245998383, + "loss/reg": 5626.53662109375, + "step": 1422 + }, + { + "epoch": 0.035575, + "grad_norm": 1.34793221950531, + "grad_norm_var": 0.09912718309816147, + "learning_rate": 0.0001, + "loss": 5614.3306, + "loss/crossentropy": 3.0196943283081055, + "loss/hidden": 0.5234375, + "loss/logits": 0.10388782620429993, + "loss/reg": 5613.703125, + "step": 1423 + }, + { + "epoch": 0.0356, + "grad_norm": 1.0757725238800049, + "grad_norm_var": 0.05967166391269097, + "learning_rate": 0.0001, + "loss": 5601.147, + "loss/crossentropy": 2.849278688430786, + "loss/hidden": 0.50390625, + "loss/logits": 0.08855399489402771, + "loss/reg": 5600.5546875, + "step": 1424 + }, + { + "epoch": 0.035625, + "grad_norm": 0.9359875917434692, + "grad_norm_var": 0.059304607452516166, + "learning_rate": 0.0001, + "loss": 5587.9556, + "loss/crossentropy": 2.7247815132141113, + "loss/hidden": 0.51953125, + "loss/logits": 0.09469491243362427, + "loss/reg": 5587.34130859375, + "step": 1425 + }, + { + "epoch": 0.03565, + "grad_norm": 0.9561020135879517, + "grad_norm_var": 0.03561334173187921, + "learning_rate": 0.0001, + "loss": 5575.0317, + "loss/crossentropy": 2.8671956062316895, + "loss/hidden": 0.50390625, + "loss/logits": 0.13245022296905518, + "loss/reg": 5574.3955078125, + "step": 1426 + }, + { + "epoch": 0.035675, + "grad_norm": 0.8375259637832642, + "grad_norm_var": 0.03798018083497675, + "learning_rate": 0.0001, + "loss": 5562.2354, + "loss/crossentropy": 2.699129104614258, + "loss/hidden": 0.48828125, + "loss/logits": 0.0898795947432518, + "loss/reg": 5561.6572265625, + "step": 1427 + }, + { + "epoch": 0.0357, + "grad_norm": 0.9537240266799927, + "grad_norm_var": 0.03744198728680351, + "learning_rate": 0.0001, + "loss": 5550.0005, + "loss/crossentropy": 2.908249855041504, + "loss/hidden": 0.484375, + "loss/logits": 0.08669602870941162, + "loss/reg": 5549.42919921875, + "step": 1428 + }, + { + "epoch": 0.035725, + "grad_norm": 0.958095908164978, + "grad_norm_var": 0.02813351390256747, + "learning_rate": 0.0001, + "loss": 5537.4263, + "loss/crossentropy": 2.706650495529175, + "loss/hidden": 0.48828125, + "loss/logits": 0.09869828075170517, + "loss/reg": 5536.83935546875, + "step": 1429 + }, + { + "epoch": 0.03575, + "grad_norm": 1.3420166969299316, + "grad_norm_var": 0.03371677877437282, + "learning_rate": 0.0001, + "loss": 5524.8125, + "loss/crossentropy": 3.1115193367004395, + "loss/hidden": 0.515625, + "loss/logits": 0.1058741956949234, + "loss/reg": 5524.19091796875, + "step": 1430 + }, + { + "epoch": 0.035775, + "grad_norm": 0.8543722033500671, + "grad_norm_var": 0.0351014260765756, + "learning_rate": 0.0001, + "loss": 5512.2627, + "loss/crossentropy": 2.608214855194092, + "loss/hidden": 0.484375, + "loss/logits": 0.10194234549999237, + "loss/reg": 5511.67626953125, + "step": 1431 + }, + { + "epoch": 0.0358, + "grad_norm": 0.8250775933265686, + "grad_norm_var": 0.03690434825584472, + "learning_rate": 0.0001, + "loss": 5499.5815, + "loss/crossentropy": 2.989871025085449, + "loss/hidden": 0.48828125, + "loss/logits": 0.09901078790426254, + "loss/reg": 5498.994140625, + "step": 1432 + }, + { + "epoch": 0.035825, + "grad_norm": 0.7822527885437012, + "grad_norm_var": 0.040534793707076015, + "learning_rate": 0.0001, + "loss": 5487.1685, + "loss/crossentropy": 2.9314780235290527, + "loss/hidden": 0.482421875, + "loss/logits": 0.09051553905010223, + "loss/reg": 5486.595703125, + "step": 1433 + }, + { + "epoch": 0.03585, + "grad_norm": 0.8068564534187317, + "grad_norm_var": 0.043362734138599854, + "learning_rate": 0.0001, + "loss": 5474.8149, + "loss/crossentropy": 2.6955857276916504, + "loss/hidden": 0.51953125, + "loss/logits": 0.10059453547000885, + "loss/reg": 5474.19482421875, + "step": 1434 + }, + { + "epoch": 0.035875, + "grad_norm": 1.4666972160339355, + "grad_norm_var": 0.05549537286934751, + "learning_rate": 0.0001, + "loss": 5462.3555, + "loss/crossentropy": 2.588399648666382, + "loss/hidden": 0.55859375, + "loss/logits": 0.09282772243022919, + "loss/reg": 5461.7041015625, + "step": 1435 + }, + { + "epoch": 0.0359, + "grad_norm": 1.117689847946167, + "grad_norm_var": 0.0537338345703509, + "learning_rate": 0.0001, + "loss": 5450.0771, + "loss/crossentropy": 2.9459269046783447, + "loss/hidden": 0.51171875, + "loss/logits": 0.0902022123336792, + "loss/reg": 5449.47509765625, + "step": 1436 + }, + { + "epoch": 0.035925, + "grad_norm": 1.105311632156372, + "grad_norm_var": 0.04340371561122809, + "learning_rate": 0.0001, + "loss": 5437.9082, + "loss/crossentropy": 2.984010934829712, + "loss/hidden": 0.50390625, + "loss/logits": 0.11168159544467926, + "loss/reg": 5437.29248046875, + "step": 1437 + }, + { + "epoch": 0.03595, + "grad_norm": 1.0142920017242432, + "grad_norm_var": 0.04337053865574192, + "learning_rate": 0.0001, + "loss": 5425.6343, + "loss/crossentropy": 3.048353910446167, + "loss/hidden": 0.4921875, + "loss/logits": 0.1168377697467804, + "loss/reg": 5425.025390625, + "step": 1438 + }, + { + "epoch": 0.035975, + "grad_norm": 1.074475646018982, + "grad_norm_var": 0.036223558479937745, + "learning_rate": 0.0001, + "loss": 5413.6694, + "loss/crossentropy": 2.865267276763916, + "loss/hidden": 0.51171875, + "loss/logits": 0.11835680902004242, + "loss/reg": 5413.03955078125, + "step": 1439 + }, + { + "epoch": 0.036, + "grad_norm": 1.2115192413330078, + "grad_norm_var": 0.038626513677341955, + "learning_rate": 0.0001, + "loss": 5401.4404, + "loss/crossentropy": 2.965080499649048, + "loss/hidden": 0.51953125, + "loss/logits": 0.12016730010509491, + "loss/reg": 5400.80078125, + "step": 1440 + }, + { + "epoch": 0.036025, + "grad_norm": 0.9538179636001587, + "grad_norm_var": 0.0384582443619114, + "learning_rate": 0.0001, + "loss": 5389.1182, + "loss/crossentropy": 3.0050806999206543, + "loss/hidden": 0.490234375, + "loss/logits": 0.09753800183534622, + "loss/reg": 5388.5302734375, + "step": 1441 + }, + { + "epoch": 0.03605, + "grad_norm": 0.846208930015564, + "grad_norm_var": 0.040094179752653135, + "learning_rate": 0.0001, + "loss": 5376.7876, + "loss/crossentropy": 2.678614854812622, + "loss/hidden": 0.490234375, + "loss/logits": 0.09301547706127167, + "loss/reg": 5376.20458984375, + "step": 1442 + }, + { + "epoch": 0.036075, + "grad_norm": 0.838445782661438, + "grad_norm_var": 0.04007315713849911, + "learning_rate": 0.0001, + "loss": 5364.6421, + "loss/crossentropy": 2.5030295848846436, + "loss/hidden": 0.49609375, + "loss/logits": 0.0893300473690033, + "loss/reg": 5364.056640625, + "step": 1443 + }, + { + "epoch": 0.0361, + "grad_norm": 1.0943560600280762, + "grad_norm_var": 0.04026473399911965, + "learning_rate": 0.0001, + "loss": 5352.6411, + "loss/crossentropy": 2.7688729763031006, + "loss/hidden": 0.5234375, + "loss/logits": 0.09878958761692047, + "loss/reg": 5352.01904296875, + "step": 1444 + }, + { + "epoch": 0.036125, + "grad_norm": 1.0070030689239502, + "grad_norm_var": 0.040022175785339426, + "learning_rate": 0.0001, + "loss": 5340.7573, + "loss/crossentropy": 2.664525270462036, + "loss/hidden": 0.5234375, + "loss/logits": 0.10403424501419067, + "loss/reg": 5340.1298828125, + "step": 1445 + }, + { + "epoch": 0.03615, + "grad_norm": 0.9361730217933655, + "grad_norm_var": 0.03296033870427563, + "learning_rate": 0.0001, + "loss": 5328.8047, + "loss/crossentropy": 3.0316553115844727, + "loss/hidden": 0.51953125, + "loss/logits": 0.10810248553752899, + "loss/reg": 5328.17724609375, + "step": 1446 + }, + { + "epoch": 0.036175, + "grad_norm": 0.8911219239234924, + "grad_norm_var": 0.03235122087723141, + "learning_rate": 0.0001, + "loss": 5317.0811, + "loss/crossentropy": 2.6316604614257812, + "loss/hidden": 0.5, + "loss/logits": 0.10301603376865387, + "loss/reg": 5316.47802734375, + "step": 1447 + }, + { + "epoch": 0.0362, + "grad_norm": 0.9054492115974426, + "grad_norm_var": 0.0308996622053454, + "learning_rate": 0.0001, + "loss": 5305.5972, + "loss/crossentropy": 3.067214012145996, + "loss/hidden": 0.5, + "loss/logits": 0.10464608669281006, + "loss/reg": 5304.99267578125, + "step": 1448 + }, + { + "epoch": 0.036225, + "grad_norm": 0.7438737750053406, + "grad_norm_var": 0.03212250353753062, + "learning_rate": 0.0001, + "loss": 5294.0088, + "loss/crossentropy": 2.698651075363159, + "loss/hidden": 0.48828125, + "loss/logits": 0.09136485308408737, + "loss/reg": 5293.42919921875, + "step": 1449 + }, + { + "epoch": 0.03625, + "grad_norm": 0.8691712617874146, + "grad_norm_var": 0.030753536796646348, + "learning_rate": 0.0001, + "loss": 5282.9487, + "loss/crossentropy": 2.8457107543945312, + "loss/hidden": 0.486328125, + "loss/logits": 0.10344676673412323, + "loss/reg": 5282.35888671875, + "step": 1450 + }, + { + "epoch": 0.036275, + "grad_norm": 1.5145204067230225, + "grad_norm_var": 0.03384220665456894, + "learning_rate": 0.0001, + "loss": 5271.6914, + "loss/crossentropy": 2.5761830806732178, + "loss/hidden": 0.5234375, + "loss/logits": 0.091462142765522, + "loss/reg": 5271.07666015625, + "step": 1451 + }, + { + "epoch": 0.0363, + "grad_norm": 1.5182558298110962, + "grad_norm_var": 0.04974418429261451, + "learning_rate": 0.0001, + "loss": 5260.1943, + "loss/crossentropy": 2.8467907905578613, + "loss/hidden": 0.53125, + "loss/logits": 0.10642577707767487, + "loss/reg": 5259.556640625, + "step": 1452 + }, + { + "epoch": 0.036325, + "grad_norm": 1.5889036655426025, + "grad_norm_var": 0.06903922677874869, + "learning_rate": 0.0001, + "loss": 5248.917, + "loss/crossentropy": 2.746821641921997, + "loss/hidden": 0.5234375, + "loss/logits": 0.09524772316217422, + "loss/reg": 5248.29833984375, + "step": 1453 + }, + { + "epoch": 0.03635, + "grad_norm": 1.0864880084991455, + "grad_norm_var": 0.06889637151009295, + "learning_rate": 0.0001, + "loss": 5237.8438, + "loss/crossentropy": 2.9466421604156494, + "loss/hidden": 0.515625, + "loss/logits": 0.10183542966842651, + "loss/reg": 5237.22607421875, + "step": 1454 + }, + { + "epoch": 0.036375, + "grad_norm": 0.9537948966026306, + "grad_norm_var": 0.0696941507939409, + "learning_rate": 0.0001, + "loss": 5226.4126, + "loss/crossentropy": 2.9261972904205322, + "loss/hidden": 0.48828125, + "loss/logits": 0.09991321712732315, + "loss/reg": 5225.82421875, + "step": 1455 + }, + { + "epoch": 0.0364, + "grad_norm": 0.8657480478286743, + "grad_norm_var": 0.07017845849461607, + "learning_rate": 0.0001, + "loss": 5214.8638, + "loss/crossentropy": 3.091569423675537, + "loss/hidden": 0.5078125, + "loss/logits": 0.10947701334953308, + "loss/reg": 5214.24658203125, + "step": 1456 + }, + { + "epoch": 0.036425, + "grad_norm": 0.8765159249305725, + "grad_norm_var": 0.07142302758301801, + "learning_rate": 0.0001, + "loss": 5203.5029, + "loss/crossentropy": 3.070120334625244, + "loss/hidden": 0.490234375, + "loss/logits": 0.09942461550235748, + "loss/reg": 5202.9130859375, + "step": 1457 + }, + { + "epoch": 0.03645, + "grad_norm": 0.8967143893241882, + "grad_norm_var": 0.07032121058778491, + "learning_rate": 0.0001, + "loss": 5192.2266, + "loss/crossentropy": 2.6533727645874023, + "loss/hidden": 0.48828125, + "loss/logits": 0.0900174081325531, + "loss/reg": 5191.6484375, + "step": 1458 + }, + { + "epoch": 0.036475, + "grad_norm": 1.0593829154968262, + "grad_norm_var": 0.0675330313695351, + "learning_rate": 0.0001, + "loss": 5180.8418, + "loss/crossentropy": 2.7807159423828125, + "loss/hidden": 0.5, + "loss/logits": 0.09789751470088959, + "loss/reg": 5180.244140625, + "step": 1459 + }, + { + "epoch": 0.0365, + "grad_norm": 3.7894692420959473, + "grad_norm_var": 0.5372816782840233, + "learning_rate": 0.0001, + "loss": 5169.7563, + "loss/crossentropy": 2.607316017150879, + "loss/hidden": 0.54296875, + "loss/logits": 0.11729206144809723, + "loss/reg": 5169.09619140625, + "step": 1460 + }, + { + "epoch": 0.036525, + "grad_norm": 1.3040951490402222, + "grad_norm_var": 0.5344039803077333, + "learning_rate": 0.0001, + "loss": 5157.8638, + "loss/crossentropy": 2.8131933212280273, + "loss/hidden": 0.50390625, + "loss/logits": 0.09452105313539505, + "loss/reg": 5157.26513671875, + "step": 1461 + }, + { + "epoch": 0.03655, + "grad_norm": 8.957572937011719, + "grad_norm_var": 4.233578837622493, + "learning_rate": 0.0001, + "loss": 5146.8818, + "loss/crossentropy": 2.7740039825439453, + "loss/hidden": 0.5859375, + "loss/logits": 0.1216060072183609, + "loss/reg": 5146.17431640625, + "step": 1462 + }, + { + "epoch": 0.036575, + "grad_norm": 1.2484403848648071, + "grad_norm_var": 4.201172321158906, + "learning_rate": 0.0001, + "loss": 5135.6914, + "loss/crossentropy": 2.657205104827881, + "loss/hidden": 0.494140625, + "loss/logits": 0.09523904323577881, + "loss/reg": 5135.10205078125, + "step": 1463 + }, + { + "epoch": 0.0366, + "grad_norm": 0.8138161897659302, + "grad_norm_var": 4.212151832844503, + "learning_rate": 0.0001, + "loss": 5124.5767, + "loss/crossentropy": 2.71421480178833, + "loss/hidden": 0.48828125, + "loss/logits": 0.08790698647499084, + "loss/reg": 5124.00048828125, + "step": 1464 + }, + { + "epoch": 0.036625, + "grad_norm": 0.9401508569717407, + "grad_norm_var": 4.188087110269854, + "learning_rate": 0.0001, + "loss": 5113.332, + "loss/crossentropy": 2.528451681137085, + "loss/hidden": 0.49609375, + "loss/logits": 0.10147030651569366, + "loss/reg": 5112.734375, + "step": 1465 + }, + { + "epoch": 0.03665, + "grad_norm": 0.9211971163749695, + "grad_norm_var": 4.182023450957908, + "learning_rate": 0.0001, + "loss": 5102.2676, + "loss/crossentropy": 2.9942121505737305, + "loss/hidden": 0.51953125, + "loss/logits": 0.0982511043548584, + "loss/reg": 5101.64990234375, + "step": 1466 + }, + { + "epoch": 0.036675, + "grad_norm": 1.070372462272644, + "grad_norm_var": 4.209537857366257, + "learning_rate": 0.0001, + "loss": 5091.1826, + "loss/crossentropy": 3.0105254650115967, + "loss/hidden": 0.51953125, + "loss/logits": 0.10811451077461243, + "loss/reg": 5090.55517578125, + "step": 1467 + }, + { + "epoch": 0.0367, + "grad_norm": 1.1308951377868652, + "grad_norm_var": 4.230532903487014, + "learning_rate": 0.0001, + "loss": 5080.3057, + "loss/crossentropy": 2.9224534034729004, + "loss/hidden": 0.53125, + "loss/logits": 0.1031089499592781, + "loss/reg": 5079.67138671875, + "step": 1468 + }, + { + "epoch": 0.036725, + "grad_norm": 1.8754812479019165, + "grad_norm_var": 4.230695854346935, + "learning_rate": 0.0001, + "loss": 5069.5083, + "loss/crossentropy": 2.867027521133423, + "loss/hidden": 0.5234375, + "loss/logits": 0.10570839047431946, + "loss/reg": 5068.87939453125, + "step": 1469 + }, + { + "epoch": 0.03675, + "grad_norm": 1.141131043434143, + "grad_norm_var": 4.22614386003211, + "learning_rate": 0.0001, + "loss": 5058.5923, + "loss/crossentropy": 2.6247942447662354, + "loss/hidden": 0.53125, + "loss/logits": 0.09891894459724426, + "loss/reg": 5057.9619140625, + "step": 1470 + }, + { + "epoch": 0.036775, + "grad_norm": 1.1037302017211914, + "grad_norm_var": 4.211825607138292, + "learning_rate": 0.0001, + "loss": 5047.3931, + "loss/crossentropy": 3.0050926208496094, + "loss/hidden": 0.54296875, + "loss/logits": 0.12298932671546936, + "loss/reg": 5046.72705078125, + "step": 1471 + }, + { + "epoch": 0.0368, + "grad_norm": 1.3930877447128296, + "grad_norm_var": 4.167055799482511, + "learning_rate": 0.0001, + "loss": 5035.3618, + "loss/crossentropy": 2.867047071456909, + "loss/hidden": 0.546875, + "loss/logits": 0.09695276618003845, + "loss/reg": 5034.7177734375, + "step": 1472 + }, + { + "epoch": 0.036825, + "grad_norm": 0.9566633701324463, + "grad_norm_var": 4.157774262940288, + "learning_rate": 0.0001, + "loss": 5024.1187, + "loss/crossentropy": 2.724266767501831, + "loss/hidden": 0.5234375, + "loss/logits": 0.09489968419075012, + "loss/reg": 5023.50048828125, + "step": 1473 + }, + { + "epoch": 0.03685, + "grad_norm": 1.1047159433364868, + "grad_norm_var": 4.135769850467174, + "learning_rate": 0.0001, + "loss": 5013.0962, + "loss/crossentropy": 2.5027365684509277, + "loss/hidden": 0.51953125, + "loss/logits": 0.09591224044561386, + "loss/reg": 5012.48095703125, + "step": 1474 + }, + { + "epoch": 0.036875, + "grad_norm": 1.4689037799835205, + "grad_norm_var": 4.105776990056344, + "learning_rate": 0.0001, + "loss": 5001.6812, + "loss/crossentropy": 2.7909207344055176, + "loss/hidden": 0.54296875, + "loss/logits": 0.1282854527235031, + "loss/reg": 5001.009765625, + "step": 1475 + }, + { + "epoch": 0.0369, + "grad_norm": 1.0739867687225342, + "grad_norm_var": 3.855824374802542, + "learning_rate": 0.0001, + "loss": 4990.5381, + "loss/crossentropy": 2.8297784328460693, + "loss/hidden": 0.51171875, + "loss/logits": 0.08772643655538559, + "loss/reg": 4989.9384765625, + "step": 1476 + }, + { + "epoch": 0.036925, + "grad_norm": 1.1636221408843994, + "grad_norm_var": 3.863658397032923, + "learning_rate": 0.0001, + "loss": 4979.9316, + "loss/crossentropy": 3.1510634422302246, + "loss/hidden": 0.54296875, + "loss/logits": 0.1621702015399933, + "loss/reg": 4979.2265625, + "step": 1477 + }, + { + "epoch": 0.03695, + "grad_norm": 0.8736411929130554, + "grad_norm_var": 0.06906679248819637, + "learning_rate": 0.0001, + "loss": 4969.0767, + "loss/crossentropy": 2.646597146987915, + "loss/hidden": 0.5078125, + "loss/logits": 0.09061123430728912, + "loss/reg": 4968.47802734375, + "step": 1478 + }, + { + "epoch": 0.036975, + "grad_norm": 1.014277458190918, + "grad_norm_var": 0.06918584750399195, + "learning_rate": 0.0001, + "loss": 4958.5859, + "loss/crossentropy": 2.814887762069702, + "loss/hidden": 0.50390625, + "loss/logits": 0.1120857298374176, + "loss/reg": 4957.9697265625, + "step": 1479 + }, + { + "epoch": 0.037, + "grad_norm": 1.068294882774353, + "grad_norm_var": 0.06257783444038457, + "learning_rate": 0.0001, + "loss": 4947.8711, + "loss/crossentropy": 2.810455799102783, + "loss/hidden": 0.53515625, + "loss/logits": 0.12479346990585327, + "loss/reg": 4947.2109375, + "step": 1480 + }, + { + "epoch": 0.037025, + "grad_norm": 0.9667389392852783, + "grad_norm_var": 0.061900209019312945, + "learning_rate": 0.0001, + "loss": 4937.2061, + "loss/crossentropy": 2.9907290935516357, + "loss/hidden": 0.5, + "loss/logits": 0.09580424427986145, + "loss/reg": 4936.6103515625, + "step": 1481 + }, + { + "epoch": 0.03705, + "grad_norm": 0.806204617023468, + "grad_norm_var": 0.06616454229393133, + "learning_rate": 0.0001, + "loss": 4926.8564, + "loss/crossentropy": 2.8602395057678223, + "loss/hidden": 0.494140625, + "loss/logits": 0.10172467678785324, + "loss/reg": 4926.2607421875, + "step": 1482 + }, + { + "epoch": 0.037075, + "grad_norm": 1.797229290008545, + "grad_norm_var": 0.09260782793659808, + "learning_rate": 0.0001, + "loss": 4916.8042, + "loss/crossentropy": 3.3809680938720703, + "loss/hidden": 0.5625, + "loss/logits": 0.15680980682373047, + "loss/reg": 4916.0849609375, + "step": 1483 + }, + { + "epoch": 0.0371, + "grad_norm": 0.965241014957428, + "grad_norm_var": 0.09548839779796195, + "learning_rate": 0.0001, + "loss": 4906.4863, + "loss/crossentropy": 2.7549846172332764, + "loss/hidden": 0.5234375, + "loss/logits": 0.10236407816410065, + "loss/reg": 4905.8603515625, + "step": 1484 + }, + { + "epoch": 0.037125, + "grad_norm": 0.9277069568634033, + "grad_norm_var": 0.06289725839143258, + "learning_rate": 0.0001, + "loss": 4896.3291, + "loss/crossentropy": 2.7426342964172363, + "loss/hidden": 0.515625, + "loss/logits": 0.10069979727268219, + "loss/reg": 4895.712890625, + "step": 1485 + }, + { + "epoch": 0.03715, + "grad_norm": 1.094347357749939, + "grad_norm_var": 0.0628652724996159, + "learning_rate": 0.0001, + "loss": 4885.9609, + "loss/crossentropy": 2.6395647525787354, + "loss/hidden": 0.5, + "loss/logits": 0.0898917019367218, + "loss/reg": 4885.37109375, + "step": 1486 + }, + { + "epoch": 0.037175, + "grad_norm": 1.3503471612930298, + "grad_norm_var": 0.0664225551921736, + "learning_rate": 0.0001, + "loss": 4875.8916, + "loss/crossentropy": 2.944330930709839, + "loss/hidden": 0.5390625, + "loss/logits": 0.16826876997947693, + "loss/reg": 4875.18408203125, + "step": 1487 + }, + { + "epoch": 0.0372, + "grad_norm": 1.0866377353668213, + "grad_norm_var": 0.0614018308856836, + "learning_rate": 0.0001, + "loss": 4865.8545, + "loss/crossentropy": 2.8836276531219482, + "loss/hidden": 0.5625, + "loss/logits": 0.11985873430967331, + "loss/reg": 4865.17236328125, + "step": 1488 + }, + { + "epoch": 0.037225, + "grad_norm": 1.2594707012176514, + "grad_norm_var": 0.06104631059139612, + "learning_rate": 0.0001, + "loss": 4854.1958, + "loss/crossentropy": 3.3286149501800537, + "loss/hidden": 0.546875, + "loss/logits": 0.1134444922208786, + "loss/reg": 4853.53564453125, + "step": 1489 + }, + { + "epoch": 0.03725, + "grad_norm": 1.0488533973693848, + "grad_norm_var": 0.06140237843236241, + "learning_rate": 0.0001, + "loss": 4843.4141, + "loss/crossentropy": 2.967571973800659, + "loss/hidden": 0.5234375, + "loss/logits": 0.10797536373138428, + "loss/reg": 4842.78271484375, + "step": 1490 + }, + { + "epoch": 0.037275, + "grad_norm": 1.0009063482284546, + "grad_norm_var": 0.05349721406061207, + "learning_rate": 0.0001, + "loss": 4833.2422, + "loss/crossentropy": 2.8375730514526367, + "loss/hidden": 0.52734375, + "loss/logits": 0.10792994499206543, + "loss/reg": 4832.60693359375, + "step": 1491 + }, + { + "epoch": 0.0373, + "grad_norm": 0.8741068840026855, + "grad_norm_var": 0.0565167608334434, + "learning_rate": 0.0001, + "loss": 4823.21, + "loss/crossentropy": 2.7234745025634766, + "loss/hidden": 0.51171875, + "loss/logits": 0.10996709764003754, + "loss/reg": 4822.58837890625, + "step": 1492 + }, + { + "epoch": 0.037325, + "grad_norm": 0.8451005220413208, + "grad_norm_var": 0.05935315328759002, + "learning_rate": 0.0001, + "loss": 4813.2583, + "loss/crossentropy": 2.9505717754364014, + "loss/hidden": 0.494140625, + "loss/logits": 0.10429343581199646, + "loss/reg": 4812.65966796875, + "step": 1493 + }, + { + "epoch": 0.03735, + "grad_norm": 0.8686822652816772, + "grad_norm_var": 0.05947869834809385, + "learning_rate": 0.0001, + "loss": 4803.2378, + "loss/crossentropy": 2.901232957839966, + "loss/hidden": 0.51953125, + "loss/logits": 0.09702814370393753, + "loss/reg": 4802.62109375, + "step": 1494 + }, + { + "epoch": 0.037375, + "grad_norm": 0.7863928079605103, + "grad_norm_var": 0.0641405366837444, + "learning_rate": 0.0001, + "loss": 4793.2183, + "loss/crossentropy": 2.769129991531372, + "loss/hidden": 0.494140625, + "loss/logits": 0.09640781581401825, + "loss/reg": 4792.6279296875, + "step": 1495 + }, + { + "epoch": 0.0374, + "grad_norm": 0.8319947719573975, + "grad_norm_var": 0.06694816320778324, + "learning_rate": 0.0001, + "loss": 4783.624, + "loss/crossentropy": 2.732968330383301, + "loss/hidden": 0.4921875, + "loss/logits": 0.1065090000629425, + "loss/reg": 4783.025390625, + "step": 1496 + }, + { + "epoch": 0.037425, + "grad_norm": 1.0754128694534302, + "grad_norm_var": 0.06674251294971753, + "learning_rate": 0.0001, + "loss": 4774.0024, + "loss/crossentropy": 2.5251784324645996, + "loss/hidden": 0.5078125, + "loss/logits": 0.09990575909614563, + "loss/reg": 4773.39453125, + "step": 1497 + }, + { + "epoch": 0.03745, + "grad_norm": 0.9813482761383057, + "grad_norm_var": 0.06323119874060669, + "learning_rate": 0.0001, + "loss": 4764.0625, + "loss/crossentropy": 2.790985584259033, + "loss/hidden": 0.52734375, + "loss/logits": 0.11107774078845978, + "loss/reg": 4763.42431640625, + "step": 1498 + }, + { + "epoch": 0.037475, + "grad_norm": 0.8058618903160095, + "grad_norm_var": 0.025834880792797735, + "learning_rate": 0.0001, + "loss": 4754.4912, + "loss/crossentropy": 2.819187879562378, + "loss/hidden": 0.490234375, + "loss/logits": 0.10257354378700256, + "loss/reg": 4753.8984375, + "step": 1499 + }, + { + "epoch": 0.0375, + "grad_norm": 0.9984162449836731, + "grad_norm_var": 0.02580454190790243, + "learning_rate": 0.0001, + "loss": 4745.0625, + "loss/crossentropy": 2.828315496444702, + "loss/hidden": 0.5, + "loss/logits": 0.10384377837181091, + "loss/reg": 4744.45849609375, + "step": 1500 + } + ], + "logging_steps": 1, + "max_steps": 40000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": true, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.683532251136e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}