{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.008241509185161987, "eval_steps": 2000, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 8.241509185161987e-06, "grad_norm": 780.0, "learning_rate": 5e-05, "loss": 22.7489, "loss/crossentropy": 8.68287467956543, "loss/dist_ce": 0.0, "loss/hidden": 16.25, "loss/idx": 0.0, "loss/logits": 6.498888969421387, "step": 1 }, { "epoch": 1.6483018370323974e-05, "grad_norm": 824.0, "learning_rate": 5e-05, "loss": 18.5076, "loss/crossentropy": 8.787271499633789, "loss/dist_ce": 0.0, "loss/hidden": 12.625, "loss/idx": 0.0, "loss/logits": 5.8825788497924805, "step": 2 }, { "epoch": 2.472452755548596e-05, "grad_norm": 466.0, "learning_rate": 5e-05, "loss": 12.0241, "loss/crossentropy": 7.810218334197998, "loss/dist_ce": 0.0, "loss/hidden": 7.21875, "loss/idx": 0.0, "loss/logits": 4.805373191833496, "step": 3 }, { "epoch": 3.296603674064795e-05, "grad_norm": 215.0, "learning_rate": 5e-05, "loss": 9.4126, "loss/crossentropy": 6.043552398681641, "loss/dist_ce": 0.0, "loss/hidden": 6.25, "loss/idx": 0.0, "loss/logits": 3.1625852584838867, "step": 4 }, { "epoch": 4.1207545925809937e-05, "grad_norm": 468.0, "learning_rate": 5e-05, "loss": 6.2853, "loss/crossentropy": 4.783352851867676, "loss/dist_ce": 0.0, "loss/hidden": 4.21875, "loss/idx": 0.0, "loss/logits": 2.066528558731079, "step": 5 }, { "epoch": 4.944905511097192e-05, "grad_norm": 306.0, "learning_rate": 5e-05, "loss": 5.4625, "loss/crossentropy": 1.6133296489715576, "loss/dist_ce": 0.0, "loss/hidden": 4.1875, "loss/idx": 0.0, "loss/logits": 1.2749497890472412, "step": 6 }, { "epoch": 5.769056429613391e-05, "grad_norm": 217.0, "learning_rate": 5e-05, "loss": 8.1947, "loss/crossentropy": 4.63270378112793, "loss/dist_ce": 0.0, "loss/hidden": 5.78125, "loss/idx": 0.0, "loss/logits": 2.4134607315063477, "step": 7 }, { "epoch": 6.59320734812959e-05, "grad_norm": 404.0, "learning_rate": 5e-05, "loss": 5.0477, "loss/crossentropy": 4.424153804779053, "loss/dist_ce": 0.0, "loss/hidden": 3.53125, "loss/idx": 0.0, "loss/logits": 1.5164613723754883, "step": 8 }, { "epoch": 7.417358266645788e-05, "grad_norm": 83.5, "learning_rate": 5e-05, "loss": 3.1549, "loss/crossentropy": 3.354282855987549, "loss/dist_ce": 0.0, "loss/hidden": 2.296875, "loss/idx": 0.0, "loss/logits": 0.8579829931259155, "step": 9 }, { "epoch": 8.241509185161987e-05, "grad_norm": 115.5, "learning_rate": 5e-05, "loss": 3.1588, "loss/crossentropy": 3.1871225833892822, "loss/dist_ce": 0.0, "loss/hidden": 2.375, "loss/idx": 0.0, "loss/logits": 0.7837648391723633, "step": 10 }, { "epoch": 9.065660103678186e-05, "grad_norm": 252.0, "learning_rate": 5e-05, "loss": 7.2603, "loss/crossentropy": 4.682134628295898, "loss/dist_ce": 0.0, "loss/hidden": 5.0, "loss/idx": 0.0, "loss/logits": 2.2602720260620117, "step": 11 }, { "epoch": 9.889811022194384e-05, "grad_norm": 109.0, "learning_rate": 5e-05, "loss": 3.1302, "loss/crossentropy": 2.417746067047119, "loss/dist_ce": 0.0, "loss/hidden": 2.46875, "loss/idx": 0.0, "loss/logits": 0.661416232585907, "step": 12 }, { "epoch": 0.00010713961940710583, "grad_norm": 68.5, "learning_rate": 5e-05, "loss": 2.4003, "loss/crossentropy": 1.6968345642089844, "loss/dist_ce": 0.0, "loss/hidden": 1.984375, "loss/idx": 0.0, "loss/logits": 0.4158973693847656, "step": 13 }, { "epoch": 0.00011538112859226781, "grad_norm": 454.0, "learning_rate": 5e-05, "loss": 7.3347, "loss/crossentropy": 4.652151584625244, "loss/dist_ce": 0.0, "loss/hidden": 4.25, "loss/idx": 0.0, "loss/logits": 3.084686756134033, "step": 14 }, { "epoch": 0.00012362263777742982, "grad_norm": 126.5, "learning_rate": 5e-05, "loss": 2.4695, "loss/crossentropy": 3.0716333389282227, "loss/dist_ce": 0.0, "loss/hidden": 1.875, "loss/idx": 0.0, "loss/logits": 0.594476580619812, "step": 15 }, { "epoch": 0.0001318641469625918, "grad_norm": 306.0, "grad_norm_var": 53540.9625, "learning_rate": 5e-05, "loss": 3.9132, "loss/crossentropy": 2.430070638656616, "loss/dist_ce": 0.0, "loss/hidden": 2.9375, "loss/idx": 0.0, "loss/logits": 0.9756777882575989, "step": 16 }, { "epoch": 0.00014010565614775377, "grad_norm": 68.5, "grad_norm_var": 41986.49895833333, "learning_rate": 5e-05, "loss": 2.3829, "loss/crossentropy": 1.8029091358184814, "loss/dist_ce": 0.0, "loss/hidden": 1.8828125, "loss/idx": 0.0, "loss/logits": 0.5000446438789368, "step": 17 }, { "epoch": 0.00014834716533291577, "grad_norm": 139.0, "grad_norm_var": 21647.707291666666, "learning_rate": 5e-05, "loss": 2.5801, "loss/crossentropy": 1.5956979990005493, "loss/dist_ce": 0.0, "loss/hidden": 2.15625, "loss/idx": 0.0, "loss/logits": 0.4238685965538025, "step": 18 }, { "epoch": 0.00015658867451807774, "grad_norm": 28.625, "grad_norm_var": 20272.937434895834, "learning_rate": 5e-05, "loss": 1.6796, "loss/crossentropy": 2.664867401123047, "loss/dist_ce": 0.0, "loss/hidden": 1.296875, "loss/idx": 0.0, "loss/logits": 0.3827553689479828, "step": 19 }, { "epoch": 0.00016483018370323975, "grad_norm": 81.5, "grad_norm_var": 21299.079622395835, "learning_rate": 5e-05, "loss": 1.8862, "loss/crossentropy": 3.0564301013946533, "loss/dist_ce": 0.0, "loss/hidden": 1.4609375, "loss/idx": 0.0, "loss/logits": 0.42524370551109314, "step": 20 }, { "epoch": 0.00017307169288840172, "grad_norm": 71.5, "grad_norm_var": 17047.856184895834, "learning_rate": 5e-05, "loss": 1.8175, "loss/crossentropy": 1.5220972299575806, "loss/dist_ce": 0.0, "loss/hidden": 1.578125, "loss/idx": 0.0, "loss/logits": 0.23937611281871796, "step": 21 }, { "epoch": 0.00018131320207356372, "grad_norm": 32.25, "grad_norm_var": 17021.051497395834, "learning_rate": 5e-05, "loss": 1.636, "loss/crossentropy": 1.8798402547836304, "loss/dist_ce": 0.0, "loss/hidden": 1.3125, "loss/idx": 0.0, "loss/logits": 0.32345157861709595, "step": 22 }, { "epoch": 0.0001895547112587257, "grad_norm": 31.25, "grad_norm_var": 17761.729622395833, "learning_rate": 5e-05, "loss": 1.5306, "loss/crossentropy": 3.0712087154388428, "loss/dist_ce": 0.0, "loss/hidden": 1.171875, "loss/idx": 0.0, "loss/logits": 0.3587738275527954, "step": 23 }, { "epoch": 0.00019779622044388767, "grad_norm": 13.375, "grad_norm_var": 13976.939583333333, "learning_rate": 5e-05, "loss": 1.0326, "loss/crossentropy": 2.2200183868408203, "loss/dist_ce": 0.0, "loss/hidden": 0.8125, "loss/idx": 0.0, "loss/logits": 0.2200760841369629, "step": 24 }, { "epoch": 0.00020603772962904968, "grad_norm": 28.125, "grad_norm_var": 14466.229622395833, "learning_rate": 5e-05, "loss": 1.509, "loss/crossentropy": 3.206345319747925, "loss/dist_ce": 0.0, "loss/hidden": 1.1484375, "loss/idx": 0.0, "loss/logits": 0.3605613112449646, "step": 25 }, { "epoch": 0.00021427923881421165, "grad_norm": 153.0, "grad_norm_var": 14529.862434895833, "learning_rate": 5e-05, "loss": 2.1474, "loss/crossentropy": 1.5313490629196167, "loss/dist_ce": 0.0, "loss/hidden": 1.765625, "loss/idx": 0.0, "loss/logits": 0.38178950548171997, "step": 26 }, { "epoch": 0.00022252074799937365, "grad_norm": 94.0, "grad_norm_var": 13366.093684895834, "learning_rate": 5e-05, "loss": 3.417, "loss/crossentropy": 1.551514744758606, "loss/dist_ce": 0.0, "loss/hidden": 2.75, "loss/idx": 0.0, "loss/logits": 0.6670438051223755, "step": 27 }, { "epoch": 0.00023076225718453563, "grad_norm": 268.0, "grad_norm_var": 14865.165559895833, "learning_rate": 5e-05, "loss": 1.9003, "loss/crossentropy": 3.108414649963379, "loss/dist_ce": 0.0, "loss/hidden": 1.421875, "loss/idx": 0.0, "loss/logits": 0.4784301221370697, "step": 28 }, { "epoch": 0.00023900376636969763, "grad_norm": 34.25, "grad_norm_var": 15186.259309895833, "learning_rate": 5e-05, "loss": 1.0626, "loss/crossentropy": 3.3259568214416504, "loss/dist_ce": 0.0, "loss/hidden": 0.81640625, "loss/idx": 0.0, "loss/logits": 0.24623815715312958, "step": 29 }, { "epoch": 0.00024724527555485963, "grad_norm": 167.0, "grad_norm_var": 7576.8728515625, "learning_rate": 5e-05, "loss": 1.4178, "loss/crossentropy": 1.5920592546463013, "loss/dist_ce": 0.0, "loss/hidden": 1.28125, "loss/idx": 0.0, "loss/logits": 0.13655498623847961, "step": 30 }, { "epoch": 0.0002554867847400216, "grad_norm": 12.6875, "grad_norm_var": 8024.979931640625, "learning_rate": 5e-05, "loss": 0.9625, "loss/crossentropy": 2.868499517440796, "loss/dist_ce": 0.0, "loss/hidden": 0.7421875, "loss/idx": 0.0, "loss/logits": 0.2203603982925415, "step": 31 }, { "epoch": 0.0002637282939251836, "grad_norm": 49.5, "grad_norm_var": 4940.166650390625, "learning_rate": 5e-05, "loss": 1.0668, "loss/crossentropy": 2.660956859588623, "loss/dist_ce": 0.0, "loss/hidden": 0.8359375, "loss/idx": 0.0, "loss/logits": 0.23084667325019836, "step": 32 }, { "epoch": 0.0002719698031103456, "grad_norm": 18.875, "grad_norm_var": 5167.097639973958, "learning_rate": 5e-05, "loss": 0.9463, "loss/crossentropy": 1.6037225723266602, "loss/dist_ce": 0.0, "loss/hidden": 0.8125, "loss/idx": 0.0, "loss/logits": 0.13378173112869263, "step": 33 }, { "epoch": 0.00028021131229550753, "grad_norm": 18.75, "grad_norm_var": 5067.703499348959, "learning_rate": 5e-05, "loss": 1.0141, "loss/crossentropy": 1.0409276485443115, "loss/dist_ce": 0.0, "loss/hidden": 0.84375, "loss/idx": 0.0, "loss/logits": 0.1703900545835495, "step": 34 }, { "epoch": 0.00028845282148066954, "grad_norm": 16.5, "grad_norm_var": 5142.032275390625, "learning_rate": 5e-05, "loss": 0.9889, "loss/crossentropy": 1.5536582469940186, "loss/dist_ce": 0.0, "loss/hidden": 0.8359375, "loss/idx": 0.0, "loss/logits": 0.1529390513896942, "step": 35 }, { "epoch": 0.00029669433066583154, "grad_norm": 9.8125, "grad_norm_var": 5335.719205729167, "learning_rate": 5e-05, "loss": 0.8687, "loss/crossentropy": 2.7224836349487305, "loss/dist_ce": 0.0, "loss/hidden": 0.671875, "loss/idx": 0.0, "loss/logits": 0.19684143364429474, "step": 36 }, { "epoch": 0.00030493583985099354, "grad_norm": 7.125, "grad_norm_var": 5527.603645833334, "learning_rate": 5e-05, "loss": 0.5503, "loss/crossentropy": 2.5596024990081787, "loss/dist_ce": 0.0, "loss/hidden": 0.4296875, "loss/idx": 0.0, "loss/logits": 0.12058012187480927, "step": 37 }, { "epoch": 0.0003131773490361555, "grad_norm": 13.1875, "grad_norm_var": 5619.972379557292, "learning_rate": 5e-05, "loss": 0.8248, "loss/crossentropy": 2.8074352741241455, "loss/dist_ce": 0.0, "loss/hidden": 0.62109375, "loss/idx": 0.0, "loss/logits": 0.2037278115749359, "step": 38 }, { "epoch": 0.0003214188582213175, "grad_norm": 37.75, "grad_norm_var": 5599.026806640625, "learning_rate": 5e-05, "loss": 1.3462, "loss/crossentropy": 1.5375018119812012, "loss/dist_ce": 0.0, "loss/hidden": 1.15625, "loss/idx": 0.0, "loss/logits": 0.18997883796691895, "step": 39 }, { "epoch": 0.0003296603674064795, "grad_norm": 7.28125, "grad_norm_var": 5638.313244628906, "learning_rate": 5e-05, "loss": 0.6618, "loss/crossentropy": 2.4615395069122314, "loss/dist_ce": 0.0, "loss/hidden": 0.51171875, "loss/idx": 0.0, "loss/logits": 0.15010175108909607, "step": 40 }, { "epoch": 0.00033790187659164144, "grad_norm": 68.5, "grad_norm_var": 5576.730855305989, "learning_rate": 5e-05, "loss": 0.7963, "loss/crossentropy": 1.1309521198272705, "loss/dist_ce": 0.0, "loss/hidden": 0.6796875, "loss/idx": 0.0, "loss/logits": 0.11660157144069672, "step": 41 }, { "epoch": 0.00034614338577680344, "grad_norm": 10.0625, "grad_norm_var": 5100.57030843099, "learning_rate": 5e-05, "loss": 0.6657, "loss/crossentropy": 2.225135326385498, "loss/dist_ce": 0.0, "loss/hidden": 0.54296875, "loss/idx": 0.0, "loss/logits": 0.12272368371486664, "step": 42 }, { "epoch": 0.00035438489496196544, "grad_norm": 15.125, "grad_norm_var": 5048.541564941406, "learning_rate": 5e-05, "loss": 0.7119, "loss/crossentropy": 0.907244861125946, "loss/dist_ce": 0.0, "loss/hidden": 0.6328125, "loss/idx": 0.0, "loss/logits": 0.07912808656692505, "step": 43 }, { "epoch": 0.00036262640414712745, "grad_norm": 11.25, "grad_norm_var": 1608.158426920573, "learning_rate": 5e-05, "loss": 0.757, "loss/crossentropy": 1.7073473930358887, "loss/dist_ce": 0.0, "loss/hidden": 0.62109375, "loss/idx": 0.0, "loss/logits": 0.1358700692653656, "step": 44 }, { "epoch": 0.0003708679133322894, "grad_norm": 8.3125, "grad_norm_var": 1639.323954264323, "learning_rate": 5e-05, "loss": 0.7863, "loss/crossentropy": 2.7458887100219727, "loss/dist_ce": 0.0, "loss/hidden": 0.62890625, "loss/idx": 0.0, "loss/logits": 0.15743763744831085, "step": 45 }, { "epoch": 0.0003791094225174514, "grad_norm": 7.15625, "grad_norm_var": 305.3570963541667, "learning_rate": 5e-05, "loss": 0.7888, "loss/crossentropy": 3.2708899974823, "loss/dist_ce": 0.0, "loss/hidden": 0.58984375, "loss/idx": 0.0, "loss/logits": 0.19896197319030762, "step": 46 }, { "epoch": 0.0003873509317026134, "grad_norm": 95.5, "grad_norm_var": 658.8413899739584, "learning_rate": 5e-05, "loss": 1.2412, "loss/crossentropy": 2.0868113040924072, "loss/dist_ce": 0.0, "loss/hidden": 1.0, "loss/idx": 0.0, "loss/logits": 0.24115484952926636, "step": 47 }, { "epoch": 0.00039559244088777535, "grad_norm": 7.53125, "grad_norm_var": 629.9714803059895, "learning_rate": 5e-05, "loss": 0.7909, "loss/crossentropy": 2.5569632053375244, "loss/dist_ce": 0.0, "loss/hidden": 0.6171875, "loss/idx": 0.0, "loss/logits": 0.17373064160346985, "step": 48 }, { "epoch": 0.00040383395007293735, "grad_norm": 7.125, "grad_norm_var": 643.5665974934896, "learning_rate": 5e-05, "loss": 0.5926, "loss/crossentropy": 1.3575685024261475, "loss/dist_ce": 0.0, "loss/hidden": 0.486328125, "loss/idx": 0.0, "loss/logits": 0.10628513246774673, "step": 49 }, { "epoch": 0.00041207545925809935, "grad_norm": 14.0625, "grad_norm_var": 646.5402303059896, "learning_rate": 5e-05, "loss": 0.5641, "loss/crossentropy": 1.111220359802246, "loss/dist_ce": 0.0, "loss/hidden": 0.486328125, "loss/idx": 0.0, "loss/logits": 0.07779324799776077, "step": 50 }, { "epoch": 0.00042031696844326135, "grad_norm": 5.375, "grad_norm_var": 660.9766560872396, "learning_rate": 5e-05, "loss": 0.622, "loss/crossentropy": 2.907522678375244, "loss/dist_ce": 0.0, "loss/hidden": 0.498046875, "loss/idx": 0.0, "loss/logits": 0.12399697303771973, "step": 51 }, { "epoch": 0.0004285584776284233, "grad_norm": 9.75, "grad_norm_var": 661.0644816080729, "learning_rate": 5e-05, "loss": 0.5746, "loss/crossentropy": 2.72662353515625, "loss/dist_ce": 0.0, "loss/hidden": 0.4453125, "loss/idx": 0.0, "loss/logits": 0.12924730777740479, "step": 52 }, { "epoch": 0.0004367999868135853, "grad_norm": 4.9375, "grad_norm_var": 665.2116170247396, "learning_rate": 5e-05, "loss": 0.6226, "loss/crossentropy": 2.3917365074157715, "loss/dist_ce": 0.0, "loss/hidden": 0.498046875, "loss/idx": 0.0, "loss/logits": 0.1245586946606636, "step": 53 }, { "epoch": 0.0004450414959987473, "grad_norm": 28.0, "grad_norm_var": 665.1113240559896, "learning_rate": 5e-05, "loss": 0.8407, "loss/crossentropy": 2.7228264808654785, "loss/dist_ce": 0.0, "loss/hidden": 0.65234375, "loss/idx": 0.0, "loss/logits": 0.18836885690689087, "step": 54 }, { "epoch": 0.00045328300518390925, "grad_norm": 32.5, "grad_norm_var": 655.1841756184896, "learning_rate": 5e-05, "loss": 0.6963, "loss/crossentropy": 2.543640375137329, "loss/dist_ce": 0.0, "loss/hidden": 0.56640625, "loss/idx": 0.0, "loss/logits": 0.12986385822296143, "step": 55 }, { "epoch": 0.00046152451436907126, "grad_norm": 16.875, "grad_norm_var": 643.6704264322917, "learning_rate": 5e-05, "loss": 0.7344, "loss/crossentropy": 1.649795413017273, "loss/dist_ce": 0.0, "loss/hidden": 0.609375, "loss/idx": 0.0, "loss/logits": 0.12504054605960846, "step": 56 }, { "epoch": 0.00046976602355423326, "grad_norm": 28.0, "grad_norm_var": 491.7321451822917, "learning_rate": 5e-05, "loss": 0.7995, "loss/crossentropy": 1.5515227317810059, "loss/dist_ce": 0.0, "loss/hidden": 0.6796875, "loss/idx": 0.0, "loss/logits": 0.11983367055654526, "step": 57 }, { "epoch": 0.00047800753273939526, "grad_norm": 4.15625, "grad_norm_var": 500.8306925455729, "learning_rate": 5e-05, "loss": 0.4793, "loss/crossentropy": 1.7439237833023071, "loss/dist_ce": 0.0, "loss/hidden": 0.388671875, "loss/idx": 0.0, "loss/logits": 0.09059557318687439, "step": 58 }, { "epoch": 0.0004862490419245572, "grad_norm": 14.5, "grad_norm_var": 501.1345662434896, "learning_rate": 5e-05, "loss": 1.0179, "loss/crossentropy": 1.387863039970398, "loss/dist_ce": 0.0, "loss/hidden": 0.8828125, "loss/idx": 0.0, "loss/logits": 0.1351165473461151, "step": 59 }, { "epoch": 0.0004944905511097193, "grad_norm": 8.0625, "grad_norm_var": 504.82509358723956, "learning_rate": 5e-05, "loss": 0.5969, "loss/crossentropy": 1.6710844039916992, "loss/dist_ce": 0.0, "loss/hidden": 0.490234375, "loss/idx": 0.0, "loss/logits": 0.10665580630302429, "step": 60 }, { "epoch": 0.0005027320602948812, "grad_norm": 18.0, "grad_norm_var": 497.86724853515625, "learning_rate": 5e-05, "loss": 0.5408, "loss/crossentropy": 1.0266728401184082, "loss/dist_ce": 0.0, "loss/hidden": 0.47265625, "loss/idx": 0.0, "loss/logits": 0.068178191781044, "step": 61 }, { "epoch": 0.0005109735694800432, "grad_norm": 7.46875, "grad_norm_var": 497.38629150390625, "learning_rate": 5e-05, "loss": 0.8584, "loss/crossentropy": 2.9015908241271973, "loss/dist_ce": 0.0, "loss/hidden": 0.6796875, "loss/idx": 0.0, "loss/logits": 0.1787503957748413, "step": 62 }, { "epoch": 0.0005192150786652052, "grad_norm": 7.84375, "grad_norm_var": 81.943603515625, "learning_rate": 5e-05, "loss": 0.4871, "loss/crossentropy": 1.605463981628418, "loss/dist_ce": 0.0, "loss/hidden": 0.40234375, "loss/idx": 0.0, "loss/logits": 0.08476820588111877, "step": 63 }, { "epoch": 0.0005274565878503672, "grad_norm": 6.3125, "grad_norm_var": 82.98795166015626, "learning_rate": 5e-05, "loss": 0.4591, "loss/crossentropy": 1.7012584209442139, "loss/dist_ce": 0.0, "loss/hidden": 0.373046875, "loss/idx": 0.0, "loss/logits": 0.0860566645860672, "step": 64 }, { "epoch": 0.0005356980970355292, "grad_norm": 8.625, "grad_norm_var": 81.89146728515625, "learning_rate": 5e-05, "loss": 0.513, "loss/crossentropy": 1.6209317445755005, "loss/dist_ce": 0.0, "loss/hidden": 0.416015625, "loss/idx": 0.0, "loss/logits": 0.09696009755134583, "step": 65 }, { "epoch": 0.0005439396062206912, "grad_norm": 5.0, "grad_norm_var": 86.22919514973958, "learning_rate": 5e-05, "loss": 0.4936, "loss/crossentropy": 2.992037773132324, "loss/dist_ce": 0.0, "loss/hidden": 0.37890625, "loss/idx": 0.0, "loss/logits": 0.114667147397995, "step": 66 }, { "epoch": 0.0005521811154058532, "grad_norm": 10.5625, "grad_norm_var": 82.74924723307292, "learning_rate": 5e-05, "loss": 0.9616, "loss/crossentropy": 2.2757253646850586, "loss/dist_ce": 0.0, "loss/hidden": 0.75, "loss/idx": 0.0, "loss/logits": 0.21162353456020355, "step": 67 }, { "epoch": 0.0005604226245910151, "grad_norm": 6.0625, "grad_norm_var": 85.27672119140625, "learning_rate": 5e-05, "loss": 0.3899, "loss/crossentropy": 0.7420970797538757, "loss/dist_ce": 0.0, "loss/hidden": 0.34765625, "loss/idx": 0.0, "loss/logits": 0.04225603863596916, "step": 68 }, { "epoch": 0.0005686641337761771, "grad_norm": 4.78125, "grad_norm_var": 85.44479166666666, "learning_rate": 5e-05, "loss": 0.3729, "loss/crossentropy": 1.957132339477539, "loss/dist_ce": 0.0, "loss/hidden": 0.296875, "loss/idx": 0.0, "loss/logits": 0.07599128782749176, "step": 69 }, { "epoch": 0.0005769056429613391, "grad_norm": 8.875, "grad_norm_var": 69.85592447916666, "learning_rate": 5e-05, "loss": 0.5605, "loss/crossentropy": 2.908198356628418, "loss/dist_ce": 0.0, "loss/hidden": 0.4453125, "loss/idx": 0.0, "loss/logits": 0.1151949018239975, "step": 70 }, { "epoch": 0.0005851471521465011, "grad_norm": 6.125, "grad_norm_var": 40.280208333333334, "learning_rate": 5e-05, "loss": 0.4273, "loss/crossentropy": 1.8814876079559326, "loss/dist_ce": 0.0, "loss/hidden": 0.33984375, "loss/idx": 0.0, "loss/logits": 0.08749841153621674, "step": 71 }, { "epoch": 0.0005933886613316631, "grad_norm": 3.625, "grad_norm_var": 39.245052083333334, "learning_rate": 5e-05, "loss": 0.3375, "loss/crossentropy": 1.5110681056976318, "loss/dist_ce": 0.0, "loss/hidden": 0.275390625, "loss/idx": 0.0, "loss/logits": 0.062085069715976715, "step": 72 }, { "epoch": 0.0006016301705168251, "grad_norm": 4.09375, "grad_norm_var": 15.198726399739583, "learning_rate": 5e-05, "loss": 0.3922, "loss/crossentropy": 2.652179479598999, "loss/dist_ce": 0.0, "loss/hidden": 0.306640625, "loss/idx": 0.0, "loss/logits": 0.085569366812706, "step": 73 }, { "epoch": 0.0006098716797019871, "grad_norm": 12.0, "grad_norm_var": 15.279410807291667, "learning_rate": 5e-05, "loss": 0.7232, "loss/crossentropy": 1.3632968664169312, "loss/dist_ce": 0.0, "loss/hidden": 0.625, "loss/idx": 0.0, "loss/logits": 0.09823663532733917, "step": 74 }, { "epoch": 0.000618113188887149, "grad_norm": 3.15625, "grad_norm_var": 13.862919108072917, "learning_rate": 5e-05, "loss": 0.3533, "loss/crossentropy": 1.3768854141235352, "loss/dist_ce": 0.0, "loss/hidden": 0.30078125, "loss/idx": 0.0, "loss/logits": 0.05253131687641144, "step": 75 }, { "epoch": 0.000626354698072311, "grad_norm": 4.5, "grad_norm_var": 14.406571451822916, "learning_rate": 5e-05, "loss": 0.5053, "loss/crossentropy": 2.9618265628814697, "loss/dist_ce": 0.0, "loss/hidden": 0.373046875, "loss/idx": 0.0, "loss/logits": 0.13229554891586304, "step": 76 }, { "epoch": 0.000634596207257473, "grad_norm": 3.96875, "grad_norm_var": 6.720442708333334, "learning_rate": 5e-05, "loss": 0.4483, "loss/crossentropy": 1.52455472946167, "loss/dist_ce": 0.0, "loss/hidden": 0.3828125, "loss/idx": 0.0, "loss/logits": 0.065483957529068, "step": 77 }, { "epoch": 0.000642837716442635, "grad_norm": 3.40625, "grad_norm_var": 7.193343098958334, "learning_rate": 5e-05, "loss": 0.3286, "loss/crossentropy": 2.478041648864746, "loss/dist_ce": 0.0, "loss/hidden": 0.255859375, "loss/idx": 0.0, "loss/logits": 0.0727752074599266, "step": 78 }, { "epoch": 0.000651079225627797, "grad_norm": 3.765625, "grad_norm_var": 7.330077107747396, "learning_rate": 5e-05, "loss": 0.382, "loss/crossentropy": 1.4515001773834229, "loss/dist_ce": 0.0, "loss/hidden": 0.31640625, "loss/idx": 0.0, "loss/logits": 0.06562215089797974, "step": 79 }, { "epoch": 0.000659320734812959, "grad_norm": 3.265625, "grad_norm_var": 7.754378255208334, "learning_rate": 5e-05, "loss": 0.4323, "loss/crossentropy": 1.748592495918274, "loss/dist_ce": 0.0, "loss/hidden": 0.34375, "loss/idx": 0.0, "loss/logits": 0.08855777978897095, "step": 80 }, { "epoch": 0.000667562243998121, "grad_norm": 11.9375, "grad_norm_var": 9.71513671875, "learning_rate": 5e-05, "loss": 0.465, "loss/crossentropy": 2.6704611778259277, "loss/dist_ce": 0.0, "loss/hidden": 0.369140625, "loss/idx": 0.0, "loss/logits": 0.09580960124731064, "step": 81 }, { "epoch": 0.0006758037531832829, "grad_norm": 23.0, "grad_norm_var": 27.69638671875, "learning_rate": 5e-05, "loss": 0.8228, "loss/crossentropy": 0.5576035976409912, "loss/dist_ce": 0.0, "loss/hidden": 0.66796875, "loss/idx": 0.0, "loss/logits": 0.1548667550086975, "step": 82 }, { "epoch": 0.0006840452623684449, "grad_norm": 5.59375, "grad_norm_var": 26.92584228515625, "learning_rate": 5e-05, "loss": 0.4884, "loss/crossentropy": 0.8048841953277588, "loss/dist_ce": 0.0, "loss/hidden": 0.43359375, "loss/idx": 0.0, "loss/logits": 0.05478814244270325, "step": 83 }, { "epoch": 0.0006922867715536069, "grad_norm": 4.96875, "grad_norm_var": 27.102294921875, "learning_rate": 5e-05, "loss": 0.5603, "loss/crossentropy": 2.0498743057250977, "loss/dist_ce": 0.0, "loss/hidden": 0.46875, "loss/idx": 0.0, "loss/logits": 0.0915648490190506, "step": 84 }, { "epoch": 0.0007005282807387689, "grad_norm": 7.8125, "grad_norm_var": 26.90455322265625, "learning_rate": 5e-05, "loss": 0.5343, "loss/crossentropy": 1.643184781074524, "loss/dist_ce": 0.0, "loss/hidden": 0.44140625, "loss/idx": 0.0, "loss/logits": 0.09292187541723251, "step": 85 }, { "epoch": 0.0007087697899239309, "grad_norm": 5.34375, "grad_norm_var": 26.745003255208335, "learning_rate": 5e-05, "loss": 0.537, "loss/crossentropy": 2.662973165512085, "loss/dist_ce": 0.0, "loss/hidden": 0.439453125, "loss/idx": 0.0, "loss/logits": 0.09757896512746811, "step": 86 }, { "epoch": 0.0007170112991090929, "grad_norm": 2.4375, "grad_norm_var": 27.857975260416666, "learning_rate": 5e-05, "loss": 0.2826, "loss/crossentropy": 1.4633480310440063, "loss/dist_ce": 0.0, "loss/hidden": 0.236328125, "loss/idx": 0.0, "loss/logits": 0.0462251678109169, "step": 87 }, { "epoch": 0.0007252528082942549, "grad_norm": 7.21875, "grad_norm_var": 27.32125244140625, "learning_rate": 5e-05, "loss": 0.7004, "loss/crossentropy": 2.25626277923584, "loss/dist_ce": 0.0, "loss/hidden": 0.55859375, "loss/idx": 0.0, "loss/logits": 0.14179712533950806, "step": 88 }, { "epoch": 0.0007334943174794168, "grad_norm": 10.0625, "grad_norm_var": 27.510107421875, "learning_rate": 5e-05, "loss": 0.341, "loss/crossentropy": 2.072801113128662, "loss/dist_ce": 0.0, "loss/hidden": 0.275390625, "loss/idx": 0.0, "loss/logits": 0.0656304880976677, "step": 89 }, { "epoch": 0.0007417358266645788, "grad_norm": 3.46875, "grad_norm_var": 26.40260009765625, "learning_rate": 5e-05, "loss": 0.3528, "loss/crossentropy": 1.6771039962768555, "loss/dist_ce": 0.0, "loss/hidden": 0.29296875, "loss/idx": 0.0, "loss/logits": 0.059821829199790955, "step": 90 }, { "epoch": 0.0007499773358497408, "grad_norm": 2.640625, "grad_norm_var": 26.648696899414062, "learning_rate": 5e-05, "loss": 0.2959, "loss/crossentropy": 1.3347995281219482, "loss/dist_ce": 0.0, "loss/hidden": 0.24609375, "loss/idx": 0.0, "loss/logits": 0.049777351319789886, "step": 91 }, { "epoch": 0.0007582188450349028, "grad_norm": 5.21875, "grad_norm_var": 26.492967732747395, "learning_rate": 5e-05, "loss": 0.4277, "loss/crossentropy": 2.2866933345794678, "loss/dist_ce": 0.0, "loss/hidden": 0.34765625, "loss/idx": 0.0, "loss/logits": 0.07999749481678009, "step": 92 }, { "epoch": 0.0007664603542200648, "grad_norm": 20.375, "grad_norm_var": 37.763719685872395, "learning_rate": 5e-05, "loss": 0.592, "loss/crossentropy": 0.5346123576164246, "loss/dist_ce": 0.0, "loss/hidden": 0.52734375, "loss/idx": 0.0, "loss/logits": 0.06467436254024506, "step": 93 }, { "epoch": 0.0007747018634052268, "grad_norm": 5.28125, "grad_norm_var": 36.951952107747395, "learning_rate": 5e-05, "loss": 0.5485, "loss/crossentropy": 1.4174734354019165, "loss/dist_ce": 0.0, "loss/hidden": 0.46484375, "loss/idx": 0.0, "loss/logits": 0.08362259715795517, "step": 94 }, { "epoch": 0.0007829433725903888, "grad_norm": 3.5, "grad_norm_var": 37.093912760416664, "learning_rate": 5e-05, "loss": 0.3081, "loss/crossentropy": 1.5907094478607178, "loss/dist_ce": 0.0, "loss/hidden": 0.25390625, "loss/idx": 0.0, "loss/logits": 0.05422712862491608, "step": 95 }, { "epoch": 0.0007911848817755507, "grad_norm": 3.546875, "grad_norm_var": 36.93508707682292, "learning_rate": 5e-05, "loss": 0.3014, "loss/crossentropy": 1.4840497970581055, "loss/dist_ce": 0.0, "loss/hidden": 0.25, "loss/idx": 0.0, "loss/logits": 0.05143030732870102, "step": 96 }, { "epoch": 0.0007994263909607127, "grad_norm": 6.03125, "grad_norm_var": 35.73922526041667, "learning_rate": 5e-05, "loss": 0.5235, "loss/crossentropy": 1.9094655513763428, "loss/dist_ce": 0.0, "loss/hidden": 0.416015625, "loss/idx": 0.0, "loss/logits": 0.10748349130153656, "step": 97 }, { "epoch": 0.0008076679001458747, "grad_norm": 5.53125, "grad_norm_var": 18.19996337890625, "learning_rate": 5e-05, "loss": 0.3909, "loss/crossentropy": 1.4463390111923218, "loss/dist_ce": 0.0, "loss/hidden": 0.328125, "loss/idx": 0.0, "loss/logits": 0.06276652216911316, "step": 98 }, { "epoch": 0.0008159094093310367, "grad_norm": 7.03125, "grad_norm_var": 18.214937337239583, "learning_rate": 5e-05, "loss": 0.5097, "loss/crossentropy": 2.1369168758392334, "loss/dist_ce": 0.0, "loss/hidden": 0.3984375, "loss/idx": 0.0, "loss/logits": 0.11129927635192871, "step": 99 }, { "epoch": 0.0008241509185161987, "grad_norm": 2.28125, "grad_norm_var": 19.135965983072918, "learning_rate": 5e-05, "loss": 0.2675, "loss/crossentropy": 1.7142083644866943, "loss/dist_ce": 0.0, "loss/hidden": 0.22265625, "loss/idx": 0.0, "loss/logits": 0.044855352491140366, "step": 100 }, { "epoch": 0.0008323924277013607, "grad_norm": 5.25, "grad_norm_var": 18.965132649739584, "learning_rate": 5e-05, "loss": 0.3094, "loss/crossentropy": 0.9302163124084473, "loss/dist_ce": 0.0, "loss/hidden": 0.267578125, "loss/idx": 0.0, "loss/logits": 0.041776590049266815, "step": 101 }, { "epoch": 0.0008406339368865227, "grad_norm": 9.9375, "grad_norm_var": 19.911995442708335, "learning_rate": 5e-05, "loss": 0.4483, "loss/crossentropy": 2.0367283821105957, "loss/dist_ce": 0.0, "loss/hidden": 0.36328125, "loss/idx": 0.0, "loss/logits": 0.08506779372692108, "step": 102 }, { "epoch": 0.0008488754460716846, "grad_norm": 2.984375, "grad_norm_var": 19.65354715983073, "learning_rate": 5e-05, "loss": 0.3074, "loss/crossentropy": 2.5578417778015137, "loss/dist_ce": 0.0, "loss/hidden": 0.25, "loss/idx": 0.0, "loss/logits": 0.05742755904793739, "step": 103 }, { "epoch": 0.0008571169552568466, "grad_norm": 3.96875, "grad_norm_var": 19.903644816080728, "learning_rate": 5e-05, "loss": 0.3362, "loss/crossentropy": 1.235908031463623, "loss/dist_ce": 0.0, "loss/hidden": 0.28515625, "loss/idx": 0.0, "loss/logits": 0.05100230872631073, "step": 104 }, { "epoch": 0.0008653584644420086, "grad_norm": 13.1875, "grad_norm_var": 22.177814737955728, "learning_rate": 5e-05, "loss": 0.5365, "loss/crossentropy": 1.3280326128005981, "loss/dist_ce": 0.0, "loss/hidden": 0.4765625, "loss/idx": 0.0, "loss/logits": 0.05990615114569664, "step": 105 }, { "epoch": 0.0008735999736271706, "grad_norm": 3.640625, "grad_norm_var": 22.11558837890625, "learning_rate": 5e-05, "loss": 0.3851, "loss/crossentropy": 2.764561414718628, "loss/dist_ce": 0.0, "loss/hidden": 0.296875, "loss/idx": 0.0, "loss/logits": 0.08821941912174225, "step": 106 }, { "epoch": 0.0008818414828123326, "grad_norm": 5.875, "grad_norm_var": 21.201919555664062, "learning_rate": 5e-05, "loss": 0.3966, "loss/crossentropy": 2.7057063579559326, "loss/dist_ce": 0.0, "loss/hidden": 0.31640625, "loss/idx": 0.0, "loss/logits": 0.08024018257856369, "step": 107 }, { "epoch": 0.0008900829919974946, "grad_norm": 3.375, "grad_norm_var": 21.723835245768228, "learning_rate": 5e-05, "loss": 0.3745, "loss/crossentropy": 1.9366930723190308, "loss/dist_ce": 0.0, "loss/hidden": 0.2890625, "loss/idx": 0.0, "loss/logits": 0.08542559295892715, "step": 108 }, { "epoch": 0.0008983245011826566, "grad_norm": 3.796875, "grad_norm_var": 7.927079264322916, "learning_rate": 5e-05, "loss": 0.3125, "loss/crossentropy": 2.499528408050537, "loss/dist_ce": 0.0, "loss/hidden": 0.25, "loss/idx": 0.0, "loss/logits": 0.062451932579278946, "step": 109 }, { "epoch": 0.0009065660103678185, "grad_norm": 4.59375, "grad_norm_var": 7.960738118489584, "learning_rate": 5e-05, "loss": 0.3585, "loss/crossentropy": 2.2361199855804443, "loss/dist_ce": 0.0, "loss/hidden": 0.28515625, "loss/idx": 0.0, "loss/logits": 0.07333969324827194, "step": 110 }, { "epoch": 0.0009148075195529805, "grad_norm": 2.671875, "grad_norm_var": 8.200495402018229, "learning_rate": 5e-05, "loss": 0.2519, "loss/crossentropy": 1.3813279867172241, "loss/dist_ce": 0.0, "loss/hidden": 0.21484375, "loss/idx": 0.0, "loss/logits": 0.03708701953291893, "step": 111 }, { "epoch": 0.0009230490287381425, "grad_norm": 6.875, "grad_norm_var": 8.145243326822916, "learning_rate": 5e-05, "loss": 0.6369, "loss/crossentropy": 0.7904279232025146, "loss/dist_ce": 0.0, "loss/hidden": 0.55859375, "loss/idx": 0.0, "loss/logits": 0.07834139466285706, "step": 112 }, { "epoch": 0.0009312905379233045, "grad_norm": 26.625, "grad_norm_var": 36.27662760416667, "learning_rate": 5e-05, "loss": 0.811, "loss/crossentropy": 1.7555081844329834, "loss/dist_ce": 0.0, "loss/hidden": 0.6171875, "loss/idx": 0.0, "loss/logits": 0.19379198551177979, "step": 113 }, { "epoch": 0.0009395320471084665, "grad_norm": 20.875, "grad_norm_var": 48.545633951822914, "learning_rate": 5e-05, "loss": 0.5722, "loss/crossentropy": 2.7940163612365723, "loss/dist_ce": 0.0, "loss/hidden": 0.455078125, "loss/idx": 0.0, "loss/logits": 0.11716997623443604, "step": 114 }, { "epoch": 0.0009477735562936285, "grad_norm": 4.1875, "grad_norm_var": 49.299153645833336, "learning_rate": 5e-05, "loss": 0.3537, "loss/crossentropy": 2.1036393642425537, "loss/dist_ce": 0.0, "loss/hidden": 0.291015625, "loss/idx": 0.0, "loss/logits": 0.06271170824766159, "step": 115 }, { "epoch": 0.0009560150654787905, "grad_norm": 2.84375, "grad_norm_var": 48.926936848958334, "learning_rate": 5e-05, "loss": 0.2983, "loss/crossentropy": 1.6935715675354004, "loss/dist_ce": 0.0, "loss/hidden": 0.25390625, "loss/idx": 0.0, "loss/logits": 0.04436497390270233, "step": 116 }, { "epoch": 0.0009642565746639524, "grad_norm": 2.1875, "grad_norm_var": 50.4494140625, "learning_rate": 5e-05, "loss": 0.2826, "loss/crossentropy": 0.9073331356048584, "loss/dist_ce": 0.0, "loss/hidden": 0.25, "loss/idx": 0.0, "loss/logits": 0.032610610127449036, "step": 117 }, { "epoch": 0.0009724980838491144, "grad_norm": 2.203125, "grad_norm_var": 51.52145080566406, "learning_rate": 5e-05, "loss": 0.1972, "loss/crossentropy": 1.5172605514526367, "loss/dist_ce": 0.0, "loss/hidden": 0.1669921875, "loss/idx": 0.0, "loss/logits": 0.030240532010793686, "step": 118 }, { "epoch": 0.0009807395930342764, "grad_norm": 3.609375, "grad_norm_var": 51.222215779622395, "learning_rate": 5e-05, "loss": 0.3043, "loss/crossentropy": 1.3851293325424194, "loss/dist_ce": 0.0, "loss/hidden": 0.2578125, "loss/idx": 0.0, "loss/logits": 0.04644084721803665, "step": 119 }, { "epoch": 0.0009889811022194385, "grad_norm": 11.5, "grad_norm_var": 51.8164784749349, "learning_rate": 5e-05, "loss": 0.3607, "loss/crossentropy": 1.786331057548523, "loss/dist_ce": 0.0, "loss/hidden": 0.30078125, "loss/idx": 0.0, "loss/logits": 0.059897445142269135, "step": 120 }, { "epoch": 0.0009972226114046004, "grad_norm": 3.421875, "grad_norm_var": 50.212398274739584, "learning_rate": 5e-05, "loss": 0.3368, "loss/crossentropy": 1.3703113794326782, "loss/dist_ce": 0.0, "loss/hidden": 0.283203125, "loss/idx": 0.0, "loss/logits": 0.05364468693733215, "step": 121 }, { "epoch": 0.0010054641205897623, "grad_norm": 5.6875, "grad_norm_var": 49.62085673014323, "learning_rate": 5e-05, "loss": 0.3796, "loss/crossentropy": 1.5540196895599365, "loss/dist_ce": 0.0, "loss/hidden": 0.31640625, "loss/idx": 0.0, "loss/logits": 0.06318466365337372, "step": 122 }, { "epoch": 0.0010137056297749244, "grad_norm": 3.15625, "grad_norm_var": 50.45276590983073, "learning_rate": 5e-05, "loss": 0.2844, "loss/crossentropy": 2.5752596855163574, "loss/dist_ce": 0.0, "loss/hidden": 0.224609375, "loss/idx": 0.0, "loss/logits": 0.05982211232185364, "step": 123 }, { "epoch": 0.0010219471389600863, "grad_norm": 7.9375, "grad_norm_var": 49.715518188476565, "learning_rate": 5e-05, "loss": 0.6157, "loss/crossentropy": 2.424745798110962, "loss/dist_ce": 0.0, "loss/hidden": 0.47265625, "loss/idx": 0.0, "loss/logits": 0.14306676387786865, "step": 124 }, { "epoch": 0.0010301886481452484, "grad_norm": 13.0625, "grad_norm_var": 51.110791015625, "learning_rate": 5e-05, "loss": 0.3906, "loss/crossentropy": 1.6216858625411987, "loss/dist_ce": 0.0, "loss/hidden": 0.32421875, "loss/idx": 0.0, "loss/logits": 0.06637328118085861, "step": 125 }, { "epoch": 0.0010384301573304103, "grad_norm": 10.3125, "grad_norm_var": 50.87027587890625, "learning_rate": 5e-05, "loss": 0.4044, "loss/crossentropy": 2.009226083755493, "loss/dist_ce": 0.0, "loss/hidden": 0.32421875, "loss/idx": 0.0, "loss/logits": 0.08017371594905853, "step": 126 }, { "epoch": 0.0010466716665155724, "grad_norm": 2.734375, "grad_norm_var": 50.826558430989586, "learning_rate": 5e-05, "loss": 0.2593, "loss/crossentropy": 0.31721383333206177, "loss/dist_ce": 0.0, "loss/hidden": 0.240234375, "loss/idx": 0.0, "loss/logits": 0.019071679562330246, "step": 127 }, { "epoch": 0.0010549131757007343, "grad_norm": 9.0625, "grad_norm_var": 50.811747233072914, "learning_rate": 5e-05, "loss": 0.4044, "loss/crossentropy": 1.5959731340408325, "loss/dist_ce": 0.0, "loss/hidden": 0.3359375, "loss/idx": 0.0, "loss/logits": 0.06845791637897491, "step": 128 }, { "epoch": 0.0010631546848858962, "grad_norm": 6.53125, "grad_norm_var": 26.382666015625, "learning_rate": 5e-05, "loss": 0.404, "loss/crossentropy": 1.000110387802124, "loss/dist_ce": 0.0, "loss/hidden": 0.345703125, "loss/idx": 0.0, "loss/logits": 0.0582566112279892, "step": 129 }, { "epoch": 0.0010713961940710583, "grad_norm": 13.375, "grad_norm_var": 15.855322265625, "learning_rate": 5e-05, "loss": 0.4836, "loss/crossentropy": 1.4723174571990967, "loss/dist_ce": 0.0, "loss/hidden": 0.41015625, "loss/idx": 0.0, "loss/logits": 0.07343296706676483, "step": 130 }, { "epoch": 0.0010796377032562202, "grad_norm": 1.8359375, "grad_norm_var": 16.883135732014974, "learning_rate": 5e-05, "loss": 0.2167, "loss/crossentropy": 1.884359359741211, "loss/dist_ce": 0.0, "loss/hidden": 0.17578125, "loss/idx": 0.0, "loss/logits": 0.04091912880539894, "step": 131 }, { "epoch": 0.0010878792124413823, "grad_norm": 9.0625, "grad_norm_var": 16.503775787353515, "learning_rate": 5e-05, "loss": 0.335, "loss/crossentropy": 2.47468638420105, "loss/dist_ce": 0.0, "loss/hidden": 0.265625, "loss/idx": 0.0, "loss/logits": 0.06939947605133057, "step": 132 }, { "epoch": 0.0010961207216265442, "grad_norm": 2.28125, "grad_norm_var": 16.44910659790039, "learning_rate": 5e-05, "loss": 0.2479, "loss/crossentropy": 1.4188505411148071, "loss/dist_ce": 0.0, "loss/hidden": 0.208984375, "loss/idx": 0.0, "loss/logits": 0.03889765217900276, "step": 133 }, { "epoch": 0.0011043622308117063, "grad_norm": 3.453125, "grad_norm_var": 15.812143707275391, "learning_rate": 5e-05, "loss": 0.3702, "loss/crossentropy": 2.618537425994873, "loss/dist_ce": 0.0, "loss/hidden": 0.27734375, "loss/idx": 0.0, "loss/logits": 0.09288428723812103, "step": 134 }, { "epoch": 0.0011126037399968682, "grad_norm": 14.5, "grad_norm_var": 18.753179677327473, "learning_rate": 5e-05, "loss": 0.6568, "loss/crossentropy": 3.4983789920806885, "loss/dist_ce": 0.0, "loss/hidden": 0.5078125, "loss/idx": 0.0, "loss/logits": 0.14897578954696655, "step": 135 }, { "epoch": 0.0011208452491820301, "grad_norm": 3.078125, "grad_norm_var": 18.548115793863932, "learning_rate": 5e-05, "loss": 0.2164, "loss/crossentropy": 0.7671207189559937, "loss/dist_ce": 0.0, "loss/hidden": 0.1875, "loss/idx": 0.0, "loss/logits": 0.028927473351359367, "step": 136 }, { "epoch": 0.0011290867583671922, "grad_norm": 3.015625, "grad_norm_var": 18.743755849202476, "learning_rate": 5e-05, "loss": 0.2917, "loss/crossentropy": 1.4987382888793945, "loss/dist_ce": 0.0, "loss/hidden": 0.240234375, "loss/idx": 0.0, "loss/logits": 0.051444459706544876, "step": 137 }, { "epoch": 0.0011373282675523541, "grad_norm": 2.515625, "grad_norm_var": 19.85060806274414, "learning_rate": 5e-05, "loss": 0.2295, "loss/crossentropy": 1.1994467973709106, "loss/dist_ce": 0.0, "loss/hidden": 0.1982421875, "loss/idx": 0.0, "loss/logits": 0.03128223866224289, "step": 138 }, { "epoch": 0.0011455697767375162, "grad_norm": 11.625, "grad_norm_var": 20.42235895792643, "learning_rate": 5e-05, "loss": 0.3444, "loss/crossentropy": 1.7283226251602173, "loss/dist_ce": 0.0, "loss/hidden": 0.287109375, "loss/idx": 0.0, "loss/logits": 0.05732431262731552, "step": 139 }, { "epoch": 0.0011538112859226781, "grad_norm": 4.09375, "grad_norm_var": 20.941615549723306, "learning_rate": 5e-05, "loss": 0.3394, "loss/crossentropy": 2.3807051181793213, "loss/dist_ce": 0.0, "loss/hidden": 0.26171875, "loss/idx": 0.0, "loss/logits": 0.07771667838096619, "step": 140 }, { "epoch": 0.0011620527951078403, "grad_norm": 4.4375, "grad_norm_var": 18.514149729410807, "learning_rate": 5e-05, "loss": 0.4546, "loss/crossentropy": 3.182520866394043, "loss/dist_ce": 0.0, "loss/hidden": 0.34765625, "loss/idx": 0.0, "loss/logits": 0.10692334175109863, "step": 141 }, { "epoch": 0.0011702943042930021, "grad_norm": 4.34375, "grad_norm_var": 17.60290501912435, "learning_rate": 5e-05, "loss": 0.2908, "loss/crossentropy": 1.5368177890777588, "loss/dist_ce": 0.0, "loss/hidden": 0.2431640625, "loss/idx": 0.0, "loss/logits": 0.04762953519821167, "step": 142 }, { "epoch": 0.001178535813478164, "grad_norm": 4.5, "grad_norm_var": 17.029766591389976, "learning_rate": 5e-05, "loss": 0.407, "loss/crossentropy": 2.778043270111084, "loss/dist_ce": 0.0, "loss/hidden": 0.302734375, "loss/idx": 0.0, "loss/logits": 0.1042385995388031, "step": 143 }, { "epoch": 0.0011867773226633262, "grad_norm": 4.15625, "grad_norm_var": 16.600789133707682, "learning_rate": 5e-05, "loss": 0.3435, "loss/crossentropy": 2.792048692703247, "loss/dist_ce": 0.0, "loss/hidden": 0.267578125, "loss/idx": 0.0, "loss/logits": 0.07588605582714081, "step": 144 }, { "epoch": 0.001195018831848488, "grad_norm": 3.8125, "grad_norm_var": 16.797792307535808, "learning_rate": 5e-05, "loss": 0.2385, "loss/crossentropy": 1.401113510131836, "loss/dist_ce": 0.0, "loss/hidden": 0.205078125, "loss/idx": 0.0, "loss/logits": 0.03345421701669693, "step": 145 }, { "epoch": 0.0012032603410336502, "grad_norm": 6.875, "grad_norm_var": 12.726405588785807, "learning_rate": 5e-05, "loss": 0.3725, "loss/crossentropy": 2.2165474891662598, "loss/dist_ce": 0.0, "loss/hidden": 0.296875, "loss/idx": 0.0, "loss/logits": 0.07561925053596497, "step": 146 }, { "epoch": 0.001211501850218812, "grad_norm": 3.171875, "grad_norm_var": 12.234430948893229, "learning_rate": 5e-05, "loss": 0.2506, "loss/crossentropy": 2.589618444442749, "loss/dist_ce": 0.0, "loss/hidden": 0.19921875, "loss/idx": 0.0, "loss/logits": 0.05137525126338005, "step": 147 }, { "epoch": 0.0012197433594039742, "grad_norm": 22.25, "grad_norm_var": 29.706151326497395, "learning_rate": 5e-05, "loss": 0.542, "loss/crossentropy": 1.4461145401000977, "loss/dist_ce": 0.0, "loss/hidden": 0.47265625, "loss/idx": 0.0, "loss/logits": 0.06938936561346054, "step": 148 }, { "epoch": 0.001227984868589136, "grad_norm": 4.75, "grad_norm_var": 28.819587198893228, "learning_rate": 5e-05, "loss": 0.3594, "loss/crossentropy": 1.5630475282669067, "loss/dist_ce": 0.0, "loss/hidden": 0.28125, "loss/idx": 0.0, "loss/logits": 0.07812213897705078, "step": 149 }, { "epoch": 0.001236226377774298, "grad_norm": 3.09375, "grad_norm_var": 28.963407389322917, "learning_rate": 5e-05, "loss": 0.2805, "loss/crossentropy": 1.3344874382019043, "loss/dist_ce": 0.0, "loss/hidden": 0.234375, "loss/idx": 0.0, "loss/logits": 0.04612912982702255, "step": 150 }, { "epoch": 0.00124446788695946, "grad_norm": 6.34375, "grad_norm_var": 24.164176432291665, "learning_rate": 5e-05, "loss": 0.3487, "loss/crossentropy": 2.1057682037353516, "loss/dist_ce": 0.0, "loss/hidden": 0.28125, "loss/idx": 0.0, "loss/logits": 0.06741908937692642, "step": 151 }, { "epoch": 0.001252709396144622, "grad_norm": 3.078125, "grad_norm_var": 24.164176432291665, "learning_rate": 5e-05, "loss": 0.3282, "loss/crossentropy": 2.7360680103302, "loss/dist_ce": 0.0, "loss/hidden": 0.248046875, "loss/idx": 0.0, "loss/logits": 0.0801510438323021, "step": 152 }, { "epoch": 0.001260950905329784, "grad_norm": 3.265625, "grad_norm_var": 24.076806640625, "learning_rate": 5e-05, "loss": 0.2896, "loss/crossentropy": 2.9115164279937744, "loss/dist_ce": 0.0, "loss/hidden": 0.2265625, "loss/idx": 0.0, "loss/logits": 0.06300797313451767, "step": 153 }, { "epoch": 0.001269192414514946, "grad_norm": 3.109375, "grad_norm_var": 23.841239420572915, "learning_rate": 5e-05, "loss": 0.2801, "loss/crossentropy": 2.20858097076416, "loss/dist_ce": 0.0, "loss/hidden": 0.22265625, "loss/idx": 0.0, "loss/logits": 0.05741541087627411, "step": 154 }, { "epoch": 0.001277433923700108, "grad_norm": 3.4375, "grad_norm_var": 21.679227701822917, "learning_rate": 5e-05, "loss": 0.2616, "loss/crossentropy": 2.4968795776367188, "loss/dist_ce": 0.0, "loss/hidden": 0.205078125, "loss/idx": 0.0, "loss/logits": 0.05649275332689285, "step": 155 }, { "epoch": 0.00128567543288527, "grad_norm": 1.90625, "grad_norm_var": 22.328641764322917, "learning_rate": 5e-05, "loss": 0.2142, "loss/crossentropy": 1.6698881387710571, "loss/dist_ce": 0.0, "loss/hidden": 0.177734375, "loss/idx": 0.0, "loss/logits": 0.03645133972167969, "step": 156 }, { "epoch": 0.0012939169420704319, "grad_norm": 2.84375, "grad_norm_var": 22.640543619791668, "learning_rate": 5e-05, "loss": 0.2762, "loss/crossentropy": 2.5742998123168945, "loss/dist_ce": 0.0, "loss/hidden": 0.208984375, "loss/idx": 0.0, "loss/logits": 0.06718096137046814, "step": 157 }, { "epoch": 0.001302158451255594, "grad_norm": 13.0, "grad_norm_var": 26.498661295572916, "learning_rate": 5e-05, "loss": 0.3361, "loss/crossentropy": 2.4912259578704834, "loss/dist_ce": 0.0, "loss/hidden": 0.26953125, "loss/idx": 0.0, "loss/logits": 0.06653441488742828, "step": 158 }, { "epoch": 0.0013103999604407559, "grad_norm": 2.28125, "grad_norm_var": 27.131640625, "learning_rate": 5e-05, "loss": 0.244, "loss/crossentropy": 1.5568723678588867, "loss/dist_ce": 0.0, "loss/hidden": 0.1923828125, "loss/idx": 0.0, "loss/logits": 0.05163004621863365, "step": 159 }, { "epoch": 0.001318641469625918, "grad_norm": 2.90625, "grad_norm_var": 27.446744791666667, "learning_rate": 5e-05, "loss": 0.2633, "loss/crossentropy": 2.150268793106079, "loss/dist_ce": 0.0, "loss/hidden": 0.212890625, "loss/idx": 0.0, "loss/logits": 0.050429798662662506, "step": 160 }, { "epoch": 0.0013268829788110799, "grad_norm": 3.6875, "grad_norm_var": 27.473893229166666, "learning_rate": 5e-05, "loss": 0.2778, "loss/crossentropy": 1.7107495069503784, "loss/dist_ce": 0.0, "loss/hidden": 0.21875, "loss/idx": 0.0, "loss/logits": 0.059024274349212646, "step": 161 }, { "epoch": 0.001335124487996242, "grad_norm": 5.46875, "grad_norm_var": 27.316239420572916, "learning_rate": 5e-05, "loss": 0.3054, "loss/crossentropy": 2.609410285949707, "loss/dist_ce": 0.0, "loss/hidden": 0.236328125, "loss/idx": 0.0, "loss/logits": 0.06908264756202698, "step": 162 }, { "epoch": 0.0013433659971814039, "grad_norm": 2.1875, "grad_norm_var": 27.654426066080728, "learning_rate": 5e-05, "loss": 0.1832, "loss/crossentropy": 1.302159309387207, "loss/dist_ce": 0.0, "loss/hidden": 0.1552734375, "loss/idx": 0.0, "loss/logits": 0.027962597087025642, "step": 163 }, { "epoch": 0.0013516075063665658, "grad_norm": 4.96875, "grad_norm_var": 7.092438761393229, "learning_rate": 5e-05, "loss": 0.275, "loss/crossentropy": 1.3545722961425781, "loss/dist_ce": 0.0, "loss/hidden": 0.23828125, "loss/idx": 0.0, "loss/logits": 0.03673800453543663, "step": 164 }, { "epoch": 0.0013598490155517279, "grad_norm": 5.625, "grad_norm_var": 7.210814412434896, "learning_rate": 5e-05, "loss": 0.3255, "loss/crossentropy": 2.3857431411743164, "loss/dist_ce": 0.0, "loss/hidden": 0.267578125, "loss/idx": 0.0, "loss/logits": 0.05789117142558098, "step": 165 }, { "epoch": 0.0013680905247368898, "grad_norm": 2.3125, "grad_norm_var": 7.364216105143229, "learning_rate": 5e-05, "loss": 0.224, "loss/crossentropy": 1.6262630224227905, "loss/dist_ce": 0.0, "loss/hidden": 0.1865234375, "loss/idx": 0.0, "loss/logits": 0.03748723864555359, "step": 166 }, { "epoch": 0.0013763320339220519, "grad_norm": 3.515625, "grad_norm_var": 7.037398274739584, "learning_rate": 5e-05, "loss": 0.3005, "loss/crossentropy": 2.802839756011963, "loss/dist_ce": 0.0, "loss/hidden": 0.2421875, "loss/idx": 0.0, "loss/logits": 0.05828278884291649, "step": 167 }, { "epoch": 0.0013845735431072138, "grad_norm": 5.9375, "grad_norm_var": 7.206615193684896, "learning_rate": 5e-05, "loss": 0.3168, "loss/crossentropy": 2.6197855472564697, "loss/dist_ce": 0.0, "loss/hidden": 0.24609375, "loss/idx": 0.0, "loss/logits": 0.07069416344165802, "step": 168 }, { "epoch": 0.0013928150522923759, "grad_norm": 2.609375, "grad_norm_var": 7.311205037434896, "learning_rate": 5e-05, "loss": 0.2188, "loss/crossentropy": 2.0093469619750977, "loss/dist_ce": 0.0, "loss/hidden": 0.177734375, "loss/idx": 0.0, "loss/logits": 0.04102495685219765, "step": 169 }, { "epoch": 0.0014010565614775378, "grad_norm": 9.0625, "grad_norm_var": 8.730110677083333, "learning_rate": 5e-05, "loss": 0.4723, "loss/crossentropy": 0.45764070749282837, "loss/dist_ce": 0.0, "loss/hidden": 0.421875, "loss/idx": 0.0, "loss/logits": 0.05042431876063347, "step": 170 }, { "epoch": 0.0014092980706626997, "grad_norm": 10.6875, "grad_norm_var": 11.003287760416667, "learning_rate": 5e-05, "loss": 0.2967, "loss/crossentropy": 1.4506618976593018, "loss/dist_ce": 0.0, "loss/hidden": 0.25, "loss/idx": 0.0, "loss/logits": 0.04666414484381676, "step": 171 }, { "epoch": 0.0014175395798478618, "grad_norm": 2.734375, "grad_norm_var": 10.711449178059896, "learning_rate": 5e-05, "loss": 0.1833, "loss/crossentropy": 1.2239363193511963, "loss/dist_ce": 0.0, "loss/hidden": 0.1494140625, "loss/idx": 0.0, "loss/logits": 0.03392494469881058, "step": 172 }, { "epoch": 0.0014257810890330237, "grad_norm": 9.25, "grad_norm_var": 11.44383036295573, "learning_rate": 5e-05, "loss": 0.4527, "loss/crossentropy": 2.3572747707366943, "loss/dist_ce": 0.0, "loss/hidden": 0.365234375, "loss/idx": 0.0, "loss/logits": 0.08745455741882324, "step": 173 }, { "epoch": 0.0014340225982181858, "grad_norm": 3.328125, "grad_norm_var": 7.476220703125, "learning_rate": 5e-05, "loss": 0.3071, "loss/crossentropy": 2.6207613945007324, "loss/dist_ce": 0.0, "loss/hidden": 0.23828125, "loss/idx": 0.0, "loss/logits": 0.06882129609584808, "step": 174 }, { "epoch": 0.0014422641074033477, "grad_norm": 12.8125, "grad_norm_var": 10.892020670572917, "learning_rate": 5e-05, "loss": 0.3909, "loss/crossentropy": 2.3904902935028076, "loss/dist_ce": 0.0, "loss/hidden": 0.3046875, "loss/idx": 0.0, "loss/logits": 0.08622868359088898, "step": 175 }, { "epoch": 0.0014505056165885098, "grad_norm": 6.78125, "grad_norm_var": 10.519657389322917, "learning_rate": 5e-05, "loss": 0.4847, "loss/crossentropy": 2.4582583904266357, "loss/dist_ce": 0.0, "loss/hidden": 0.40625, "loss/idx": 0.0, "loss/logits": 0.07843705266714096, "step": 176 }, { "epoch": 0.0014587471257736717, "grad_norm": 6.125, "grad_norm_var": 10.241630045572917, "learning_rate": 5e-05, "loss": 0.3267, "loss/crossentropy": 2.8331282138824463, "loss/dist_ce": 0.0, "loss/hidden": 0.248046875, "loss/idx": 0.0, "loss/logits": 0.07867051661014557, "step": 177 }, { "epoch": 0.0014669886349588336, "grad_norm": 4.21875, "grad_norm_var": 10.400809733072917, "learning_rate": 5e-05, "loss": 0.4616, "loss/crossentropy": 2.921239137649536, "loss/dist_ce": 0.0, "loss/hidden": 0.35546875, "loss/idx": 0.0, "loss/logits": 0.10613523423671722, "step": 178 }, { "epoch": 0.0014752301441439957, "grad_norm": 2.96875, "grad_norm_var": 10.066845703125, "learning_rate": 5e-05, "loss": 0.2874, "loss/crossentropy": 2.7095119953155518, "loss/dist_ce": 0.0, "loss/hidden": 0.2333984375, "loss/idx": 0.0, "loss/logits": 0.05400337651371956, "step": 179 }, { "epoch": 0.0014834716533291576, "grad_norm": 2.71875, "grad_norm_var": 10.635205078125, "learning_rate": 5e-05, "loss": 0.3012, "loss/crossentropy": 2.5020012855529785, "loss/dist_ce": 0.0, "loss/hidden": 0.23046875, "loss/idx": 0.0, "loss/logits": 0.07070466130971909, "step": 180 }, { "epoch": 0.0014917131625143197, "grad_norm": 2.375, "grad_norm_var": 11.313981119791666, "learning_rate": 5e-05, "loss": 0.2055, "loss/crossentropy": 1.2848535776138306, "loss/dist_ce": 0.0, "loss/hidden": 0.1767578125, "loss/idx": 0.0, "loss/logits": 0.02870912477374077, "step": 181 }, { "epoch": 0.0014999546716994816, "grad_norm": 4.4375, "grad_norm_var": 10.703043619791666, "learning_rate": 5e-05, "loss": 0.2174, "loss/crossentropy": 0.40366628766059875, "loss/dist_ce": 0.0, "loss/hidden": 0.201171875, "loss/idx": 0.0, "loss/logits": 0.016210440546274185, "step": 182 }, { "epoch": 0.0015081961808846437, "grad_norm": 6.28125, "grad_norm_var": 10.41333719889323, "learning_rate": 5e-05, "loss": 0.3403, "loss/crossentropy": 2.8000519275665283, "loss/dist_ce": 0.0, "loss/hidden": 0.263671875, "loss/idx": 0.0, "loss/logits": 0.07665810734033585, "step": 183 }, { "epoch": 0.0015164376900698056, "grad_norm": 2.15625, "grad_norm_var": 11.22276102701823, "learning_rate": 5e-05, "loss": 0.2103, "loss/crossentropy": 1.5652306079864502, "loss/dist_ce": 0.0, "loss/hidden": 0.1708984375, "loss/idx": 0.0, "loss/logits": 0.039365194737911224, "step": 184 }, { "epoch": 0.0015246791992549675, "grad_norm": 50.5, "grad_norm_var": 135.891162109375, "learning_rate": 5e-05, "loss": 0.5337, "loss/crossentropy": 1.4967753887176514, "loss/dist_ce": 0.0, "loss/hidden": 0.4453125, "loss/idx": 0.0, "loss/logits": 0.08833958208560944, "step": 185 }, { "epoch": 0.0015329207084401296, "grad_norm": 6.53125, "grad_norm_var": 136.11099853515626, "learning_rate": 5e-05, "loss": 0.3785, "loss/crossentropy": 2.4065871238708496, "loss/dist_ce": 0.0, "loss/hidden": 0.2890625, "loss/idx": 0.0, "loss/logits": 0.08948490023612976, "step": 186 }, { "epoch": 0.0015411622176252915, "grad_norm": 6.125, "grad_norm_var": 136.0016886393229, "learning_rate": 5e-05, "loss": 0.315, "loss/crossentropy": 2.2277615070343018, "loss/dist_ce": 0.0, "loss/hidden": 0.244140625, "loss/idx": 0.0, "loss/logits": 0.07082026451826096, "step": 187 }, { "epoch": 0.0015494037268104536, "grad_norm": 2.578125, "grad_norm_var": 136.11466471354166, "learning_rate": 5e-05, "loss": 0.2275, "loss/crossentropy": 2.0500736236572266, "loss/dist_ce": 0.0, "loss/hidden": 0.1806640625, "loss/idx": 0.0, "loss/logits": 0.04683098569512367, "step": 188 }, { "epoch": 0.0015576452359956155, "grad_norm": 2.203125, "grad_norm_var": 138.1135732014974, "learning_rate": 5e-05, "loss": 0.1763, "loss/crossentropy": 0.851466953754425, "loss/dist_ce": 0.0, "loss/hidden": 0.1552734375, "loss/idx": 0.0, "loss/logits": 0.021058566868305206, "step": 189 }, { "epoch": 0.0015658867451807776, "grad_norm": 10.9375, "grad_norm_var": 137.36402994791666, "learning_rate": 5e-05, "loss": 0.4211, "loss/crossentropy": 1.5812166929244995, "loss/dist_ce": 0.0, "loss/hidden": 0.33984375, "loss/idx": 0.0, "loss/logits": 0.08130454272031784, "step": 190 }, { "epoch": 0.0015741282543659395, "grad_norm": 5.75, "grad_norm_var": 136.052685546875, "learning_rate": 5e-05, "loss": 0.2656, "loss/crossentropy": 1.3986968994140625, "loss/dist_ce": 0.0, "loss/hidden": 0.2216796875, "loss/idx": 0.0, "loss/logits": 0.04394121095538139, "step": 191 }, { "epoch": 0.0015823697635511014, "grad_norm": 2.203125, "grad_norm_var": 137.9039052327474, "learning_rate": 5e-05, "loss": 0.2049, "loss/crossentropy": 2.949207067489624, "loss/dist_ce": 0.0, "loss/hidden": 0.1591796875, "loss/idx": 0.0, "loss/logits": 0.04576685652136803, "step": 192 }, { "epoch": 0.0015906112727362635, "grad_norm": 206.0, "grad_norm_var": 2601.2852040608723, "learning_rate": 5e-05, "loss": 1.1479, "loss/crossentropy": 1.637980580329895, "loss/dist_ce": 0.0, "loss/hidden": 1.015625, "loss/idx": 0.0, "loss/logits": 0.13231301307678223, "step": 193 }, { "epoch": 0.0015988527819214254, "grad_norm": 4.28125, "grad_norm_var": 2601.1549875895184, "learning_rate": 5e-05, "loss": 0.294, "loss/crossentropy": 2.0168232917785645, "loss/dist_ce": 0.0, "loss/hidden": 0.23828125, "loss/idx": 0.0, "loss/logits": 0.055670544505119324, "step": 194 }, { "epoch": 0.0016070942911065875, "grad_norm": 5.4375, "grad_norm_var": 2595.9699696858725, "learning_rate": 5e-05, "loss": 0.3063, "loss/crossentropy": 2.8578052520751953, "loss/dist_ce": 0.0, "loss/hidden": 0.2275390625, "loss/idx": 0.0, "loss/logits": 0.07876630872488022, "step": 195 }, { "epoch": 0.0016153358002917494, "grad_norm": 2.5, "grad_norm_var": 2596.477936808268, "learning_rate": 5e-05, "loss": 0.1944, "loss/crossentropy": 1.4366533756256104, "loss/dist_ce": 0.0, "loss/hidden": 0.166015625, "loss/idx": 0.0, "loss/logits": 0.028374146670103073, "step": 196 }, { "epoch": 0.0016235773094769115, "grad_norm": 2.09375, "grad_norm_var": 2597.144513956706, "learning_rate": 5e-05, "loss": 0.2272, "loss/crossentropy": 1.4764188528060913, "loss/dist_ce": 0.0, "loss/hidden": 0.1845703125, "loss/idx": 0.0, "loss/logits": 0.04259010776877403, "step": 197 }, { "epoch": 0.0016318188186620734, "grad_norm": 8.25, "grad_norm_var": 2590.14152730306, "learning_rate": 5e-05, "loss": 0.2137, "loss/crossentropy": 0.44381940364837646, "loss/dist_ce": 0.0, "loss/hidden": 0.1884765625, "loss/idx": 0.0, "loss/logits": 0.025243356823921204, "step": 198 }, { "epoch": 0.0016400603278472353, "grad_norm": 3.703125, "grad_norm_var": 2595.355013020833, "learning_rate": 5e-05, "loss": 0.2982, "loss/crossentropy": 2.862804889678955, "loss/dist_ce": 0.0, "loss/hidden": 0.23046875, "loss/idx": 0.0, "loss/logits": 0.06769528239965439, "step": 199 }, { "epoch": 0.0016483018370323974, "grad_norm": 30.625, "grad_norm_var": 2577.980920410156, "learning_rate": 5e-05, "loss": 0.5157, "loss/crossentropy": 2.726966142654419, "loss/dist_ce": 0.0, "loss/hidden": 0.4296875, "loss/idx": 0.0, "loss/logits": 0.08603046834468842, "step": 200 }, { "epoch": 0.0016565433462175593, "grad_norm": 8.4375, "grad_norm_var": 2527.9221638997396, "learning_rate": 5e-05, "loss": 0.4416, "loss/crossentropy": 1.4357587099075317, "loss/dist_ce": 0.0, "loss/hidden": 0.37109375, "loss/idx": 0.0, "loss/logits": 0.07054366171360016, "step": 201 }, { "epoch": 0.0016647848554027214, "grad_norm": 3.515625, "grad_norm_var": 2533.595897420247, "learning_rate": 5e-05, "loss": 0.2845, "loss/crossentropy": 2.2265231609344482, "loss/dist_ce": 0.0, "loss/hidden": 0.21875, "loss/idx": 0.0, "loss/logits": 0.06571735441684723, "step": 202 }, { "epoch": 0.0016730263645878833, "grad_norm": 8.1875, "grad_norm_var": 2530.3101308186847, "learning_rate": 5e-05, "loss": 0.3526, "loss/crossentropy": 2.462019681930542, "loss/dist_ce": 0.0, "loss/hidden": 0.28125, "loss/idx": 0.0, "loss/logits": 0.0713062509894371, "step": 203 }, { "epoch": 0.0016812678737730454, "grad_norm": 2.046875, "grad_norm_var": 2531.50295308431, "learning_rate": 5e-05, "loss": 0.2505, "loss/crossentropy": 2.9555587768554688, "loss/dist_ce": 0.0, "loss/hidden": 0.1923828125, "loss/idx": 0.0, "loss/logits": 0.05816446244716644, "step": 204 }, { "epoch": 0.0016895093829582073, "grad_norm": 4.3125, "grad_norm_var": 2527.0187459309896, "learning_rate": 5e-05, "loss": 0.3139, "loss/crossentropy": 1.3578487634658813, "loss/dist_ce": 0.0, "loss/hidden": 0.267578125, "loss/idx": 0.0, "loss/logits": 0.04633466899394989, "step": 205 }, { "epoch": 0.0016977508921433692, "grad_norm": 5.03125, "grad_norm_var": 2535.7589192708333, "learning_rate": 5e-05, "loss": 0.2753, "loss/crossentropy": 2.804027557373047, "loss/dist_ce": 0.0, "loss/hidden": 0.2109375, "loss/idx": 0.0, "loss/logits": 0.06441053748130798, "step": 206 }, { "epoch": 0.0017059924013285313, "grad_norm": 2.859375, "grad_norm_var": 2541.3487782796224, "learning_rate": 5e-05, "loss": 0.2116, "loss/crossentropy": 1.8921546936035156, "loss/dist_ce": 0.0, "loss/hidden": 0.1767578125, "loss/idx": 0.0, "loss/logits": 0.034831516444683075, "step": 207 }, { "epoch": 0.0017142339105136932, "grad_norm": 1.765625, "grad_norm_var": 2542.324095662435, "learning_rate": 5e-05, "loss": 0.2004, "loss/crossentropy": 2.6469180583953857, "loss/dist_ce": 0.0, "loss/hidden": 0.1572265625, "loss/idx": 0.0, "loss/logits": 0.04312657564878464, "step": 208 }, { "epoch": 0.0017224754196988553, "grad_norm": 5.65625, "grad_norm_var": 47.41833394368489, "learning_rate": 5e-05, "loss": 0.3427, "loss/crossentropy": 1.994149088859558, "loss/dist_ce": 0.0, "loss/hidden": 0.27734375, "loss/idx": 0.0, "loss/logits": 0.06540031731128693, "step": 209 }, { "epoch": 0.0017307169288840172, "grad_norm": 3.671875, "grad_norm_var": 47.59491780598958, "learning_rate": 5e-05, "loss": 0.2391, "loss/crossentropy": 1.5324124097824097, "loss/dist_ce": 0.0, "loss/hidden": 0.1953125, "loss/idx": 0.0, "loss/logits": 0.04377663880586624, "step": 210 }, { "epoch": 0.0017389584380691793, "grad_norm": 7.125, "grad_norm_var": 47.61689046223958, "learning_rate": 5e-05, "loss": 0.5363, "loss/crossentropy": 2.5077903270721436, "loss/dist_ce": 0.0, "loss/hidden": 0.4453125, "loss/idx": 0.0, "loss/logits": 0.09098894894123077, "step": 211 }, { "epoch": 0.0017471999472543412, "grad_norm": 2.5, "grad_norm_var": 47.61689046223958, "learning_rate": 5e-05, "loss": 0.2245, "loss/crossentropy": 1.6434502601623535, "loss/dist_ce": 0.0, "loss/hidden": 0.181640625, "loss/idx": 0.0, "loss/logits": 0.042850345373153687, "step": 212 }, { "epoch": 0.0017554414564395031, "grad_norm": 3.0625, "grad_norm_var": 47.140462239583336, "learning_rate": 5e-05, "loss": 0.2987, "loss/crossentropy": 2.1260766983032227, "loss/dist_ce": 0.0, "loss/hidden": 0.228515625, "loss/idx": 0.0, "loss/logits": 0.07022828608751297, "step": 213 }, { "epoch": 0.0017636829656246652, "grad_norm": 2.703125, "grad_norm_var": 47.61895243326823, "learning_rate": 5e-05, "loss": 0.3036, "loss/crossentropy": 2.342567205429077, "loss/dist_ce": 0.0, "loss/hidden": 0.24609375, "loss/idx": 0.0, "loss/logits": 0.057552557438611984, "step": 214 }, { "epoch": 0.0017719244748098271, "grad_norm": 3.5625, "grad_norm_var": 47.66232096354167, "learning_rate": 5e-05, "loss": 0.2831, "loss/crossentropy": 2.4342286586761475, "loss/dist_ce": 0.0, "loss/hidden": 0.21875, "loss/idx": 0.0, "loss/logits": 0.06433624029159546, "step": 215 }, { "epoch": 0.0017801659839949892, "grad_norm": 4.59375, "grad_norm_var": 4.341304524739583, "learning_rate": 5e-05, "loss": 0.2732, "loss/crossentropy": 1.6944836378097534, "loss/dist_ce": 0.0, "loss/hidden": 0.220703125, "loss/idx": 0.0, "loss/logits": 0.05249807611107826, "step": 216 }, { "epoch": 0.0017884074931801511, "grad_norm": 3.015625, "grad_norm_var": 3.197980753580729, "learning_rate": 5e-05, "loss": 0.2028, "loss/crossentropy": 1.4322035312652588, "loss/dist_ce": 0.0, "loss/hidden": 0.173828125, "loss/idx": 0.0, "loss/logits": 0.028966199606657028, "step": 217 }, { "epoch": 0.0017966490023653132, "grad_norm": 8.1875, "grad_norm_var": 4.275614420572917, "learning_rate": 5e-05, "loss": 0.4244, "loss/crossentropy": 2.7989346981048584, "loss/dist_ce": 0.0, "loss/hidden": 0.3359375, "loss/idx": 0.0, "loss/logits": 0.08842961490154266, "step": 218 }, { "epoch": 0.0018048905115504751, "grad_norm": 2.359375, "grad_norm_var": 3.3524485270182294, "learning_rate": 5e-05, "loss": 0.1763, "loss/crossentropy": 0.49160024523735046, "loss/dist_ce": 0.0, "loss/hidden": 0.15625, "loss/idx": 0.0, "loss/logits": 0.020072361454367638, "step": 219 }, { "epoch": 0.001813132020735637, "grad_norm": 2.03125, "grad_norm_var": 3.356331380208333, "learning_rate": 5e-05, "loss": 0.1975, "loss/crossentropy": 0.9580312967300415, "loss/dist_ce": 0.0, "loss/hidden": 0.169921875, "loss/idx": 0.0, "loss/logits": 0.027562592178583145, "step": 220 }, { "epoch": 0.0018213735299207991, "grad_norm": 4.46875, "grad_norm_var": 3.366402180989583, "learning_rate": 5e-05, "loss": 0.2313, "loss/crossentropy": 2.2378625869750977, "loss/dist_ce": 0.0, "loss/hidden": 0.181640625, "loss/idx": 0.0, "loss/logits": 0.04968283697962761, "step": 221 }, { "epoch": 0.001829615039105961, "grad_norm": 2.609375, "grad_norm_var": 3.3716054280598957, "learning_rate": 5e-05, "loss": 0.2638, "loss/crossentropy": 1.2911431789398193, "loss/dist_ce": 0.0, "loss/hidden": 0.21484375, "loss/idx": 0.0, "loss/logits": 0.04894676432013512, "step": 222 }, { "epoch": 0.0018378565482911231, "grad_norm": 4.125, "grad_norm_var": 3.3196126302083333, "learning_rate": 5e-05, "loss": 0.2351, "loss/crossentropy": 2.6005423069000244, "loss/dist_ce": 0.0, "loss/hidden": 0.185546875, "loss/idx": 0.0, "loss/logits": 0.04953521490097046, "step": 223 }, { "epoch": 0.001846098057476285, "grad_norm": 2.109375, "grad_norm_var": 3.2319295247395834, "learning_rate": 5e-05, "loss": 0.1785, "loss/crossentropy": 1.635225772857666, "loss/dist_ce": 0.0, "loss/hidden": 0.1455078125, "loss/idx": 0.0, "loss/logits": 0.03304152935743332, "step": 224 }, { "epoch": 0.0018543395666614471, "grad_norm": 6.4375, "grad_norm_var": 3.457047526041667, "learning_rate": 5e-05, "loss": 0.5431, "loss/crossentropy": 2.507209062576294, "loss/dist_ce": 0.0, "loss/hidden": 0.404296875, "loss/idx": 0.0, "loss/logits": 0.13885299861431122, "step": 225 }, { "epoch": 0.001862581075846609, "grad_norm": 2.984375, "grad_norm_var": 3.508430989583333, "learning_rate": 5e-05, "loss": 0.181, "loss/crossentropy": 0.42544418573379517, "loss/dist_ce": 0.0, "loss/hidden": 0.1611328125, "loss/idx": 0.0, "loss/logits": 0.019888322800397873, "step": 226 }, { "epoch": 0.001870822585031771, "grad_norm": 3.140625, "grad_norm_var": 2.7699208577473957, "learning_rate": 5e-05, "loss": 0.2789, "loss/crossentropy": 2.700981378555298, "loss/dist_ce": 0.0, "loss/hidden": 0.2138671875, "loss/idx": 0.0, "loss/logits": 0.06499424576759338, "step": 227 }, { "epoch": 0.001879064094216933, "grad_norm": 5.3125, "grad_norm_var": 2.8449940999348957, "learning_rate": 5e-05, "loss": 0.2954, "loss/crossentropy": 1.6264232397079468, "loss/dist_ce": 0.0, "loss/hidden": 0.2451171875, "loss/idx": 0.0, "loss/logits": 0.050260186195373535, "step": 228 }, { "epoch": 0.001887305603402095, "grad_norm": 5.96875, "grad_norm_var": 3.089452107747396, "learning_rate": 5e-05, "loss": 0.1884, "loss/crossentropy": 1.3441599607467651, "loss/dist_ce": 0.0, "loss/hidden": 0.1669921875, "loss/idx": 0.0, "loss/logits": 0.021441757678985596, "step": 229 }, { "epoch": 0.001895547112587257, "grad_norm": 2.171875, "grad_norm_var": 3.197223917643229, "learning_rate": 5e-05, "loss": 0.1824, "loss/crossentropy": 0.4492271840572357, "loss/dist_ce": 0.0, "loss/hidden": 0.162109375, "loss/idx": 0.0, "loss/logits": 0.02026466839015484, "step": 230 }, { "epoch": 0.001903788621772419, "grad_norm": 3.21875, "grad_norm_var": 3.222020467122396, "learning_rate": 5e-05, "loss": 0.2551, "loss/crossentropy": 2.23905873298645, "loss/dist_ce": 0.0, "loss/hidden": 0.20703125, "loss/idx": 0.0, "loss/logits": 0.04803081601858139, "step": 231 }, { "epoch": 0.001912030130957581, "grad_norm": 2.1875, "grad_norm_var": 3.368024698893229, "learning_rate": 5e-05, "loss": 0.216, "loss/crossentropy": 1.9740031957626343, "loss/dist_ce": 0.0, "loss/hidden": 0.169921875, "loss/idx": 0.0, "loss/logits": 0.04604914411902428, "step": 232 }, { "epoch": 0.001920271640142743, "grad_norm": 2.125, "grad_norm_var": 3.5072428385416665, "learning_rate": 5e-05, "loss": 0.2327, "loss/crossentropy": 2.738755226135254, "loss/dist_ce": 0.0, "loss/hidden": 0.18359375, "loss/idx": 0.0, "loss/logits": 0.04907117411494255, "step": 233 }, { "epoch": 0.0019285131493279048, "grad_norm": 4.21875, "grad_norm_var": 2.1248982747395835, "learning_rate": 5e-05, "loss": 0.4989, "loss/crossentropy": 2.8038580417633057, "loss/dist_ce": 0.0, "loss/hidden": 0.390625, "loss/idx": 0.0, "loss/logits": 0.10827778279781342, "step": 234 }, { "epoch": 0.001936754658513067, "grad_norm": 2.84375, "grad_norm_var": 2.068040974934896, "learning_rate": 5e-05, "loss": 0.3216, "loss/crossentropy": 2.015542984008789, "loss/dist_ce": 0.0, "loss/hidden": 0.2734375, "loss/idx": 0.0, "loss/logits": 0.04818400368094444, "step": 235 }, { "epoch": 0.0019449961676982288, "grad_norm": 3.203125, "grad_norm_var": 1.9248372395833333, "learning_rate": 5e-05, "loss": 0.2473, "loss/crossentropy": 1.5457327365875244, "loss/dist_ce": 0.0, "loss/hidden": 0.205078125, "loss/idx": 0.0, "loss/logits": 0.04223756492137909, "step": 236 }, { "epoch": 0.001953237676883391, "grad_norm": 3.15625, "grad_norm_var": 1.8752766927083333, "learning_rate": 5e-05, "loss": 0.3016, "loss/crossentropy": 2.3469016551971436, "loss/dist_ce": 0.0, "loss/hidden": 0.24609375, "loss/idx": 0.0, "loss/logits": 0.05547412484884262, "step": 237 }, { "epoch": 0.001961479186068553, "grad_norm": 3.578125, "grad_norm_var": 1.8204060872395833, "learning_rate": 5e-05, "loss": 0.2486, "loss/crossentropy": 2.7459280490875244, "loss/dist_ce": 0.0, "loss/hidden": 0.197265625, "loss/idx": 0.0, "loss/logits": 0.051300592720508575, "step": 238 }, { "epoch": 0.0019697206952537147, "grad_norm": 1.9765625, "grad_norm_var": 1.9438433329264322, "learning_rate": 5e-05, "loss": 0.2187, "loss/crossentropy": 2.024442434310913, "loss/dist_ce": 0.0, "loss/hidden": 0.171875, "loss/idx": 0.0, "loss/logits": 0.04683014005422592, "step": 239 }, { "epoch": 0.001977962204438877, "grad_norm": 3.140625, "grad_norm_var": 1.8308489481608072, "learning_rate": 5e-05, "loss": 0.2406, "loss/crossentropy": 1.520363211631775, "loss/dist_ce": 0.0, "loss/hidden": 0.201171875, "loss/idx": 0.0, "loss/logits": 0.039388738572597504, "step": 240 }, { "epoch": 0.001986203713624039, "grad_norm": 2.609375, "grad_norm_var": 1.2366920471191407, "learning_rate": 5e-05, "loss": 0.1882, "loss/crossentropy": 2.6319429874420166, "loss/dist_ce": 0.0, "loss/hidden": 0.15234375, "loss/idx": 0.0, "loss/logits": 0.035828400403261185, "step": 241 }, { "epoch": 0.001994445222809201, "grad_norm": 9.3125, "grad_norm_var": 3.5240455627441407, "learning_rate": 5e-05, "loss": 0.2554, "loss/crossentropy": 1.4544413089752197, "loss/dist_ce": 0.0, "loss/hidden": 0.21875, "loss/idx": 0.0, "loss/logits": 0.036612022668123245, "step": 242 }, { "epoch": 0.0020026867319943627, "grad_norm": 2.96875, "grad_norm_var": 3.5372271219889324, "learning_rate": 5e-05, "loss": 0.2419, "loss/crossentropy": 2.548147201538086, "loss/dist_ce": 0.0, "loss/hidden": 0.1884765625, "loss/idx": 0.0, "loss/logits": 0.053470924496650696, "step": 243 }, { "epoch": 0.0020109282411795246, "grad_norm": 11.8125, "grad_norm_var": 7.640775299072265, "learning_rate": 5e-05, "loss": 0.3306, "loss/crossentropy": 0.6415009498596191, "loss/dist_ce": 0.0, "loss/hidden": 0.294921875, "loss/idx": 0.0, "loss/logits": 0.035646334290504456, "step": 244 }, { "epoch": 0.002019169750364687, "grad_norm": 3.203125, "grad_norm_var": 7.404184722900391, "learning_rate": 5e-05, "loss": 0.2558, "loss/crossentropy": 2.516376495361328, "loss/dist_ce": 0.0, "loss/hidden": 0.203125, "loss/idx": 0.0, "loss/logits": 0.052696891129016876, "step": 245 }, { "epoch": 0.002027411259549849, "grad_norm": 2.53125, "grad_norm_var": 7.3314674377441404, "learning_rate": 5e-05, "loss": 0.1816, "loss/crossentropy": 0.31695130467414856, "loss/dist_ce": 0.0, "loss/hidden": 0.166015625, "loss/idx": 0.0, "loss/logits": 0.015581747516989708, "step": 246 }, { "epoch": 0.0020356527687350108, "grad_norm": 2.515625, "grad_norm_var": 7.4243934631347654, "learning_rate": 5e-05, "loss": 0.2397, "loss/crossentropy": 1.937793493270874, "loss/dist_ce": 0.0, "loss/hidden": 0.1875, "loss/idx": 0.0, "loss/logits": 0.05216747149825096, "step": 247 }, { "epoch": 0.0020438942779201726, "grad_norm": 2.46875, "grad_norm_var": 7.367502593994141, "learning_rate": 5e-05, "loss": 0.2293, "loss/crossentropy": 2.4479126930236816, "loss/dist_ce": 0.0, "loss/hidden": 0.181640625, "loss/idx": 0.0, "loss/logits": 0.04766194522380829, "step": 248 }, { "epoch": 0.0020521357871053345, "grad_norm": 2.25, "grad_norm_var": 7.339662424723307, "learning_rate": 5e-05, "loss": 0.1789, "loss/crossentropy": 1.441886067390442, "loss/dist_ce": 0.0, "loss/hidden": 0.1484375, "loss/idx": 0.0, "loss/logits": 0.030450304970145226, "step": 249 }, { "epoch": 0.002060377296290497, "grad_norm": 6.25, "grad_norm_var": 7.694205474853516, "learning_rate": 5e-05, "loss": 0.2986, "loss/crossentropy": 2.495968818664551, "loss/dist_ce": 0.0, "loss/hidden": 0.234375, "loss/idx": 0.0, "loss/logits": 0.06425687670707703, "step": 250 }, { "epoch": 0.0020686188054756588, "grad_norm": 1.765625, "grad_norm_var": 7.931449127197266, "learning_rate": 5e-05, "loss": 0.1676, "loss/crossentropy": 1.5723477602005005, "loss/dist_ce": 0.0, "loss/hidden": 0.1416015625, "loss/idx": 0.0, "loss/logits": 0.025984089821577072, "step": 251 }, { "epoch": 0.0020768603146608207, "grad_norm": 5.3125, "grad_norm_var": 8.00752944946289, "learning_rate": 5e-05, "loss": 0.4023, "loss/crossentropy": 1.5475258827209473, "loss/dist_ce": 0.0, "loss/hidden": 0.34375, "loss/idx": 0.0, "loss/logits": 0.05857189744710922, "step": 252 }, { "epoch": 0.0020851018238459825, "grad_norm": 4.96875, "grad_norm_var": 7.996083323160807, "learning_rate": 5e-05, "loss": 0.3194, "loss/crossentropy": 2.286716938018799, "loss/dist_ce": 0.0, "loss/hidden": 0.244140625, "loss/idx": 0.0, "loss/logits": 0.07523184269666672, "step": 253 }, { "epoch": 0.002093343333031145, "grad_norm": 5.5, "grad_norm_var": 8.076161448160807, "learning_rate": 5e-05, "loss": 0.2753, "loss/crossentropy": 1.5914890766143799, "loss/dist_ce": 0.0, "loss/hidden": 0.23046875, "loss/idx": 0.0, "loss/logits": 0.044802576303482056, "step": 254 }, { "epoch": 0.0021015848422163068, "grad_norm": 3.34375, "grad_norm_var": 7.771882120768229, "learning_rate": 5e-05, "loss": 0.2477, "loss/crossentropy": 2.1449875831604004, "loss/dist_ce": 0.0, "loss/hidden": 0.1962890625, "loss/idx": 0.0, "loss/logits": 0.0514422208070755, "step": 255 }, { "epoch": 0.0021098263514014687, "grad_norm": 9.3125, "grad_norm_var": 9.1392578125, "learning_rate": 5e-05, "loss": 0.5091, "loss/crossentropy": 2.605140447616577, "loss/dist_ce": 0.0, "loss/hidden": 0.408203125, "loss/idx": 0.0, "loss/logits": 0.10090796649456024, "step": 256 }, { "epoch": 0.0021180678605866306, "grad_norm": 13.75, "grad_norm_var": 13.705028279622395, "learning_rate": 5e-05, "loss": 0.3277, "loss/crossentropy": 2.162487745285034, "loss/dist_ce": 0.0, "loss/hidden": 0.26953125, "loss/idx": 0.0, "loss/logits": 0.058200109750032425, "step": 257 }, { "epoch": 0.0021263093697717925, "grad_norm": 3.140625, "grad_norm_var": 12.910640462239583, "learning_rate": 5e-05, "loss": 0.2509, "loss/crossentropy": 2.1336512565612793, "loss/dist_ce": 0.0, "loss/hidden": 0.197265625, "loss/idx": 0.0, "loss/logits": 0.053586918860673904, "step": 258 }, { "epoch": 0.0021345508789569548, "grad_norm": 5.71875, "grad_norm_var": 12.61343994140625, "learning_rate": 5e-05, "loss": 0.2007, "loss/crossentropy": 0.37939441204071045, "loss/dist_ce": 0.0, "loss/hidden": 0.181640625, "loss/idx": 0.0, "loss/logits": 0.019016824662685394, "step": 259 }, { "epoch": 0.0021427923881421167, "grad_norm": 2.59375, "grad_norm_var": 9.846614583333333, "learning_rate": 5e-05, "loss": 0.2159, "loss/crossentropy": 1.1070560216903687, "loss/dist_ce": 0.0, "loss/hidden": 0.185546875, "loss/idx": 0.0, "loss/logits": 0.03035794384777546, "step": 260 }, { "epoch": 0.0021510338973272786, "grad_norm": 4.28125, "grad_norm_var": 9.709251912434896, "learning_rate": 5e-05, "loss": 0.2899, "loss/crossentropy": 1.604844331741333, "loss/dist_ce": 0.0, "loss/hidden": 0.2294921875, "loss/idx": 0.0, "loss/logits": 0.06044600158929825, "step": 261 }, { "epoch": 0.0021592754065124405, "grad_norm": 2.84375, "grad_norm_var": 9.623680623372396, "learning_rate": 5e-05, "loss": 0.2858, "loss/crossentropy": 2.6856131553649902, "loss/dist_ce": 0.0, "loss/hidden": 0.220703125, "loss/idx": 0.0, "loss/logits": 0.06507028639316559, "step": 262 }, { "epoch": 0.0021675169156976024, "grad_norm": 3.734375, "grad_norm_var": 9.353270467122396, "learning_rate": 5e-05, "loss": 0.3012, "loss/crossentropy": 2.49045991897583, "loss/dist_ce": 0.0, "loss/hidden": 0.24609375, "loss/idx": 0.0, "loss/logits": 0.055099453777074814, "step": 263 }, { "epoch": 0.0021757584248827647, "grad_norm": 4.65625, "grad_norm_var": 8.964476521809896, "learning_rate": 5e-05, "loss": 0.3876, "loss/crossentropy": 1.3125131130218506, "loss/dist_ce": 0.0, "loss/hidden": 0.330078125, "loss/idx": 0.0, "loss/logits": 0.057526711374521255, "step": 264 }, { "epoch": 0.0021839999340679266, "grad_norm": 11.625, "grad_norm_var": 11.065306599934896, "learning_rate": 5e-05, "loss": 0.5553, "loss/crossentropy": 2.620126485824585, "loss/dist_ce": 0.0, "loss/hidden": 0.43359375, "loss/idx": 0.0, "loss/logits": 0.12169644981622696, "step": 265 }, { "epoch": 0.0021922414432530885, "grad_norm": 2.09375, "grad_norm_var": 11.756932576497396, "learning_rate": 5e-05, "loss": 0.2092, "loss/crossentropy": 1.3833715915679932, "loss/dist_ce": 0.0, "loss/hidden": 0.173828125, "loss/idx": 0.0, "loss/logits": 0.035347893834114075, "step": 266 }, { "epoch": 0.0022004829524382504, "grad_norm": 2.5, "grad_norm_var": 11.445540364583334, "learning_rate": 5e-05, "loss": 0.1932, "loss/crossentropy": 0.3456151485443115, "loss/dist_ce": 0.0, "loss/hidden": 0.177734375, "loss/idx": 0.0, "loss/logits": 0.015461962670087814, "step": 267 }, { "epoch": 0.0022087244616234127, "grad_norm": 2.546875, "grad_norm_var": 11.932225545247396, "learning_rate": 5e-05, "loss": 0.1921, "loss/crossentropy": 2.6017909049987793, "loss/dist_ce": 0.0, "loss/hidden": 0.15234375, "loss/idx": 0.0, "loss/logits": 0.03972596302628517, "step": 268 }, { "epoch": 0.0022169659708085746, "grad_norm": 14.4375, "grad_norm_var": 17.290453084309895, "learning_rate": 5e-05, "loss": 0.368, "loss/crossentropy": 2.35398006439209, "loss/dist_ce": 0.0, "loss/hidden": 0.2890625, "loss/idx": 0.0, "loss/logits": 0.0789838507771492, "step": 269 }, { "epoch": 0.0022252074799937365, "grad_norm": 3.53125, "grad_norm_var": 17.599608357747396, "learning_rate": 5e-05, "loss": 0.2167, "loss/crossentropy": 2.4693641662597656, "loss/dist_ce": 0.0, "loss/hidden": 0.166015625, "loss/idx": 0.0, "loss/logits": 0.05065637826919556, "step": 270 }, { "epoch": 0.0022334489891788984, "grad_norm": 2.515625, "grad_norm_var": 17.895113118489583, "learning_rate": 5e-05, "loss": 0.1815, "loss/crossentropy": 1.3819841146469116, "loss/dist_ce": 0.0, "loss/hidden": 0.154296875, "loss/idx": 0.0, "loss/logits": 0.027211952954530716, "step": 271 }, { "epoch": 0.0022416904983640603, "grad_norm": 7.0, "grad_norm_var": 17.078511555989582, "learning_rate": 5e-05, "loss": 0.1861, "loss/crossentropy": 1.4722107648849487, "loss/dist_ce": 0.0, "loss/hidden": 0.1572265625, "loss/idx": 0.0, "loss/logits": 0.028832225129008293, "step": 272 }, { "epoch": 0.0022499320075492226, "grad_norm": 4.21875, "grad_norm_var": 12.190022786458334, "learning_rate": 5e-05, "loss": 0.3099, "loss/crossentropy": 1.6392327547073364, "loss/dist_ce": 0.0, "loss/hidden": 0.251953125, "loss/idx": 0.0, "loss/logits": 0.05795075744390488, "step": 273 }, { "epoch": 0.0022581735167343845, "grad_norm": 5.21875, "grad_norm_var": 11.989110310872396, "learning_rate": 5e-05, "loss": 0.4204, "loss/crossentropy": 2.4640941619873047, "loss/dist_ce": 0.0, "loss/hidden": 0.34765625, "loss/idx": 0.0, "loss/logits": 0.0727241188287735, "step": 274 }, { "epoch": 0.0022664150259195464, "grad_norm": 3.21875, "grad_norm_var": 12.130060831705729, "learning_rate": 5e-05, "loss": 0.2795, "loss/crossentropy": 1.452579140663147, "loss/dist_ce": 0.0, "loss/hidden": 0.2314453125, "loss/idx": 0.0, "loss/logits": 0.048067688941955566, "step": 275 }, { "epoch": 0.0022746565351047083, "grad_norm": 6.9375, "grad_norm_var": 12.023729451497395, "learning_rate": 5e-05, "loss": 0.2885, "loss/crossentropy": 1.5026805400848389, "loss/dist_ce": 0.0, "loss/hidden": 0.23828125, "loss/idx": 0.0, "loss/logits": 0.05024395138025284, "step": 276 }, { "epoch": 0.00228289804428987, "grad_norm": 2.875, "grad_norm_var": 12.298021443684895, "learning_rate": 5e-05, "loss": 0.1911, "loss/crossentropy": 1.6457816362380981, "loss/dist_ce": 0.0, "loss/hidden": 0.1552734375, "loss/idx": 0.0, "loss/logits": 0.03582204133272171, "step": 277 }, { "epoch": 0.0022911395534750325, "grad_norm": 2.453125, "grad_norm_var": 12.419710286458333, "learning_rate": 5e-05, "loss": 0.226, "loss/crossentropy": 2.44157338142395, "loss/dist_ce": 0.0, "loss/hidden": 0.1767578125, "loss/idx": 0.0, "loss/logits": 0.0492391511797905, "step": 278 }, { "epoch": 0.0022993810626601944, "grad_norm": 3.09375, "grad_norm_var": 12.55113016764323, "learning_rate": 5e-05, "loss": 0.285, "loss/crossentropy": 2.398951292037964, "loss/dist_ce": 0.0, "loss/hidden": 0.21484375, "loss/idx": 0.0, "loss/logits": 0.07012955844402313, "step": 279 }, { "epoch": 0.0023076225718453563, "grad_norm": 4.78125, "grad_norm_var": 12.547500610351562, "learning_rate": 5e-05, "loss": 0.3511, "loss/crossentropy": 2.1601598262786865, "loss/dist_ce": 0.0, "loss/hidden": 0.267578125, "loss/idx": 0.0, "loss/logits": 0.08354485034942627, "step": 280 }, { "epoch": 0.002315864081030518, "grad_norm": 1.8046875, "grad_norm_var": 9.82229995727539, "learning_rate": 5e-05, "loss": 0.2235, "loss/crossentropy": 1.5090404748916626, "loss/dist_ce": 0.0, "loss/hidden": 0.18359375, "loss/idx": 0.0, "loss/logits": 0.03991977125406265, "step": 281 }, { "epoch": 0.0023241055902156805, "grad_norm": 3.375, "grad_norm_var": 9.5434445699056, "learning_rate": 5e-05, "loss": 0.3108, "loss/crossentropy": 2.371715545654297, "loss/dist_ce": 0.0, "loss/hidden": 0.251953125, "loss/idx": 0.0, "loss/logits": 0.05880487337708473, "step": 282 }, { "epoch": 0.0023323470994008424, "grad_norm": 4.78125, "grad_norm_var": 9.288734690348308, "learning_rate": 5e-05, "loss": 0.3743, "loss/crossentropy": 1.7449641227722168, "loss/dist_ce": 0.0, "loss/hidden": 0.296875, "loss/idx": 0.0, "loss/logits": 0.07747267186641693, "step": 283 }, { "epoch": 0.0023405886085860043, "grad_norm": 2.34375, "grad_norm_var": 9.345546213785807, "learning_rate": 5e-05, "loss": 0.2385, "loss/crossentropy": 2.332099199295044, "loss/dist_ce": 0.0, "loss/hidden": 0.181640625, "loss/idx": 0.0, "loss/logits": 0.05684517323970795, "step": 284 }, { "epoch": 0.002348830117771166, "grad_norm": 8.875, "grad_norm_var": 3.936232248942057, "learning_rate": 5e-05, "loss": 0.2679, "loss/crossentropy": 2.6605751514434814, "loss/dist_ce": 0.0, "loss/hidden": 0.2197265625, "loss/idx": 0.0, "loss/logits": 0.04814404994249344, "step": 285 }, { "epoch": 0.002357071626956328, "grad_norm": 2.515625, "grad_norm_var": 4.089766184488933, "learning_rate": 5e-05, "loss": 0.2346, "loss/crossentropy": 2.4595489501953125, "loss/dist_ce": 0.0, "loss/hidden": 0.189453125, "loss/idx": 0.0, "loss/logits": 0.0451187826693058, "step": 286 }, { "epoch": 0.0023653131361414904, "grad_norm": 6.3125, "grad_norm_var": 4.175789133707682, "learning_rate": 5e-05, "loss": 0.4335, "loss/crossentropy": 3.0684797763824463, "loss/dist_ce": 0.0, "loss/hidden": 0.3125, "loss/idx": 0.0, "loss/logits": 0.12099675089120865, "step": 287 }, { "epoch": 0.0023735546453266523, "grad_norm": 2.046875, "grad_norm_var": 3.967474110921224, "learning_rate": 5e-05, "loss": 0.1653, "loss/crossentropy": 2.7369492053985596, "loss/dist_ce": 0.0, "loss/hidden": 0.130859375, "loss/idx": 0.0, "loss/logits": 0.03439934179186821, "step": 288 }, { "epoch": 0.002381796154511814, "grad_norm": 37.0, "grad_norm_var": 71.8541135152181, "learning_rate": 5e-05, "loss": 0.2523, "loss/crossentropy": 1.4466071128845215, "loss/dist_ce": 0.0, "loss/hidden": 0.21484375, "loss/idx": 0.0, "loss/logits": 0.037433233112096786, "step": 289 }, { "epoch": 0.002390037663696976, "grad_norm": 2.71875, "grad_norm_var": 72.5391721089681, "learning_rate": 5e-05, "loss": 0.183, "loss/crossentropy": 0.8366924524307251, "loss/dist_ce": 0.0, "loss/hidden": 0.154296875, "loss/idx": 0.0, "loss/logits": 0.028669871389865875, "step": 290 }, { "epoch": 0.002398279172882138, "grad_norm": 3.296875, "grad_norm_var": 72.5111467997233, "learning_rate": 5e-05, "loss": 0.2282, "loss/crossentropy": 2.2352423667907715, "loss/dist_ce": 0.0, "loss/hidden": 0.1865234375, "loss/idx": 0.0, "loss/logits": 0.04168039560317993, "step": 291 }, { "epoch": 0.0024065206820673003, "grad_norm": 7.25, "grad_norm_var": 72.55836766560873, "learning_rate": 5e-05, "loss": 0.5589, "loss/crossentropy": 3.05739426612854, "loss/dist_ce": 0.0, "loss/hidden": 0.45703125, "loss/idx": 0.0, "loss/logits": 0.10190241038799286, "step": 292 }, { "epoch": 0.002414762191252462, "grad_norm": 8.4375, "grad_norm_var": 72.19658788045247, "learning_rate": 5e-05, "loss": 0.337, "loss/crossentropy": 1.8930912017822266, "loss/dist_ce": 0.0, "loss/hidden": 0.29296875, "loss/idx": 0.0, "loss/logits": 0.04408019781112671, "step": 293 }, { "epoch": 0.002423003700437624, "grad_norm": 10.9375, "grad_norm_var": 72.32363255818684, "learning_rate": 5e-05, "loss": 0.2491, "loss/crossentropy": 1.5801359415054321, "loss/dist_ce": 0.0, "loss/hidden": 0.208984375, "loss/idx": 0.0, "loss/logits": 0.04009478539228439, "step": 294 }, { "epoch": 0.002431245209622786, "grad_norm": 5.09375, "grad_norm_var": 71.57246068318685, "learning_rate": 5e-05, "loss": 0.1903, "loss/crossentropy": 0.9831718802452087, "loss/dist_ce": 0.0, "loss/hidden": 0.158203125, "loss/idx": 0.0, "loss/logits": 0.032129500061273575, "step": 295 }, { "epoch": 0.0024394867188079483, "grad_norm": 2.765625, "grad_norm_var": 72.41545384724935, "learning_rate": 5e-05, "loss": 0.3458, "loss/crossentropy": 2.587148666381836, "loss/dist_ce": 0.0, "loss/hidden": 0.2470703125, "loss/idx": 0.0, "loss/logits": 0.09877443313598633, "step": 296 }, { "epoch": 0.00244772822799311, "grad_norm": 2.8125, "grad_norm_var": 71.80135091145833, "learning_rate": 5e-05, "loss": 0.2355, "loss/crossentropy": 2.281587839126587, "loss/dist_ce": 0.0, "loss/hidden": 0.1865234375, "loss/idx": 0.0, "loss/logits": 0.04900825023651123, "step": 297 }, { "epoch": 0.002455969737178272, "grad_norm": 2.171875, "grad_norm_var": 72.45891825358073, "learning_rate": 5e-05, "loss": 0.1823, "loss/crossentropy": 1.4672614336013794, "loss/dist_ce": 0.0, "loss/hidden": 0.1552734375, "loss/idx": 0.0, "loss/logits": 0.02706265263259411, "step": 298 }, { "epoch": 0.002464211246363434, "grad_norm": 2.953125, "grad_norm_var": 73.16838785807292, "learning_rate": 5e-05, "loss": 0.2346, "loss/crossentropy": 2.4047231674194336, "loss/dist_ce": 0.0, "loss/hidden": 0.1865234375, "loss/idx": 0.0, "loss/logits": 0.048083603382110596, "step": 299 }, { "epoch": 0.002472452755548596, "grad_norm": 11.1875, "grad_norm_var": 72.89547526041666, "learning_rate": 5e-05, "loss": 0.3081, "loss/crossentropy": 0.815432071685791, "loss/dist_ce": 0.0, "loss/hidden": 0.279296875, "loss/idx": 0.0, "loss/logits": 0.028755802661180496, "step": 300 }, { "epoch": 0.0024806942647337582, "grad_norm": 5.34375, "grad_norm_var": 72.92076416015625, "learning_rate": 5e-05, "loss": 0.3042, "loss/crossentropy": 1.8723258972167969, "loss/dist_ce": 0.0, "loss/hidden": 0.248046875, "loss/idx": 0.0, "loss/logits": 0.05616258084774017, "step": 301 }, { "epoch": 0.00248893577391892, "grad_norm": 18.0, "grad_norm_var": 78.5388905843099, "learning_rate": 5e-05, "loss": 0.3228, "loss/crossentropy": 1.394120693206787, "loss/dist_ce": 0.0, "loss/hidden": 0.2890625, "loss/idx": 0.0, "loss/logits": 0.03374548256397247, "step": 302 }, { "epoch": 0.002497177283104082, "grad_norm": 3.984375, "grad_norm_var": 79.40784505208333, "learning_rate": 5e-05, "loss": 0.1944, "loss/crossentropy": 1.3347328901290894, "loss/dist_ce": 0.0, "loss/hidden": 0.16796875, "loss/idx": 0.0, "loss/logits": 0.02643435075879097, "step": 303 }, { "epoch": 0.002505418792289244, "grad_norm": 3.3125, "grad_norm_var": 78.5244618733724, "learning_rate": 5e-05, "loss": 0.2445, "loss/crossentropy": 1.3129806518554688, "loss/dist_ce": 0.0, "loss/hidden": 0.201171875, "loss/idx": 0.0, "loss/logits": 0.04336348548531532, "step": 304 }, { "epoch": 0.002513660301474406, "grad_norm": 2.5, "grad_norm_var": 19.303954060872396, "learning_rate": 5e-05, "loss": 0.2061, "loss/crossentropy": 1.494554042816162, "loss/dist_ce": 0.0, "loss/hidden": 0.181640625, "loss/idx": 0.0, "loss/logits": 0.02449900656938553, "step": 305 }, { "epoch": 0.002521901810659568, "grad_norm": 2.0625, "grad_norm_var": 19.600291951497397, "learning_rate": 5e-05, "loss": 0.2326, "loss/crossentropy": 2.698347806930542, "loss/dist_ce": 0.0, "loss/hidden": 0.19140625, "loss/idx": 0.0, "loss/logits": 0.04119253158569336, "step": 306 }, { "epoch": 0.00253014331984473, "grad_norm": 3.890625, "grad_norm_var": 19.427578735351563, "learning_rate": 5e-05, "loss": 0.4184, "loss/crossentropy": 1.5166016817092896, "loss/dist_ce": 0.0, "loss/hidden": 0.3515625, "loss/idx": 0.0, "loss/logits": 0.06688607484102249, "step": 307 }, { "epoch": 0.002538384829029892, "grad_norm": 7.28125, "grad_norm_var": 19.43370666503906, "learning_rate": 5e-05, "loss": 0.2909, "loss/crossentropy": 2.696192979812622, "loss/dist_ce": 0.0, "loss/hidden": 0.22265625, "loss/idx": 0.0, "loss/logits": 0.068264901638031, "step": 308 }, { "epoch": 0.002546626338215054, "grad_norm": 4.875, "grad_norm_var": 18.97215881347656, "learning_rate": 5e-05, "loss": 0.2964, "loss/crossentropy": 2.594907522201538, "loss/dist_ce": 0.0, "loss/hidden": 0.23828125, "loss/idx": 0.0, "loss/logits": 0.05810549482703209, "step": 309 }, { "epoch": 0.002554867847400216, "grad_norm": 2.59375, "grad_norm_var": 17.355557250976563, "learning_rate": 5e-05, "loss": 0.2423, "loss/crossentropy": 2.760004758834839, "loss/dist_ce": 0.0, "loss/hidden": 0.1884765625, "loss/idx": 0.0, "loss/logits": 0.05380372703075409, "step": 310 }, { "epoch": 0.002563109356585378, "grad_norm": 2.84375, "grad_norm_var": 17.659365844726562, "learning_rate": 5e-05, "loss": 0.2252, "loss/crossentropy": 1.6147583723068237, "loss/dist_ce": 0.0, "loss/hidden": 0.189453125, "loss/idx": 0.0, "loss/logits": 0.035766348242759705, "step": 311 }, { "epoch": 0.00257135086577054, "grad_norm": 1.8671875, "grad_norm_var": 17.966829172770183, "learning_rate": 5e-05, "loss": 0.1994, "loss/crossentropy": 2.396746873855591, "loss/dist_ce": 0.0, "loss/hidden": 0.1591796875, "loss/idx": 0.0, "loss/logits": 0.04022689908742905, "step": 312 }, { "epoch": 0.002579592374955702, "grad_norm": 2.71875, "grad_norm_var": 17.992909495035807, "learning_rate": 5e-05, "loss": 0.2099, "loss/crossentropy": 1.5479542016983032, "loss/dist_ce": 0.0, "loss/hidden": 0.177734375, "loss/idx": 0.0, "loss/logits": 0.03212570399045944, "step": 313 }, { "epoch": 0.0025878338841408637, "grad_norm": 1.34375, "grad_norm_var": 18.331384023030598, "learning_rate": 5e-05, "loss": 0.1331, "loss/crossentropy": 0.4866638779640198, "loss/dist_ce": 0.0, "loss/hidden": 0.1171875, "loss/idx": 0.0, "loss/logits": 0.01593683287501335, "step": 314 }, { "epoch": 0.002596075393326026, "grad_norm": 2.3125, "grad_norm_var": 18.5145627339681, "learning_rate": 5e-05, "loss": 0.2208, "loss/crossentropy": 2.5530097484588623, "loss/dist_ce": 0.0, "loss/hidden": 0.18359375, "loss/idx": 0.0, "loss/logits": 0.037243057042360306, "step": 315 }, { "epoch": 0.002604316902511188, "grad_norm": 9.3125, "grad_norm_var": 17.1267453511556, "learning_rate": 5e-05, "loss": 0.2906, "loss/crossentropy": 1.721799373626709, "loss/dist_ce": 0.0, "loss/hidden": 0.251953125, "loss/idx": 0.0, "loss/logits": 0.03867912292480469, "step": 316 }, { "epoch": 0.00261255841169635, "grad_norm": 4.21875, "grad_norm_var": 17.100304921468098, "learning_rate": 5e-05, "loss": 0.2352, "loss/crossentropy": 2.6998496055603027, "loss/dist_ce": 0.0, "loss/hidden": 0.189453125, "loss/idx": 0.0, "loss/logits": 0.04570027440786362, "step": 317 }, { "epoch": 0.0026207999208815117, "grad_norm": 2.9375, "grad_norm_var": 4.307966868082683, "learning_rate": 5e-05, "loss": 0.268, "loss/crossentropy": 2.9610462188720703, "loss/dist_ce": 0.0, "loss/hidden": 0.203125, "loss/idx": 0.0, "loss/logits": 0.06485921144485474, "step": 318 }, { "epoch": 0.0026290414300666736, "grad_norm": 14.9375, "grad_norm_var": 12.325996653238933, "learning_rate": 5e-05, "loss": 0.4275, "loss/crossentropy": 0.4579806327819824, "loss/dist_ce": 0.0, "loss/hidden": 0.369140625, "loss/idx": 0.0, "loss/logits": 0.058338165283203125, "step": 319 }, { "epoch": 0.002637282939251836, "grad_norm": 2.6875, "grad_norm_var": 12.433784739176433, "learning_rate": 5e-05, "loss": 0.2148, "loss/crossentropy": 0.9076347947120667, "loss/dist_ce": 0.0, "loss/hidden": 0.181640625, "loss/idx": 0.0, "loss/logits": 0.033187899738550186, "step": 320 }, { "epoch": 0.002645524448436998, "grad_norm": 7.5, "grad_norm_var": 12.813667551676433, "learning_rate": 5e-05, "loss": 0.2431, "loss/crossentropy": 1.4940351247787476, "loss/dist_ce": 0.0, "loss/hidden": 0.197265625, "loss/idx": 0.0, "loss/logits": 0.04587508738040924, "step": 321 }, { "epoch": 0.0026537659576221597, "grad_norm": 3.09375, "grad_norm_var": 12.533095041910807, "learning_rate": 5e-05, "loss": 0.2585, "loss/crossentropy": 2.5319809913635254, "loss/dist_ce": 0.0, "loss/hidden": 0.193359375, "loss/idx": 0.0, "loss/logits": 0.06514355540275574, "step": 322 }, { "epoch": 0.0026620074668073216, "grad_norm": 2.34375, "grad_norm_var": 12.839448801676433, "learning_rate": 5e-05, "loss": 0.1839, "loss/crossentropy": 1.5948070287704468, "loss/dist_ce": 0.0, "loss/hidden": 0.150390625, "loss/idx": 0.0, "loss/logits": 0.03354639932513237, "step": 323 }, { "epoch": 0.002670248975992484, "grad_norm": 2.109375, "grad_norm_var": 12.630688222249349, "learning_rate": 5e-05, "loss": 0.2014, "loss/crossentropy": 0.86527419090271, "loss/dist_ce": 0.0, "loss/hidden": 0.173828125, "loss/idx": 0.0, "loss/logits": 0.027525369077920914, "step": 324 }, { "epoch": 0.002678490485177646, "grad_norm": 5.4375, "grad_norm_var": 12.698766835530598, "learning_rate": 5e-05, "loss": 0.3587, "loss/crossentropy": 1.7473679780960083, "loss/dist_ce": 0.0, "loss/hidden": 0.27734375, "loss/idx": 0.0, "loss/logits": 0.081350177526474, "step": 325 }, { "epoch": 0.0026867319943628077, "grad_norm": 3.8125, "grad_norm_var": 12.519842274983723, "learning_rate": 5e-05, "loss": 0.2829, "loss/crossentropy": 2.330885410308838, "loss/dist_ce": 0.0, "loss/hidden": 0.232421875, "loss/idx": 0.0, "loss/logits": 0.05051898583769798, "step": 326 }, { "epoch": 0.0026949735035479696, "grad_norm": 3.078125, "grad_norm_var": 12.476446278889973, "learning_rate": 5e-05, "loss": 0.2309, "loss/crossentropy": 1.6335246562957764, "loss/dist_ce": 0.0, "loss/hidden": 0.19140625, "loss/idx": 0.0, "loss/logits": 0.0394761748611927, "step": 327 }, { "epoch": 0.0027032150127331315, "grad_norm": 3.53125, "grad_norm_var": 12.097102864583333, "learning_rate": 5e-05, "loss": 0.1913, "loss/crossentropy": 1.0116859674453735, "loss/dist_ce": 0.0, "loss/hidden": 0.169921875, "loss/idx": 0.0, "loss/logits": 0.021392133086919785, "step": 328 }, { "epoch": 0.002711456521918294, "grad_norm": 108.0, "grad_norm_var": 680.3999959309896, "learning_rate": 5e-05, "loss": 0.7489, "loss/crossentropy": 1.9841949939727783, "loss/dist_ce": 0.0, "loss/hidden": 0.609375, "loss/idx": 0.0, "loss/logits": 0.13949471712112427, "step": 329 }, { "epoch": 0.0027196980311034557, "grad_norm": 1.8515625, "grad_norm_var": 679.7595273335775, "learning_rate": 5e-05, "loss": 0.1969, "loss/crossentropy": 2.633579730987549, "loss/dist_ce": 0.0, "loss/hidden": 0.154296875, "loss/idx": 0.0, "loss/logits": 0.042630117386579514, "step": 330 }, { "epoch": 0.0027279395402886176, "grad_norm": 3.640625, "grad_norm_var": 678.318477121989, "learning_rate": 5e-05, "loss": 0.3044, "loss/crossentropy": 1.2868930101394653, "loss/dist_ce": 0.0, "loss/hidden": 0.25, "loss/idx": 0.0, "loss/logits": 0.054360825568437576, "step": 331 }, { "epoch": 0.0027361810494737795, "grad_norm": 2.0, "grad_norm_var": 683.4576983133952, "learning_rate": 5e-05, "loss": 0.1712, "loss/crossentropy": 1.3776507377624512, "loss/dist_ce": 0.0, "loss/hidden": 0.14453125, "loss/idx": 0.0, "loss/logits": 0.02663344331085682, "step": 332 }, { "epoch": 0.0027444225586589414, "grad_norm": 1.875, "grad_norm_var": 685.8260149637858, "learning_rate": 5e-05, "loss": 0.2246, "loss/crossentropy": 2.4259564876556396, "loss/dist_ce": 0.0, "loss/hidden": 0.177734375, "loss/idx": 0.0, "loss/logits": 0.04682992398738861, "step": 333 }, { "epoch": 0.0027526640678441038, "grad_norm": 2.484375, "grad_norm_var": 686.2989051818847, "learning_rate": 5e-05, "loss": 0.1464, "loss/crossentropy": 0.36722007393836975, "loss/dist_ce": 0.0, "loss/hidden": 0.134765625, "loss/idx": 0.0, "loss/logits": 0.011639876291155815, "step": 334 }, { "epoch": 0.0027609055770292656, "grad_norm": 2.484375, "grad_norm_var": 688.6630531311035, "learning_rate": 5e-05, "loss": 0.182, "loss/crossentropy": 1.3434722423553467, "loss/dist_ce": 0.0, "loss/hidden": 0.150390625, "loss/idx": 0.0, "loss/logits": 0.03163960948586464, "step": 335 }, { "epoch": 0.0027691470862144275, "grad_norm": 9.4375, "grad_norm_var": 685.1584144592285, "learning_rate": 5e-05, "loss": 0.5014, "loss/crossentropy": 1.7821474075317383, "loss/dist_ce": 0.0, "loss/hidden": 0.380859375, "loss/idx": 0.0, "loss/logits": 0.12050823867321014, "step": 336 }, { "epoch": 0.0027773885953995894, "grad_norm": 2.765625, "grad_norm_var": 688.2431556701661, "learning_rate": 5e-05, "loss": 0.1513, "loss/crossentropy": 2.026543378829956, "loss/dist_ce": 0.0, "loss/hidden": 0.1259765625, "loss/idx": 0.0, "loss/logits": 0.025330830365419388, "step": 337 }, { "epoch": 0.0027856301045847518, "grad_norm": 2.25, "grad_norm_var": 689.0501564025878, "learning_rate": 5e-05, "loss": 0.1708, "loss/crossentropy": 1.9071934223175049, "loss/dist_ce": 0.0, "loss/hidden": 0.13671875, "loss/idx": 0.0, "loss/logits": 0.03409082442522049, "step": 338 }, { "epoch": 0.0027938716137699137, "grad_norm": 1.75, "grad_norm_var": 689.6639686584473, "learning_rate": 5e-05, "loss": 0.1443, "loss/crossentropy": 0.5046422481536865, "loss/dist_ce": 0.0, "loss/hidden": 0.1298828125, "loss/idx": 0.0, "loss/logits": 0.014421624131500721, "step": 339 }, { "epoch": 0.0028021131229550755, "grad_norm": 2.59375, "grad_norm_var": 689.183125559489, "learning_rate": 5e-05, "loss": 0.2108, "loss/crossentropy": 1.3842849731445312, "loss/dist_ce": 0.0, "loss/hidden": 0.177734375, "loss/idx": 0.0, "loss/logits": 0.03306809440255165, "step": 340 }, { "epoch": 0.0028103546321402374, "grad_norm": 3.984375, "grad_norm_var": 690.1626604715983, "learning_rate": 5e-05, "loss": 0.3321, "loss/crossentropy": 2.6099562644958496, "loss/dist_ce": 0.0, "loss/hidden": 0.248046875, "loss/idx": 0.0, "loss/logits": 0.08409686386585236, "step": 341 }, { "epoch": 0.0028185961413253993, "grad_norm": 1.9296875, "grad_norm_var": 691.8675496419271, "learning_rate": 5e-05, "loss": 0.1953, "loss/crossentropy": 1.5371732711791992, "loss/dist_ce": 0.0, "loss/hidden": 0.1572265625, "loss/idx": 0.0, "loss/logits": 0.03804505988955498, "step": 342 }, { "epoch": 0.0028268376505105617, "grad_norm": 2.828125, "grad_norm_var": 692.0889689127604, "learning_rate": 5e-05, "loss": 0.2334, "loss/crossentropy": 2.7439146041870117, "loss/dist_ce": 0.0, "loss/hidden": 0.177734375, "loss/idx": 0.0, "loss/logits": 0.055648088455200195, "step": 343 }, { "epoch": 0.0028350791596957236, "grad_norm": 2.171875, "grad_norm_var": 693.3022288004557, "learning_rate": 5e-05, "loss": 0.1965, "loss/crossentropy": 1.5956981182098389, "loss/dist_ce": 0.0, "loss/hidden": 0.16015625, "loss/idx": 0.0, "loss/logits": 0.036296091973781586, "step": 344 }, { "epoch": 0.0028433206688808855, "grad_norm": 2.59375, "grad_norm_var": 3.4128326416015624, "learning_rate": 5e-05, "loss": 0.1868, "loss/crossentropy": 1.582602858543396, "loss/dist_ce": 0.0, "loss/hidden": 0.1533203125, "loss/idx": 0.0, "loss/logits": 0.033465512096881866, "step": 345 }, { "epoch": 0.0028515621780660473, "grad_norm": 2.234375, "grad_norm_var": 3.3677101135253906, "learning_rate": 5e-05, "loss": 0.1981, "loss/crossentropy": 1.295432209968567, "loss/dist_ce": 0.0, "loss/hidden": 0.16796875, "loss/idx": 0.0, "loss/logits": 0.030169658362865448, "step": 346 }, { "epoch": 0.0028598036872512092, "grad_norm": 2.4375, "grad_norm_var": 3.3456214904785155, "learning_rate": 5e-05, "loss": 0.2163, "loss/crossentropy": 1.2951956987380981, "loss/dist_ce": 0.0, "loss/hidden": 0.181640625, "loss/idx": 0.0, "loss/logits": 0.034686122089624405, "step": 347 }, { "epoch": 0.0028680451964363716, "grad_norm": 3.015625, "grad_norm_var": 3.293121083577474, "learning_rate": 5e-05, "loss": 0.2123, "loss/crossentropy": 0.4816242456436157, "loss/dist_ce": 0.0, "loss/hidden": 0.193359375, "loss/idx": 0.0, "loss/logits": 0.018970437347888947, "step": 348 }, { "epoch": 0.0028762867056215335, "grad_norm": 3.15625, "grad_norm_var": 3.2159624735514325, "learning_rate": 5e-05, "loss": 0.2762, "loss/crossentropy": 2.688483715057373, "loss/dist_ce": 0.0, "loss/hidden": 0.2060546875, "loss/idx": 0.0, "loss/logits": 0.07011875510215759, "step": 349 }, { "epoch": 0.0028845282148066954, "grad_norm": 3.5625, "grad_norm_var": 3.2134356180826824, "learning_rate": 5e-05, "loss": 0.2319, "loss/crossentropy": 2.7942285537719727, "loss/dist_ce": 0.0, "loss/hidden": 0.181640625, "loss/idx": 0.0, "loss/logits": 0.050256311893463135, "step": 350 }, { "epoch": 0.0028927697239918572, "grad_norm": 2.890625, "grad_norm_var": 3.1917742411295573, "learning_rate": 5e-05, "loss": 0.2219, "loss/crossentropy": 0.3606606721878052, "loss/dist_ce": 0.0, "loss/hidden": 0.201171875, "loss/idx": 0.0, "loss/logits": 0.020754382014274597, "step": 351 }, { "epoch": 0.0029010112331770196, "grad_norm": 2.640625, "grad_norm_var": 0.33584772745768227, "learning_rate": 5e-05, "loss": 0.2024, "loss/crossentropy": 1.9477823972702026, "loss/dist_ce": 0.0, "loss/hidden": 0.158203125, "loss/idx": 0.0, "loss/logits": 0.04415898397564888, "step": 352 }, { "epoch": 0.0029092527423621815, "grad_norm": 2.90625, "grad_norm_var": 0.33877741495768227, "learning_rate": 5e-05, "loss": 0.2767, "loss/crossentropy": 1.250917673110962, "loss/dist_ce": 0.0, "loss/hidden": 0.2314453125, "loss/idx": 0.0, "loss/logits": 0.04520602151751518, "step": 353 }, { "epoch": 0.0029174942515473434, "grad_norm": 6.28125, "grad_norm_var": 1.1211443583170573, "learning_rate": 5e-05, "loss": 0.3595, "loss/crossentropy": 2.2711970806121826, "loss/dist_ce": 0.0, "loss/hidden": 0.28125, "loss/idx": 0.0, "loss/logits": 0.07829815149307251, "step": 354 }, { "epoch": 0.0029257357607325053, "grad_norm": 3.1875, "grad_norm_var": 1.0229713439941406, "learning_rate": 5e-05, "loss": 0.319, "loss/crossentropy": 2.352555990219116, "loss/dist_ce": 0.0, "loss/hidden": 0.251953125, "loss/idx": 0.0, "loss/logits": 0.06703340262174606, "step": 355 }, { "epoch": 0.002933977269917667, "grad_norm": 7.3125, "grad_norm_var": 2.142752838134766, "learning_rate": 5e-05, "loss": 0.3474, "loss/crossentropy": 2.538165807723999, "loss/dist_ce": 0.0, "loss/hidden": 0.2890625, "loss/idx": 0.0, "loss/logits": 0.05830331891775131, "step": 356 }, { "epoch": 0.0029422187791028295, "grad_norm": 6.53125, "grad_norm_var": 2.7735023498535156, "learning_rate": 5e-05, "loss": 0.222, "loss/crossentropy": 1.2392635345458984, "loss/dist_ce": 0.0, "loss/hidden": 0.19140625, "loss/idx": 0.0, "loss/logits": 0.030571604147553444, "step": 357 }, { "epoch": 0.0029504602882879914, "grad_norm": 2.140625, "grad_norm_var": 2.7326812744140625, "learning_rate": 5e-05, "loss": 0.1149, "loss/crossentropy": 0.3575584590435028, "loss/dist_ce": 0.0, "loss/hidden": 0.107421875, "loss/idx": 0.0, "loss/logits": 0.007437488064169884, "step": 358 }, { "epoch": 0.0029587017974731533, "grad_norm": 1.953125, "grad_norm_var": 2.8581207275390623, "learning_rate": 5e-05, "loss": 0.1523, "loss/crossentropy": 1.4491117000579834, "loss/dist_ce": 0.0, "loss/hidden": 0.126953125, "loss/idx": 0.0, "loss/logits": 0.025374623015522957, "step": 359 }, { "epoch": 0.002966943306658315, "grad_norm": 6.875, "grad_norm_var": 3.4463175455729167, "learning_rate": 5e-05, "loss": 0.3615, "loss/crossentropy": 2.15775203704834, "loss/dist_ce": 0.0, "loss/hidden": 0.294921875, "loss/idx": 0.0, "loss/logits": 0.06661273539066315, "step": 360 }, { "epoch": 0.002975184815843477, "grad_norm": 9.4375, "grad_norm_var": 5.334586588541667, "learning_rate": 5e-05, "loss": 0.2964, "loss/crossentropy": 1.3742177486419678, "loss/dist_ce": 0.0, "loss/hidden": 0.259765625, "loss/idx": 0.0, "loss/logits": 0.036625977605581284, "step": 361 }, { "epoch": 0.0029834263250286394, "grad_norm": 1.75, "grad_norm_var": 5.473623657226563, "learning_rate": 5e-05, "loss": 0.1747, "loss/crossentropy": 1.4426945447921753, "loss/dist_ce": 0.0, "loss/hidden": 0.1455078125, "loss/idx": 0.0, "loss/logits": 0.02921513468027115, "step": 362 }, { "epoch": 0.0029916678342138013, "grad_norm": 1.6640625, "grad_norm_var": 5.685538482666016, "learning_rate": 5e-05, "loss": 0.1638, "loss/crossentropy": 1.559606909751892, "loss/dist_ce": 0.0, "loss/hidden": 0.1357421875, "loss/idx": 0.0, "loss/logits": 0.028026653453707695, "step": 363 }, { "epoch": 0.002999909343398963, "grad_norm": 2.109375, "grad_norm_var": 5.865667470296224, "learning_rate": 5e-05, "loss": 0.1489, "loss/crossentropy": 0.7043201923370361, "loss/dist_ce": 0.0, "loss/hidden": 0.130859375, "loss/idx": 0.0, "loss/logits": 0.018008584156632423, "step": 364 }, { "epoch": 0.003008150852584125, "grad_norm": 4.1875, "grad_norm_var": 5.812695058186849, "learning_rate": 5e-05, "loss": 0.2217, "loss/crossentropy": 1.9148753881454468, "loss/dist_ce": 0.0, "loss/hidden": 0.177734375, "loss/idx": 0.0, "loss/logits": 0.0439751073718071, "step": 365 }, { "epoch": 0.0030163923617692874, "grad_norm": 4.125, "grad_norm_var": 5.792956288655599, "learning_rate": 5e-05, "loss": 0.3083, "loss/crossentropy": 2.424750328063965, "loss/dist_ce": 0.0, "loss/hidden": 0.23046875, "loss/idx": 0.0, "loss/logits": 0.07787832617759705, "step": 366 }, { "epoch": 0.0030246338709544493, "grad_norm": 2.09375, "grad_norm_var": 5.963744862874349, "learning_rate": 5e-05, "loss": 0.1791, "loss/crossentropy": 1.68095862865448, "loss/dist_ce": 0.0, "loss/hidden": 0.146484375, "loss/idx": 0.0, "loss/logits": 0.03266463428735733, "step": 367 }, { "epoch": 0.003032875380139611, "grad_norm": 11.875, "grad_norm_var": 9.52763646443685, "learning_rate": 5e-05, "loss": 0.5786, "loss/crossentropy": 1.7796623706817627, "loss/dist_ce": 0.0, "loss/hidden": 0.4765625, "loss/idx": 0.0, "loss/logits": 0.10202518105506897, "step": 368 }, { "epoch": 0.003041116889324773, "grad_norm": 2.953125, "grad_norm_var": 9.516863759358724, "learning_rate": 5e-05, "loss": 0.2031, "loss/crossentropy": 1.285621166229248, "loss/dist_ce": 0.0, "loss/hidden": 0.177734375, "loss/idx": 0.0, "loss/logits": 0.025359109044075012, "step": 369 }, { "epoch": 0.003049358398509935, "grad_norm": 1.953125, "grad_norm_var": 9.749049631754557, "learning_rate": 5e-05, "loss": 0.176, "loss/crossentropy": 1.5697388648986816, "loss/dist_ce": 0.0, "loss/hidden": 0.14453125, "loss/idx": 0.0, "loss/logits": 0.03151218220591545, "step": 370 }, { "epoch": 0.0030575999076950973, "grad_norm": 3.0, "grad_norm_var": 9.781166330973308, "learning_rate": 5e-05, "loss": 0.2175, "loss/crossentropy": 1.4509150981903076, "loss/dist_ce": 0.0, "loss/hidden": 0.1845703125, "loss/idx": 0.0, "loss/logits": 0.03292187303304672, "step": 371 }, { "epoch": 0.003065841416880259, "grad_norm": 1.75, "grad_norm_var": 9.534547678629558, "learning_rate": 5e-05, "loss": 0.1677, "loss/crossentropy": 2.5047202110290527, "loss/dist_ce": 0.0, "loss/hidden": 0.1318359375, "loss/idx": 0.0, "loss/logits": 0.03586728125810623, "step": 372 }, { "epoch": 0.003074082926065421, "grad_norm": 5.5, "grad_norm_var": 9.256392161051432, "learning_rate": 5e-05, "loss": 0.2845, "loss/crossentropy": 1.728163719177246, "loss/dist_ce": 0.0, "loss/hidden": 0.23828125, "loss/idx": 0.0, "loss/logits": 0.04620472714304924, "step": 373 }, { "epoch": 0.003082324435250583, "grad_norm": 3.109375, "grad_norm_var": 9.079986317952473, "learning_rate": 5e-05, "loss": 0.2467, "loss/crossentropy": 2.323497772216797, "loss/dist_ce": 0.0, "loss/hidden": 0.201171875, "loss/idx": 0.0, "loss/logits": 0.04549071192741394, "step": 374 }, { "epoch": 0.003090565944435745, "grad_norm": 2.734375, "grad_norm_var": 8.902730051676432, "learning_rate": 5e-05, "loss": 0.1624, "loss/crossentropy": 1.6974822282791138, "loss/dist_ce": 0.0, "loss/hidden": 0.1357421875, "loss/idx": 0.0, "loss/logits": 0.026702899485826492, "step": 375 }, { "epoch": 0.003098807453620907, "grad_norm": 2.046875, "grad_norm_var": 8.553822580973307, "learning_rate": 5e-05, "loss": 0.1575, "loss/crossentropy": 1.636716604232788, "loss/dist_ce": 0.0, "loss/hidden": 0.12890625, "loss/idx": 0.0, "loss/logits": 0.02862347848713398, "step": 376 }, { "epoch": 0.003107048962806069, "grad_norm": 3.953125, "grad_norm_var": 6.287947336832683, "learning_rate": 5e-05, "loss": 0.366, "loss/crossentropy": 2.5682785511016846, "loss/dist_ce": 0.0, "loss/hidden": 0.28125, "loss/idx": 0.0, "loss/logits": 0.08478732407093048, "step": 377 }, { "epoch": 0.003115290471991231, "grad_norm": 4.4375, "grad_norm_var": 6.139050038655599, "learning_rate": 5e-05, "loss": 0.236, "loss/crossentropy": 1.9300963878631592, "loss/dist_ce": 0.0, "loss/hidden": 0.1884765625, "loss/idx": 0.0, "loss/logits": 0.04753483831882477, "step": 378 }, { "epoch": 0.003123531981176393, "grad_norm": 8.0, "grad_norm_var": 7.0182851155598955, "learning_rate": 5e-05, "loss": 0.3625, "loss/crossentropy": 1.655861496925354, "loss/dist_ce": 0.0, "loss/hidden": 0.2890625, "loss/idx": 0.0, "loss/logits": 0.0734243243932724, "step": 379 }, { "epoch": 0.003131773490361555, "grad_norm": 4.71875, "grad_norm_var": 6.789794921875, "learning_rate": 5e-05, "loss": 0.3235, "loss/crossentropy": 2.3060801029205322, "loss/dist_ce": 0.0, "loss/hidden": 0.2578125, "loss/idx": 0.0, "loss/logits": 0.06571874022483826, "step": 380 }, { "epoch": 0.003140014999546717, "grad_norm": 3.640625, "grad_norm_var": 6.8059234619140625, "learning_rate": 5e-05, "loss": 0.2763, "loss/crossentropy": 2.67515230178833, "loss/dist_ce": 0.0, "loss/hidden": 0.2236328125, "loss/idx": 0.0, "loss/logits": 0.05268421396613121, "step": 381 }, { "epoch": 0.003148256508731879, "grad_norm": 8.6875, "grad_norm_var": 8.111107381184896, "learning_rate": 5e-05, "loss": 0.2852, "loss/crossentropy": 2.6763105392456055, "loss/dist_ce": 0.0, "loss/hidden": 0.220703125, "loss/idx": 0.0, "loss/logits": 0.06450507789850235, "step": 382 }, { "epoch": 0.003156498017917041, "grad_norm": 3.78125, "grad_norm_var": 7.769432576497396, "learning_rate": 5e-05, "loss": 0.2415, "loss/crossentropy": 2.8496878147125244, "loss/dist_ce": 0.0, "loss/hidden": 0.189453125, "loss/idx": 0.0, "loss/logits": 0.052030615508556366, "step": 383 }, { "epoch": 0.0031647395271022028, "grad_norm": 4.21875, "grad_norm_var": 3.9133941650390627, "learning_rate": 5e-05, "loss": 0.1925, "loss/crossentropy": 1.3341301679611206, "loss/dist_ce": 0.0, "loss/hidden": 0.1630859375, "loss/idx": 0.0, "loss/logits": 0.02942117676138878, "step": 384 }, { "epoch": 0.003172981036287365, "grad_norm": 1.984375, "grad_norm_var": 4.1111806233723955, "learning_rate": 5e-05, "loss": 0.1985, "loss/crossentropy": 2.383344888687134, "loss/dist_ce": 0.0, "loss/hidden": 0.1533203125, "loss/idx": 0.0, "loss/logits": 0.04519602656364441, "step": 385 }, { "epoch": 0.003181222545472527, "grad_norm": 2.046875, "grad_norm_var": 4.0865224202473955, "learning_rate": 5e-05, "loss": 0.235, "loss/crossentropy": 2.4512088298797607, "loss/dist_ce": 0.0, "loss/hidden": 0.1806640625, "loss/idx": 0.0, "loss/logits": 0.054316744208335876, "step": 386 }, { "epoch": 0.003189464054657689, "grad_norm": 2.03125, "grad_norm_var": 4.271190388997396, "learning_rate": 5e-05, "loss": 0.187, "loss/crossentropy": 1.7021174430847168, "loss/dist_ce": 0.0, "loss/hidden": 0.15234375, "loss/idx": 0.0, "loss/logits": 0.034697070717811584, "step": 387 }, { "epoch": 0.003197705563842851, "grad_norm": 6.78125, "grad_norm_var": 4.400902303059896, "learning_rate": 5e-05, "loss": 0.3263, "loss/crossentropy": 2.0133919715881348, "loss/dist_ce": 0.0, "loss/hidden": 0.25, "loss/idx": 0.0, "loss/logits": 0.07632862031459808, "step": 388 }, { "epoch": 0.0032059470730280127, "grad_norm": 2.484375, "grad_norm_var": 4.458426920572917, "learning_rate": 5e-05, "loss": 0.2092, "loss/crossentropy": 2.6211116313934326, "loss/dist_ce": 0.0, "loss/hidden": 0.162109375, "loss/idx": 0.0, "loss/logits": 0.04707195237278938, "step": 389 }, { "epoch": 0.003214188582213175, "grad_norm": 1.671875, "grad_norm_var": 4.766141764322916, "learning_rate": 5e-05, "loss": 0.1619, "loss/crossentropy": 1.861954689025879, "loss/dist_ce": 0.0, "loss/hidden": 0.1298828125, "loss/idx": 0.0, "loss/logits": 0.031997717916965485, "step": 390 }, { "epoch": 0.003222430091398337, "grad_norm": 3.953125, "grad_norm_var": 4.661246744791667, "learning_rate": 5e-05, "loss": 0.2297, "loss/crossentropy": 1.4026638269424438, "loss/dist_ce": 0.0, "loss/hidden": 0.19140625, "loss/idx": 0.0, "loss/logits": 0.03833915665745735, "step": 391 }, { "epoch": 0.003230671600583499, "grad_norm": 4.71875, "grad_norm_var": 4.401887003580729, "learning_rate": 5e-05, "loss": 0.3195, "loss/crossentropy": 2.687175750732422, "loss/dist_ce": 0.0, "loss/hidden": 0.251953125, "loss/idx": 0.0, "loss/logits": 0.06756812334060669, "step": 392 }, { "epoch": 0.0032389131097686607, "grad_norm": 1.8828125, "grad_norm_var": 4.736358388264974, "learning_rate": 5e-05, "loss": 0.1761, "loss/crossentropy": 1.6555976867675781, "loss/dist_ce": 0.0, "loss/hidden": 0.1435546875, "loss/idx": 0.0, "loss/logits": 0.032518401741981506, "step": 393 }, { "epoch": 0.003247154618953823, "grad_norm": 3.640625, "grad_norm_var": 4.736462148030599, "learning_rate": 5e-05, "loss": 0.3446, "loss/crossentropy": 2.2599704265594482, "loss/dist_ce": 0.0, "loss/hidden": 0.279296875, "loss/idx": 0.0, "loss/logits": 0.06530951708555222, "step": 394 }, { "epoch": 0.003255396128138985, "grad_norm": 1.8828125, "grad_norm_var": 3.825056966145833, "learning_rate": 5e-05, "loss": 0.204, "loss/crossentropy": 2.564816951751709, "loss/dist_ce": 0.0, "loss/hidden": 0.15625, "loss/idx": 0.0, "loss/logits": 0.04773015156388283, "step": 395 }, { "epoch": 0.003263637637324147, "grad_norm": 2.390625, "grad_norm_var": 3.8267242431640627, "learning_rate": 5e-05, "loss": 0.1961, "loss/crossentropy": 2.249958038330078, "loss/dist_ce": 0.0, "loss/hidden": 0.15625, "loss/idx": 0.0, "loss/logits": 0.039828941226005554, "step": 396 }, { "epoch": 0.0032718791465093087, "grad_norm": 1.3671875, "grad_norm_var": 4.103281402587891, "learning_rate": 5e-05, "loss": 0.1342, "loss/crossentropy": 1.058944821357727, "loss/dist_ce": 0.0, "loss/hidden": 0.11474609375, "loss/idx": 0.0, "loss/logits": 0.019409142434597015, "step": 397 }, { "epoch": 0.0032801206556944706, "grad_norm": 13.375, "grad_norm_var": 8.815500640869141, "learning_rate": 5e-05, "loss": 0.3495, "loss/crossentropy": 2.6670608520507812, "loss/dist_ce": 0.0, "loss/hidden": 0.28125, "loss/idx": 0.0, "loss/logits": 0.06825672090053558, "step": 398 }, { "epoch": 0.003288362164879633, "grad_norm": 2.140625, "grad_norm_var": 8.952433013916016, "learning_rate": 5e-05, "loss": 0.2057, "loss/crossentropy": 2.589582920074463, "loss/dist_ce": 0.0, "loss/hidden": 0.1669921875, "loss/idx": 0.0, "loss/logits": 0.038732096552848816, "step": 399 }, { "epoch": 0.003296603674064795, "grad_norm": 2.265625, "grad_norm_var": 9.012959543863932, "learning_rate": 5e-05, "loss": 0.134, "loss/crossentropy": 0.9774411916732788, "loss/dist_ce": 0.0, "loss/hidden": 0.1171875, "loss/idx": 0.0, "loss/logits": 0.01683815009891987, "step": 400 }, { "epoch": 0.0033048451832499567, "grad_norm": 5.3125, "grad_norm_var": 9.071028391520182, "learning_rate": 5e-05, "loss": 0.3151, "loss/crossentropy": 2.308528184890747, "loss/dist_ce": 0.0, "loss/hidden": 0.2578125, "loss/idx": 0.0, "loss/logits": 0.057268112897872925, "step": 401 }, { "epoch": 0.0033130866924351186, "grad_norm": 1.75, "grad_norm_var": 9.138868967692057, "learning_rate": 5e-05, "loss": 0.1162, "loss/crossentropy": 0.1983821541070938, "loss/dist_ce": 0.0, "loss/hidden": 0.1083984375, "loss/idx": 0.0, "loss/logits": 0.007823487743735313, "step": 402 }, { "epoch": 0.0033213282016202805, "grad_norm": 3.578125, "grad_norm_var": 8.964241282145183, "learning_rate": 5e-05, "loss": 0.1463, "loss/crossentropy": 0.282149076461792, "loss/dist_ce": 0.0, "loss/hidden": 0.1337890625, "loss/idx": 0.0, "loss/logits": 0.012518523260951042, "step": 403 }, { "epoch": 0.003329569710805443, "grad_norm": 5.4375, "grad_norm_var": 8.52498550415039, "learning_rate": 5e-05, "loss": 0.3084, "loss/crossentropy": 2.364654302597046, "loss/dist_ce": 0.0, "loss/hidden": 0.2333984375, "loss/idx": 0.0, "loss/logits": 0.07504182308912277, "step": 404 }, { "epoch": 0.0033378112199906047, "grad_norm": 6.40625, "grad_norm_var": 8.894703928629557, "learning_rate": 5e-05, "loss": 0.2316, "loss/crossentropy": 1.4973769187927246, "loss/dist_ce": 0.0, "loss/hidden": 0.1953125, "loss/idx": 0.0, "loss/logits": 0.036246173083782196, "step": 405 }, { "epoch": 0.0033460527291757666, "grad_norm": 14.9375, "grad_norm_var": 16.021522776285806, "learning_rate": 5e-05, "loss": 0.3451, "loss/crossentropy": 2.6580722332000732, "loss/dist_ce": 0.0, "loss/hidden": 0.28515625, "loss/idx": 0.0, "loss/logits": 0.05991474539041519, "step": 406 }, { "epoch": 0.0033542942383609285, "grad_norm": 1.7109375, "grad_norm_var": 16.55601298014323, "learning_rate": 5e-05, "loss": 0.1524, "loss/crossentropy": 2.4848172664642334, "loss/dist_ce": 0.0, "loss/hidden": 0.1201171875, "loss/idx": 0.0, "loss/logits": 0.0323210209608078, "step": 407 }, { "epoch": 0.003362535747546091, "grad_norm": 25.625, "grad_norm_var": 44.34390360514323, "learning_rate": 5e-05, "loss": 0.3535, "loss/crossentropy": 2.135502338409424, "loss/dist_ce": 0.0, "loss/hidden": 0.28515625, "loss/idx": 0.0, "loss/logits": 0.06830734014511108, "step": 408 }, { "epoch": 0.0033707772567312527, "grad_norm": 2.15625, "grad_norm_var": 44.203704579671225, "learning_rate": 5e-05, "loss": 0.1807, "loss/crossentropy": 1.5167546272277832, "loss/dist_ce": 0.0, "loss/hidden": 0.1484375, "loss/idx": 0.0, "loss/logits": 0.032285355031490326, "step": 409 }, { "epoch": 0.0033790187659164146, "grad_norm": 2.25, "grad_norm_var": 44.73858820597331, "learning_rate": 5e-05, "loss": 0.1845, "loss/crossentropy": 2.5554723739624023, "loss/dist_ce": 0.0, "loss/hidden": 0.146484375, "loss/idx": 0.0, "loss/logits": 0.03800758719444275, "step": 410 }, { "epoch": 0.0033872602751015765, "grad_norm": 5.125, "grad_norm_var": 43.70799051920573, "learning_rate": 5e-05, "loss": 0.1819, "loss/crossentropy": 1.3965002298355103, "loss/dist_ce": 0.0, "loss/hidden": 0.162109375, "loss/idx": 0.0, "loss/logits": 0.019777944311499596, "step": 411 }, { "epoch": 0.0033955017842867384, "grad_norm": 2.640625, "grad_norm_var": 43.591942342122394, "learning_rate": 5e-05, "loss": 0.2145, "loss/crossentropy": 1.4451302289962769, "loss/dist_ce": 0.0, "loss/hidden": 0.169921875, "loss/idx": 0.0, "loss/logits": 0.04454413428902626, "step": 412 }, { "epoch": 0.0034037432934719007, "grad_norm": 2.125, "grad_norm_var": 43.159234364827476, "learning_rate": 5e-05, "loss": 0.1472, "loss/crossentropy": 0.820690929889679, "loss/dist_ce": 0.0, "loss/hidden": 0.1259765625, "loss/idx": 0.0, "loss/logits": 0.02119414508342743, "step": 413 }, { "epoch": 0.0034119848026570626, "grad_norm": 2.546875, "grad_norm_var": 39.915026601155596, "learning_rate": 5e-05, "loss": 0.189, "loss/crossentropy": 2.5425262451171875, "loss/dist_ce": 0.0, "loss/hidden": 0.1484375, "loss/idx": 0.0, "loss/logits": 0.0405312180519104, "step": 414 }, { "epoch": 0.0034202263118422245, "grad_norm": 1.96875, "grad_norm_var": 39.991005198160806, "learning_rate": 5e-05, "loss": 0.161, "loss/crossentropy": 1.5188648700714111, "loss/dist_ce": 0.0, "loss/hidden": 0.1337890625, "loss/idx": 0.0, "loss/logits": 0.027209658175706863, "step": 415 }, { "epoch": 0.0034284678210273864, "grad_norm": 3.40625, "grad_norm_var": 39.60099461873372, "learning_rate": 5e-05, "loss": 0.2137, "loss/crossentropy": 1.8609509468078613, "loss/dist_ce": 0.0, "loss/hidden": 0.173828125, "loss/idx": 0.0, "loss/logits": 0.03991951048374176, "step": 416 }, { "epoch": 0.0034367093302125483, "grad_norm": 4.03125, "grad_norm_var": 39.72469863891602, "learning_rate": 5e-05, "loss": 0.3144, "loss/crossentropy": 2.865185260772705, "loss/dist_ce": 0.0, "loss/hidden": 0.244140625, "loss/idx": 0.0, "loss/logits": 0.07025311887264252, "step": 417 }, { "epoch": 0.0034449508393977106, "grad_norm": 5.0, "grad_norm_var": 38.82227350870768, "learning_rate": 5e-05, "loss": 0.2031, "loss/crossentropy": 1.3800697326660156, "loss/dist_ce": 0.0, "loss/hidden": 0.17578125, "loss/idx": 0.0, "loss/logits": 0.027345672249794006, "step": 418 }, { "epoch": 0.0034531923485828725, "grad_norm": 2.921875, "grad_norm_var": 39.02252375284831, "learning_rate": 5e-05, "loss": 0.1523, "loss/crossentropy": 0.4079228937625885, "loss/dist_ce": 0.0, "loss/hidden": 0.13671875, "loss/idx": 0.0, "loss/logits": 0.015567103400826454, "step": 419 }, { "epoch": 0.0034614338577680344, "grad_norm": 4.96875, "grad_norm_var": 39.04129206339518, "learning_rate": 5e-05, "loss": 0.1958, "loss/crossentropy": 2.5757017135620117, "loss/dist_ce": 0.0, "loss/hidden": 0.1640625, "loss/idx": 0.0, "loss/logits": 0.031719379127025604, "step": 420 }, { "epoch": 0.0034696753669531963, "grad_norm": 2.15625, "grad_norm_var": 39.65029271443685, "learning_rate": 5e-05, "loss": 0.2189, "loss/crossentropy": 1.558744192123413, "loss/dist_ce": 0.0, "loss/hidden": 0.177734375, "loss/idx": 0.0, "loss/logits": 0.04112962260842323, "step": 421 }, { "epoch": 0.0034779168761383586, "grad_norm": 2.4375, "grad_norm_var": 33.22532526652018, "learning_rate": 5e-05, "loss": 0.2209, "loss/crossentropy": 2.4556710720062256, "loss/dist_ce": 0.0, "loss/hidden": 0.171875, "loss/idx": 0.0, "loss/logits": 0.04905615746974945, "step": 422 }, { "epoch": 0.0034861583853235205, "grad_norm": 4.03125, "grad_norm_var": 32.71692606608073, "learning_rate": 5e-05, "loss": 0.2253, "loss/crossentropy": 2.399423360824585, "loss/dist_ce": 0.0, "loss/hidden": 0.18359375, "loss/idx": 0.0, "loss/logits": 0.04174065962433815, "step": 423 }, { "epoch": 0.0034943998945086824, "grad_norm": 6.53125, "grad_norm_var": 1.9431711832682292, "learning_rate": 5e-05, "loss": 0.3418, "loss/crossentropy": 1.4518251419067383, "loss/dist_ce": 0.0, "loss/hidden": 0.30859375, "loss/idx": 0.0, "loss/logits": 0.03323998302221298, "step": 424 }, { "epoch": 0.0035026414036938443, "grad_norm": 2.703125, "grad_norm_var": 1.87164306640625, "learning_rate": 5e-05, "loss": 0.2027, "loss/crossentropy": 2.5503337383270264, "loss/dist_ce": 0.0, "loss/hidden": 0.1611328125, "loss/idx": 0.0, "loss/logits": 0.04152850806713104, "step": 425 }, { "epoch": 0.0035108829128790062, "grad_norm": 3.140625, "grad_norm_var": 1.7813629150390624, "learning_rate": 5e-05, "loss": 0.21, "loss/crossentropy": 2.405348539352417, "loss/dist_ce": 0.0, "loss/hidden": 0.169921875, "loss/idx": 0.0, "loss/logits": 0.0400310643017292, "step": 426 }, { "epoch": 0.0035191244220641685, "grad_norm": 2.078125, "grad_norm_var": 1.694677734375, "learning_rate": 5e-05, "loss": 0.2251, "loss/crossentropy": 2.5670955181121826, "loss/dist_ce": 0.0, "loss/hidden": 0.173828125, "loss/idx": 0.0, "loss/logits": 0.05131090059876442, "step": 427 }, { "epoch": 0.0035273659312493304, "grad_norm": 5.46875, "grad_norm_var": 1.9485829671223958, "learning_rate": 5e-05, "loss": 0.2417, "loss/crossentropy": 1.348537564277649, "loss/dist_ce": 0.0, "loss/hidden": 0.208984375, "loss/idx": 0.0, "loss/logits": 0.032746195793151855, "step": 428 }, { "epoch": 0.0035356074404344923, "grad_norm": 2.796875, "grad_norm_var": 1.8563313802083334, "learning_rate": 5e-05, "loss": 0.2383, "loss/crossentropy": 2.7552454471588135, "loss/dist_ce": 0.0, "loss/hidden": 0.189453125, "loss/idx": 0.0, "loss/logits": 0.04886690154671669, "step": 429 }, { "epoch": 0.0035438489496196542, "grad_norm": 2.703125, "grad_norm_var": 1.83775634765625, "learning_rate": 5e-05, "loss": 0.3084, "loss/crossentropy": 2.6097259521484375, "loss/dist_ce": 0.0, "loss/hidden": 0.2578125, "loss/idx": 0.0, "loss/logits": 0.05057002976536751, "step": 430 }, { "epoch": 0.003552090458804816, "grad_norm": 4.3125, "grad_norm_var": 1.695849609375, "learning_rate": 5e-05, "loss": 0.2062, "loss/crossentropy": 1.8095245361328125, "loss/dist_ce": 0.0, "loss/hidden": 0.169921875, "loss/idx": 0.0, "loss/logits": 0.036256495863199234, "step": 431 }, { "epoch": 0.0035603319679899785, "grad_norm": 6.3125, "grad_norm_var": 2.12232666015625, "learning_rate": 5e-05, "loss": 0.2564, "loss/crossentropy": 2.088921546936035, "loss/dist_ce": 0.0, "loss/hidden": 0.201171875, "loss/idx": 0.0, "loss/logits": 0.0551944300532341, "step": 432 }, { "epoch": 0.0035685734771751403, "grad_norm": 42.0, "grad_norm_var": 93.143505859375, "learning_rate": 5e-05, "loss": 0.7318, "loss/crossentropy": 2.4523119926452637, "loss/dist_ce": 0.0, "loss/hidden": 0.5859375, "loss/idx": 0.0, "loss/logits": 0.14583945274353027, "step": 433 }, { "epoch": 0.0035768149863603022, "grad_norm": 3.09375, "grad_norm_var": 93.68137613932292, "learning_rate": 5e-05, "loss": 0.2037, "loss/crossentropy": 2.0388007164001465, "loss/dist_ce": 0.0, "loss/hidden": 0.1689453125, "loss/idx": 0.0, "loss/logits": 0.03475724905729294, "step": 434 }, { "epoch": 0.003585056495545464, "grad_norm": 4.1875, "grad_norm_var": 93.24458719889323, "learning_rate": 5e-05, "loss": 0.2542, "loss/crossentropy": 2.6730620861053467, "loss/dist_ce": 0.0, "loss/hidden": 0.1953125, "loss/idx": 0.0, "loss/logits": 0.05885430425405502, "step": 435 }, { "epoch": 0.0035932980047306265, "grad_norm": 3.34375, "grad_norm_var": 93.6726308186849, "learning_rate": 5e-05, "loss": 0.1555, "loss/crossentropy": 0.4195747375488281, "loss/dist_ce": 0.0, "loss/hidden": 0.1416015625, "loss/idx": 0.0, "loss/logits": 0.013927996158599854, "step": 436 }, { "epoch": 0.0036015395139157884, "grad_norm": 3.421875, "grad_norm_var": 93.11043294270833, "learning_rate": 5e-05, "loss": 0.254, "loss/crossentropy": 1.7030478715896606, "loss/dist_ce": 0.0, "loss/hidden": 0.208984375, "loss/idx": 0.0, "loss/logits": 0.04505797103047371, "step": 437 }, { "epoch": 0.0036097810231009502, "grad_norm": 1.71875, "grad_norm_var": 93.49947509765624, "learning_rate": 5e-05, "loss": 0.1532, "loss/crossentropy": 0.4963390529155731, "loss/dist_ce": 0.0, "loss/hidden": 0.1357421875, "loss/idx": 0.0, "loss/logits": 0.017449375241994858, "step": 438 }, { "epoch": 0.003618022532286112, "grad_norm": 3.125, "grad_norm_var": 93.80262044270833, "learning_rate": 5e-05, "loss": 0.2339, "loss/crossentropy": 2.744438648223877, "loss/dist_ce": 0.0, "loss/hidden": 0.181640625, "loss/idx": 0.0, "loss/logits": 0.05223686248064041, "step": 439 }, { "epoch": 0.003626264041471274, "grad_norm": 1.1875, "grad_norm_var": 95.25058186848959, "learning_rate": 5e-05, "loss": 0.1237, "loss/crossentropy": 0.5037131905555725, "loss/dist_ce": 0.0, "loss/hidden": 0.109375, "loss/idx": 0.0, "loss/logits": 0.014304354786872864, "step": 440 }, { "epoch": 0.0036345055506564364, "grad_norm": 4.5, "grad_norm_var": 94.72848205566406, "learning_rate": 5e-05, "loss": 0.1741, "loss/crossentropy": 1.277227759361267, "loss/dist_ce": 0.0, "loss/hidden": 0.15234375, "loss/idx": 0.0, "loss/logits": 0.021763307973742485, "step": 441 }, { "epoch": 0.0036427470598415983, "grad_norm": 8.5625, "grad_norm_var": 94.61658528645833, "learning_rate": 5e-05, "loss": 0.3688, "loss/crossentropy": 2.51446533203125, "loss/dist_ce": 0.0, "loss/hidden": 0.2890625, "loss/idx": 0.0, "loss/logits": 0.07975561916828156, "step": 442 }, { "epoch": 0.00365098856902676, "grad_norm": 2.515625, "grad_norm_var": 94.38951822916667, "learning_rate": 5e-05, "loss": 0.2143, "loss/crossentropy": 2.614020347595215, "loss/dist_ce": 0.0, "loss/hidden": 0.169921875, "loss/idx": 0.0, "loss/logits": 0.044419120997190475, "step": 443 }, { "epoch": 0.003659230078211922, "grad_norm": 2.203125, "grad_norm_var": 95.37579650878907, "learning_rate": 5e-05, "loss": 0.1516, "loss/crossentropy": 0.43124791979789734, "loss/dist_ce": 0.0, "loss/hidden": 0.134765625, "loss/idx": 0.0, "loss/logits": 0.01679244264960289, "step": 444 }, { "epoch": 0.0036674715873970844, "grad_norm": 1.4609375, "grad_norm_var": 96.05772476196289, "learning_rate": 5e-05, "loss": 0.1449, "loss/crossentropy": 1.3757065534591675, "loss/dist_ce": 0.0, "loss/hidden": 0.119140625, "loss/idx": 0.0, "loss/logits": 0.02579795941710472, "step": 445 }, { "epoch": 0.0036757130965822463, "grad_norm": 2.859375, "grad_norm_var": 95.99232559204101, "learning_rate": 5e-05, "loss": 0.2612, "loss/crossentropy": 1.5693522691726685, "loss/dist_ce": 0.0, "loss/hidden": 0.2265625, "loss/idx": 0.0, "loss/logits": 0.034596264362335205, "step": 446 }, { "epoch": 0.003683954605767408, "grad_norm": 2.25, "grad_norm_var": 96.70171279907227, "learning_rate": 5e-05, "loss": 0.1481, "loss/crossentropy": 1.425809621810913, "loss/dist_ce": 0.0, "loss/hidden": 0.12060546875, "loss/idx": 0.0, "loss/logits": 0.02751866541802883, "step": 447 }, { "epoch": 0.00369219611495257, "grad_norm": 4.53125, "grad_norm_var": 96.77743911743164, "learning_rate": 5e-05, "loss": 0.2282, "loss/crossentropy": 1.8082743883132935, "loss/dist_ce": 0.0, "loss/hidden": 0.1875, "loss/idx": 0.0, "loss/logits": 0.04072072356939316, "step": 448 }, { "epoch": 0.003700437624137732, "grad_norm": 4.5, "grad_norm_var": 3.0933570861816406, "learning_rate": 5e-05, "loss": 0.284, "loss/crossentropy": 0.5547680854797363, "loss/dist_ce": 0.0, "loss/hidden": 0.25, "loss/idx": 0.0, "loss/logits": 0.0339670293033123, "step": 449 }, { "epoch": 0.0037086791333228943, "grad_norm": 4.25, "grad_norm_var": 3.1387489318847654, "learning_rate": 5e-05, "loss": 0.2519, "loss/crossentropy": 1.6840208768844604, "loss/dist_ce": 0.0, "loss/hidden": 0.205078125, "loss/idx": 0.0, "loss/logits": 0.04682979732751846, "step": 450 }, { "epoch": 0.003716920642508056, "grad_norm": 3.765625, "grad_norm_var": 3.1063392639160154, "learning_rate": 5e-05, "loss": 0.2045, "loss/crossentropy": 2.407160520553589, "loss/dist_ce": 0.0, "loss/hidden": 0.162109375, "loss/idx": 0.0, "loss/logits": 0.04241305589675903, "step": 451 }, { "epoch": 0.003725162151693218, "grad_norm": 5.6875, "grad_norm_var": 3.4360816955566404, "learning_rate": 5e-05, "loss": 0.3723, "loss/crossentropy": 2.6594908237457275, "loss/dist_ce": 0.0, "loss/hidden": 0.279296875, "loss/idx": 0.0, "loss/logits": 0.09303879737854004, "step": 452 }, { "epoch": 0.00373340366087838, "grad_norm": 1.6328125, "grad_norm_var": 3.6628011067708335, "learning_rate": 5e-05, "loss": 0.118, "loss/crossentropy": 0.4520578682422638, "loss/dist_ce": 0.0, "loss/hidden": 0.1044921875, "loss/idx": 0.0, "loss/logits": 0.01347460225224495, "step": 453 }, { "epoch": 0.003741645170063542, "grad_norm": 2.15625, "grad_norm_var": 3.5754150390625, "learning_rate": 5e-05, "loss": 0.2743, "loss/crossentropy": 2.884896755218506, "loss/dist_ce": 0.0, "loss/hidden": 0.2265625, "loss/idx": 0.0, "loss/logits": 0.04774241894483566, "step": 454 }, { "epoch": 0.003749886679248704, "grad_norm": 1.8359375, "grad_norm_var": 3.7349952697753905, "learning_rate": 5e-05, "loss": 0.1811, "loss/crossentropy": 2.609929323196411, "loss/dist_ce": 0.0, "loss/hidden": 0.146484375, "loss/idx": 0.0, "loss/logits": 0.034633196890354156, "step": 455 }, { "epoch": 0.003758128188433866, "grad_norm": 10.6875, "grad_norm_var": 6.612827301025391, "learning_rate": 5e-05, "loss": 0.7979, "loss/crossentropy": 2.925989866256714, "loss/dist_ce": 0.0, "loss/hidden": 0.5390625, "loss/idx": 0.0, "loss/logits": 0.2587950825691223, "step": 456 }, { "epoch": 0.003766369697619028, "grad_norm": 3.109375, "grad_norm_var": 6.634012603759766, "learning_rate": 5e-05, "loss": 0.2532, "loss/crossentropy": 2.1211068630218506, "loss/dist_ce": 0.0, "loss/hidden": 0.193359375, "loss/idx": 0.0, "loss/logits": 0.059833116829395294, "step": 457 }, { "epoch": 0.00377461120680419, "grad_norm": 1.8359375, "grad_norm_var": 5.25826416015625, "learning_rate": 5e-05, "loss": 0.1663, "loss/crossentropy": 2.170849323272705, "loss/dist_ce": 0.0, "loss/hidden": 0.1337890625, "loss/idx": 0.0, "loss/logits": 0.03252778202295303, "step": 458 }, { "epoch": 0.003782852715989352, "grad_norm": 1.1640625, "grad_norm_var": 5.541731516520183, "learning_rate": 5e-05, "loss": 0.1449, "loss/crossentropy": 1.5572426319122314, "loss/dist_ce": 0.0, "loss/hidden": 0.123046875, "loss/idx": 0.0, "loss/logits": 0.021882327273488045, "step": 459 }, { "epoch": 0.003791094225174514, "grad_norm": 1.96875, "grad_norm_var": 5.581648508707683, "learning_rate": 5e-05, "loss": 0.1801, "loss/crossentropy": 2.4034504890441895, "loss/dist_ce": 0.0, "loss/hidden": 0.142578125, "loss/idx": 0.0, "loss/logits": 0.03750108927488327, "step": 460 }, { "epoch": 0.003799335734359676, "grad_norm": 1.46875, "grad_norm_var": 5.579678344726562, "learning_rate": 5e-05, "loss": 0.1353, "loss/crossentropy": 2.3504481315612793, "loss/dist_ce": 0.0, "loss/hidden": 0.1103515625, "loss/idx": 0.0, "loss/logits": 0.024939250200986862, "step": 461 }, { "epoch": 0.003807577243544838, "grad_norm": 4.53125, "grad_norm_var": 5.643570963541666, "learning_rate": 5e-05, "loss": 0.1909, "loss/crossentropy": 0.8316883444786072, "loss/dist_ce": 0.0, "loss/hidden": 0.16796875, "loss/idx": 0.0, "loss/logits": 0.022978752851486206, "step": 462 }, { "epoch": 0.0038158187527299998, "grad_norm": 3.46875, "grad_norm_var": 5.539628092447916, "learning_rate": 5e-05, "loss": 0.2104, "loss/crossentropy": 2.7462053298950195, "loss/dist_ce": 0.0, "loss/hidden": 0.1640625, "loss/idx": 0.0, "loss/logits": 0.046341672539711, "step": 463 }, { "epoch": 0.003824060261915162, "grad_norm": 2.4375, "grad_norm_var": 5.536083984375, "learning_rate": 5e-05, "loss": 0.1901, "loss/crossentropy": 1.570056676864624, "loss/dist_ce": 0.0, "loss/hidden": 0.16015625, "loss/idx": 0.0, "loss/logits": 0.02994626574218273, "step": 464 }, { "epoch": 0.003832301771100324, "grad_norm": 5.125, "grad_norm_var": 5.651643880208334, "learning_rate": 5e-05, "loss": 0.2886, "loss/crossentropy": 3.014599084854126, "loss/dist_ce": 0.0, "loss/hidden": 0.234375, "loss/idx": 0.0, "loss/logits": 0.05423382669687271, "step": 465 }, { "epoch": 0.003840543280285486, "grad_norm": 4.65625, "grad_norm_var": 5.705546061197917, "learning_rate": 5e-05, "loss": 0.2568, "loss/crossentropy": 2.5307607650756836, "loss/dist_ce": 0.0, "loss/hidden": 0.212890625, "loss/idx": 0.0, "loss/logits": 0.043907828629016876, "step": 466 }, { "epoch": 0.0038487847894706478, "grad_norm": 2.65625, "grad_norm_var": 5.738841756184896, "learning_rate": 5e-05, "loss": 0.1794, "loss/crossentropy": 2.301478624343872, "loss/dist_ce": 0.0, "loss/hidden": 0.14453125, "loss/idx": 0.0, "loss/logits": 0.034871190786361694, "step": 467 }, { "epoch": 0.0038570262986558097, "grad_norm": 2.3125, "grad_norm_var": 5.421996053059896, "learning_rate": 5e-05, "loss": 0.1572, "loss/crossentropy": 2.81413197517395, "loss/dist_ce": 0.0, "loss/hidden": 0.1220703125, "loss/idx": 0.0, "loss/logits": 0.03511942923069, "step": 468 }, { "epoch": 0.003865267807840972, "grad_norm": 3.234375, "grad_norm_var": 5.24969253540039, "learning_rate": 5e-05, "loss": 0.2422, "loss/crossentropy": 2.552509069442749, "loss/dist_ce": 0.0, "loss/hidden": 0.193359375, "loss/idx": 0.0, "loss/logits": 0.048864759504795074, "step": 469 }, { "epoch": 0.003873509317026134, "grad_norm": 2.65625, "grad_norm_var": 5.189699045817057, "learning_rate": 5e-05, "loss": 0.2269, "loss/crossentropy": 1.746779441833496, "loss/dist_ce": 0.0, "loss/hidden": 0.1787109375, "loss/idx": 0.0, "loss/logits": 0.048161737620830536, "step": 470 }, { "epoch": 0.0038817508262112958, "grad_norm": 2.40625, "grad_norm_var": 5.097041829427083, "learning_rate": 5e-05, "loss": 0.2224, "loss/crossentropy": 1.6436595916748047, "loss/dist_ce": 0.0, "loss/hidden": 0.1806640625, "loss/idx": 0.0, "loss/logits": 0.04173795133829117, "step": 471 }, { "epoch": 0.0038899923353964577, "grad_norm": 1.8828125, "grad_norm_var": 1.336993153889974, "learning_rate": 5e-05, "loss": 0.1768, "loss/crossentropy": 2.6210246086120605, "loss/dist_ce": 0.0, "loss/hidden": 0.1357421875, "loss/idx": 0.0, "loss/logits": 0.041082605719566345, "step": 472 }, { "epoch": 0.00389823384458162, "grad_norm": 7.125, "grad_norm_var": 2.5066485087076824, "learning_rate": 5e-05, "loss": 0.2539, "loss/crossentropy": 2.7276291847229004, "loss/dist_ce": 0.0, "loss/hidden": 0.203125, "loss/idx": 0.0, "loss/logits": 0.050726860761642456, "step": 473 }, { "epoch": 0.003906475353766782, "grad_norm": 2.0, "grad_norm_var": 2.4815958658854167, "learning_rate": 5e-05, "loss": 0.1564, "loss/crossentropy": 2.6117374897003174, "loss/dist_ce": 0.0, "loss/hidden": 0.1220703125, "loss/idx": 0.0, "loss/logits": 0.034293532371520996, "step": 474 }, { "epoch": 0.003914716862951944, "grad_norm": 10.3125, "grad_norm_var": 5.389619700113932, "learning_rate": 5e-05, "loss": 0.3135, "loss/crossentropy": 2.8067679405212402, "loss/dist_ce": 0.0, "loss/hidden": 0.2451171875, "loss/idx": 0.0, "loss/logits": 0.06842821836471558, "step": 475 }, { "epoch": 0.003922958372137106, "grad_norm": 3.1875, "grad_norm_var": 5.210853830973307, "learning_rate": 5e-05, "loss": 0.237, "loss/crossentropy": 2.265045642852783, "loss/dist_ce": 0.0, "loss/hidden": 0.1875, "loss/idx": 0.0, "loss/logits": 0.049473538994789124, "step": 476 }, { "epoch": 0.003931199881322268, "grad_norm": 2.5, "grad_norm_var": 4.968281809488932, "learning_rate": 5e-05, "loss": 0.22, "loss/crossentropy": 2.7731900215148926, "loss/dist_ce": 0.0, "loss/hidden": 0.169921875, "loss/idx": 0.0, "loss/logits": 0.05008155107498169, "step": 477 }, { "epoch": 0.0039394413905074295, "grad_norm": 7.75, "grad_norm_var": 5.937888336181641, "learning_rate": 5e-05, "loss": 0.4192, "loss/crossentropy": 2.859137535095215, "loss/dist_ce": 0.0, "loss/hidden": 0.333984375, "loss/idx": 0.0, "loss/logits": 0.08519326895475388, "step": 478 }, { "epoch": 0.003947682899692591, "grad_norm": 3.734375, "grad_norm_var": 5.924122873942057, "learning_rate": 5e-05, "loss": 0.2565, "loss/crossentropy": 1.434816598892212, "loss/dist_ce": 0.0, "loss/hidden": 0.224609375, "loss/idx": 0.0, "loss/logits": 0.03185056895017624, "step": 479 }, { "epoch": 0.003955924408877754, "grad_norm": 1.90625, "grad_norm_var": 6.052335357666015, "learning_rate": 5e-05, "loss": 0.207, "loss/crossentropy": 2.1310391426086426, "loss/dist_ce": 0.0, "loss/hidden": 0.16015625, "loss/idx": 0.0, "loss/logits": 0.04687424749135971, "step": 480 }, { "epoch": 0.003964165918062916, "grad_norm": 3.25, "grad_norm_var": 5.982144927978515, "learning_rate": 5e-05, "loss": 0.2501, "loss/crossentropy": 2.3727288246154785, "loss/dist_ce": 0.0, "loss/hidden": 0.1982421875, "loss/idx": 0.0, "loss/logits": 0.05187632888555527, "step": 481 }, { "epoch": 0.003972407427248078, "grad_norm": 1.046875, "grad_norm_var": 6.40746841430664, "learning_rate": 5e-05, "loss": 0.1219, "loss/crossentropy": 1.4023224115371704, "loss/dist_ce": 0.0, "loss/hidden": 0.103515625, "loss/idx": 0.0, "loss/logits": 0.018374113366007805, "step": 482 }, { "epoch": 0.00398064893643324, "grad_norm": 4.0625, "grad_norm_var": 6.349881744384765, "learning_rate": 5e-05, "loss": 0.2168, "loss/crossentropy": 1.5084764957427979, "loss/dist_ce": 0.0, "loss/hidden": 0.1669921875, "loss/idx": 0.0, "loss/logits": 0.04980340600013733, "step": 483 }, { "epoch": 0.003988890445618402, "grad_norm": 2.09375, "grad_norm_var": 6.393645985921224, "learning_rate": 5e-05, "loss": 0.1647, "loss/crossentropy": 1.9611366987228394, "loss/dist_ce": 0.0, "loss/hidden": 0.1259765625, "loss/idx": 0.0, "loss/logits": 0.038717180490493774, "step": 484 }, { "epoch": 0.003997131954803564, "grad_norm": 3.296875, "grad_norm_var": 6.390036773681641, "learning_rate": 5e-05, "loss": 0.2526, "loss/crossentropy": 2.1912283897399902, "loss/dist_ce": 0.0, "loss/hidden": 0.201171875, "loss/idx": 0.0, "loss/logits": 0.05140523985028267, "step": 485 }, { "epoch": 0.0040053734639887255, "grad_norm": 2.28125, "grad_norm_var": 6.45104751586914, "learning_rate": 5e-05, "loss": 0.2049, "loss/crossentropy": 1.1990885734558105, "loss/dist_ce": 0.0, "loss/hidden": 0.173828125, "loss/idx": 0.0, "loss/logits": 0.03110179677605629, "step": 486 }, { "epoch": 0.004013614973173887, "grad_norm": 2.109375, "grad_norm_var": 6.506866200764974, "learning_rate": 5e-05, "loss": 0.1932, "loss/crossentropy": 2.469733715057373, "loss/dist_ce": 0.0, "loss/hidden": 0.150390625, "loss/idx": 0.0, "loss/logits": 0.04281339794397354, "step": 487 }, { "epoch": 0.004021856482359049, "grad_norm": 318.0, "grad_norm_var": 6177.285184733073, "learning_rate": 5e-05, "loss": 1.5086, "loss/crossentropy": 1.5801646709442139, "loss/dist_ce": 0.0, "loss/hidden": 1.390625, "loss/idx": 0.0, "loss/logits": 0.1180073618888855, "step": 488 }, { "epoch": 0.004030097991544211, "grad_norm": 11.75, "grad_norm_var": 6168.57597249349, "learning_rate": 5e-05, "loss": 0.4099, "loss/crossentropy": 2.5404622554779053, "loss/dist_ce": 0.0, "loss/hidden": 0.3359375, "loss/idx": 0.0, "loss/logits": 0.0739157497882843, "step": 489 }, { "epoch": 0.004038339500729374, "grad_norm": 4.125, "grad_norm_var": 6162.7084269205725, "learning_rate": 5e-05, "loss": 0.2788, "loss/crossentropy": 2.6295320987701416, "loss/dist_ce": 0.0, "loss/hidden": 0.2109375, "loss/idx": 0.0, "loss/logits": 0.06782936304807663, "step": 490 }, { "epoch": 0.004046581009914536, "grad_norm": 9.6875, "grad_norm_var": 6163.8599568684895, "learning_rate": 5e-05, "loss": 0.4341, "loss/crossentropy": 3.0478451251983643, "loss/dist_ce": 0.0, "loss/hidden": 0.310546875, "loss/idx": 0.0, "loss/logits": 0.12352639436721802, "step": 491 }, { "epoch": 0.004054822519099698, "grad_norm": 5.90625, "grad_norm_var": 6156.850325520833, "learning_rate": 5e-05, "loss": 0.1865, "loss/crossentropy": 1.3709689378738403, "loss/dist_ce": 0.0, "loss/hidden": 0.162109375, "loss/idx": 0.0, "loss/logits": 0.02439779043197632, "step": 492 }, { "epoch": 0.00406306402828486, "grad_norm": 1.625, "grad_norm_var": 6159.402864583333, "learning_rate": 5e-05, "loss": 0.1459, "loss/crossentropy": 1.324977159500122, "loss/dist_ce": 0.0, "loss/hidden": 0.12109375, "loss/idx": 0.0, "loss/logits": 0.02481193095445633, "step": 493 }, { "epoch": 0.0040713055374700215, "grad_norm": 8.4375, "grad_norm_var": 6157.950699869792, "learning_rate": 5e-05, "loss": 0.2132, "loss/crossentropy": 1.4893817901611328, "loss/dist_ce": 0.0, "loss/hidden": 0.173828125, "loss/idx": 0.0, "loss/logits": 0.039393968880176544, "step": 494 }, { "epoch": 0.004079547046655183, "grad_norm": 2.046875, "grad_norm_var": 6162.678776041666, "learning_rate": 5e-05, "loss": 0.132, "loss/crossentropy": 1.3435920476913452, "loss/dist_ce": 0.0, "loss/hidden": 0.1142578125, "loss/idx": 0.0, "loss/logits": 0.017779778689146042, "step": 495 }, { "epoch": 0.004087788555840345, "grad_norm": 3.65625, "grad_norm_var": 6157.749609375, "learning_rate": 5e-05, "loss": 0.2431, "loss/crossentropy": 2.082836151123047, "loss/dist_ce": 0.0, "loss/hidden": 0.185546875, "loss/idx": 0.0, "loss/logits": 0.057581763714551926, "step": 496 }, { "epoch": 0.004096030065025507, "grad_norm": 6.34375, "grad_norm_var": 6149.804553222656, "learning_rate": 5e-05, "loss": 0.2089, "loss/crossentropy": 1.231292486190796, "loss/dist_ce": 0.0, "loss/hidden": 0.181640625, "loss/idx": 0.0, "loss/logits": 0.02723127231001854, "step": 497 }, { "epoch": 0.004104271574210669, "grad_norm": 3.296875, "grad_norm_var": 6143.188732910156, "learning_rate": 5e-05, "loss": 0.2633, "loss/crossentropy": 2.7151858806610107, "loss/dist_ce": 0.0, "loss/hidden": 0.197265625, "loss/idx": 0.0, "loss/logits": 0.06607217341661453, "step": 498 }, { "epoch": 0.004112513083395832, "grad_norm": 2.515625, "grad_norm_var": 6147.511221313476, "learning_rate": 5e-05, "loss": 0.1795, "loss/crossentropy": 2.5187623500823975, "loss/dist_ce": 0.0, "loss/hidden": 0.138671875, "loss/idx": 0.0, "loss/logits": 0.040810734033584595, "step": 499 }, { "epoch": 0.004120754592580994, "grad_norm": 3.640625, "grad_norm_var": 6143.101721191406, "learning_rate": 5e-05, "loss": 0.3104, "loss/crossentropy": 2.562577962875366, "loss/dist_ce": 0.0, "loss/hidden": 0.23046875, "loss/idx": 0.0, "loss/logits": 0.07988132536411285, "step": 500 }, { "epoch": 0.004128996101766156, "grad_norm": 2.46875, "grad_norm_var": 6145.463117472331, "learning_rate": 5e-05, "loss": 0.1763, "loss/crossentropy": 2.076406478881836, "loss/dist_ce": 0.0, "loss/hidden": 0.134765625, "loss/idx": 0.0, "loss/logits": 0.04152751341462135, "step": 501 }, { "epoch": 0.0041372376109513175, "grad_norm": 2.15625, "grad_norm_var": 6145.830125935872, "learning_rate": 5e-05, "loss": 0.1825, "loss/crossentropy": 1.5978915691375732, "loss/dist_ce": 0.0, "loss/hidden": 0.1474609375, "loss/idx": 0.0, "loss/logits": 0.03506774455308914, "step": 502 }, { "epoch": 0.004145479120136479, "grad_norm": 6.5, "grad_norm_var": 6134.082059733073, "learning_rate": 5e-05, "loss": 0.343, "loss/crossentropy": 2.7203476428985596, "loss/dist_ce": 0.0, "loss/hidden": 0.25390625, "loss/idx": 0.0, "loss/logits": 0.0890902578830719, "step": 503 }, { "epoch": 0.004153720629321641, "grad_norm": 2.46875, "grad_norm_var": 9.201432291666666, "learning_rate": 5e-05, "loss": 0.197, "loss/crossentropy": 1.9945552349090576, "loss/dist_ce": 0.0, "loss/hidden": 0.15625, "loss/idx": 0.0, "loss/logits": 0.040720634162425995, "step": 504 }, { "epoch": 0.004161962138506803, "grad_norm": 2.71875, "grad_norm_var": 5.917020670572916, "learning_rate": 5e-05, "loss": 0.1455, "loss/crossentropy": 0.7928286790847778, "loss/dist_ce": 0.0, "loss/hidden": 0.134765625, "loss/idx": 0.0, "loss/logits": 0.010724226012825966, "step": 505 }, { "epoch": 0.004170203647691965, "grad_norm": 1.5234375, "grad_norm_var": 6.374580637613932, "learning_rate": 5e-05, "loss": 0.1175, "loss/crossentropy": 1.454852819442749, "loss/dist_ce": 0.0, "loss/hidden": 0.0986328125, "loss/idx": 0.0, "loss/logits": 0.018820609897375107, "step": 506 }, { "epoch": 0.004178445156877127, "grad_norm": 5.0, "grad_norm_var": 4.231941477457682, "learning_rate": 5e-05, "loss": 0.2223, "loss/crossentropy": 1.54378080368042, "loss/dist_ce": 0.0, "loss/hidden": 0.1865234375, "loss/idx": 0.0, "loss/logits": 0.03581738844513893, "step": 507 }, { "epoch": 0.00418668666606229, "grad_norm": 2.28125, "grad_norm_var": 4.020247141520183, "learning_rate": 5e-05, "loss": 0.1535, "loss/crossentropy": 2.197096109390259, "loss/dist_ce": 0.0, "loss/hidden": 0.1259765625, "loss/idx": 0.0, "loss/logits": 0.02750355750322342, "step": 508 }, { "epoch": 0.004194928175247452, "grad_norm": 3.890625, "grad_norm_var": 3.761824289957682, "learning_rate": 5e-05, "loss": 0.2321, "loss/crossentropy": 2.152005195617676, "loss/dist_ce": 0.0, "loss/hidden": 0.19140625, "loss/idx": 0.0, "loss/logits": 0.040720127522945404, "step": 509 }, { "epoch": 0.0042031696844326135, "grad_norm": 2.625, "grad_norm_var": 2.189497629801432, "learning_rate": 5e-05, "loss": 0.1794, "loss/crossentropy": 1.419573187828064, "loss/dist_ce": 0.0, "loss/hidden": 0.1455078125, "loss/idx": 0.0, "loss/logits": 0.03389629349112511, "step": 510 }, { "epoch": 0.004211411193617775, "grad_norm": 2.59375, "grad_norm_var": 2.1152992248535156, "learning_rate": 5e-05, "loss": 0.1537, "loss/crossentropy": 1.2792004346847534, "loss/dist_ce": 0.0, "loss/hidden": 0.1328125, "loss/idx": 0.0, "loss/logits": 0.020872898399829865, "step": 511 }, { "epoch": 0.004219652702802937, "grad_norm": 1.8984375, "grad_norm_var": 2.2378082275390625, "learning_rate": 5e-05, "loss": 0.1677, "loss/crossentropy": 2.1597256660461426, "loss/dist_ce": 0.0, "loss/hidden": 0.1328125, "loss/idx": 0.0, "loss/logits": 0.03487637639045715, "step": 512 }, { "epoch": 0.004227894211988099, "grad_norm": 3.3125, "grad_norm_var": 1.5597239176432292, "learning_rate": 5e-05, "loss": 0.2512, "loss/crossentropy": 1.6841063499450684, "loss/dist_ce": 0.0, "loss/hidden": 0.19921875, "loss/idx": 0.0, "loss/logits": 0.05194816365838051, "step": 513 }, { "epoch": 0.004236135721173261, "grad_norm": 4.78125, "grad_norm_var": 1.7451741536458334, "learning_rate": 5e-05, "loss": 0.2239, "loss/crossentropy": 2.816648006439209, "loss/dist_ce": 0.0, "loss/hidden": 0.171875, "loss/idx": 0.0, "loss/logits": 0.051988691091537476, "step": 514 }, { "epoch": 0.004244377230358423, "grad_norm": 11.6875, "grad_norm_var": 6.229002888997396, "learning_rate": 5e-05, "loss": 0.3317, "loss/crossentropy": 2.0484461784362793, "loss/dist_ce": 0.0, "loss/hidden": 0.275390625, "loss/idx": 0.0, "loss/logits": 0.05626022815704346, "step": 515 }, { "epoch": 0.004252618739543585, "grad_norm": 3.546875, "grad_norm_var": 6.2305653889973955, "learning_rate": 5e-05, "loss": 0.1705, "loss/crossentropy": 1.2704802751541138, "loss/dist_ce": 0.0, "loss/hidden": 0.140625, "loss/idx": 0.0, "loss/logits": 0.029856139793992043, "step": 516 }, { "epoch": 0.004260860248728747, "grad_norm": 2.859375, "grad_norm_var": 6.175150553385417, "learning_rate": 5e-05, "loss": 0.2187, "loss/crossentropy": 1.2983540296554565, "loss/dist_ce": 0.0, "loss/hidden": 0.1826171875, "loss/idx": 0.0, "loss/logits": 0.03611702471971512, "step": 517 }, { "epoch": 0.0042691017579139096, "grad_norm": 2.390625, "grad_norm_var": 6.1290842692057295, "learning_rate": 5e-05, "loss": 0.1968, "loss/crossentropy": 2.924328088760376, "loss/dist_ce": 0.0, "loss/hidden": 0.150390625, "loss/idx": 0.0, "loss/logits": 0.04644050449132919, "step": 518 }, { "epoch": 0.0042773432670990714, "grad_norm": 2.453125, "grad_norm_var": 5.671439615885417, "learning_rate": 5e-05, "loss": 0.2026, "loss/crossentropy": 2.4892868995666504, "loss/dist_ce": 0.0, "loss/hidden": 0.1611328125, "loss/idx": 0.0, "loss/logits": 0.0414496548473835, "step": 519 }, { "epoch": 0.004285584776284233, "grad_norm": 2.453125, "grad_norm_var": 5.673607381184896, "learning_rate": 5e-05, "loss": 0.243, "loss/crossentropy": 0.931300163269043, "loss/dist_ce": 0.0, "loss/hidden": 0.1796875, "loss/idx": 0.0, "loss/logits": 0.06334017217159271, "step": 520 }, { "epoch": 0.004293826285469395, "grad_norm": 1.5390625, "grad_norm_var": 5.883624013264974, "learning_rate": 5e-05, "loss": 0.1502, "loss/crossentropy": 1.3612920045852661, "loss/dist_ce": 0.0, "loss/hidden": 0.12109375, "loss/idx": 0.0, "loss/logits": 0.02915302664041519, "step": 521 }, { "epoch": 0.004302067794654557, "grad_norm": 1.4609375, "grad_norm_var": 5.899733225504558, "learning_rate": 5e-05, "loss": 0.1209, "loss/crossentropy": 0.7956821918487549, "loss/dist_ce": 0.0, "loss/hidden": 0.10498046875, "loss/idx": 0.0, "loss/logits": 0.01589544117450714, "step": 522 }, { "epoch": 0.004310309303839719, "grad_norm": 8.9375, "grad_norm_var": 7.696473948160807, "learning_rate": 5e-05, "loss": 0.466, "loss/crossentropy": 2.8766582012176514, "loss/dist_ce": 0.0, "loss/hidden": 0.40234375, "loss/idx": 0.0, "loss/logits": 0.06366438418626785, "step": 523 }, { "epoch": 0.004318550813024881, "grad_norm": 10.75, "grad_norm_var": 10.611466217041016, "learning_rate": 5e-05, "loss": 0.2406, "loss/crossentropy": 1.433667540550232, "loss/dist_ce": 0.0, "loss/hidden": 0.201171875, "loss/idx": 0.0, "loss/logits": 0.03946308791637421, "step": 524 }, { "epoch": 0.004326792322210043, "grad_norm": 1.8828125, "grad_norm_var": 10.94590555826823, "learning_rate": 5e-05, "loss": 0.1674, "loss/crossentropy": 0.8868244290351868, "loss/dist_ce": 0.0, "loss/hidden": 0.134765625, "loss/idx": 0.0, "loss/logits": 0.03266747295856476, "step": 525 }, { "epoch": 0.004335033831395205, "grad_norm": 10.9375, "grad_norm_var": 13.659373982747395, "learning_rate": 5e-05, "loss": 0.2533, "loss/crossentropy": 1.5202522277832031, "loss/dist_ce": 0.0, "loss/hidden": 0.2236328125, "loss/idx": 0.0, "loss/logits": 0.02963770553469658, "step": 526 }, { "epoch": 0.0043432753405803675, "grad_norm": 2.5, "grad_norm_var": 13.684911092122396, "learning_rate": 5e-05, "loss": 0.1905, "loss/crossentropy": 2.2358787059783936, "loss/dist_ce": 0.0, "loss/hidden": 0.1494140625, "loss/idx": 0.0, "loss/logits": 0.041103295981884, "step": 527 }, { "epoch": 0.004351516849765529, "grad_norm": 5.65625, "grad_norm_var": 13.220444488525391, "learning_rate": 5e-05, "loss": 0.2986, "loss/crossentropy": 2.475597381591797, "loss/dist_ce": 0.0, "loss/hidden": 0.2333984375, "loss/idx": 0.0, "loss/logits": 0.06522452086210251, "step": 528 }, { "epoch": 0.004359758358950691, "grad_norm": 6.125, "grad_norm_var": 13.148850250244141, "learning_rate": 5e-05, "loss": 0.2751, "loss/crossentropy": 2.1268441677093506, "loss/dist_ce": 0.0, "loss/hidden": 0.2275390625, "loss/idx": 0.0, "loss/logits": 0.047553326934576035, "step": 529 }, { "epoch": 0.004367999868135853, "grad_norm": 4.1875, "grad_norm_var": 13.188008371988932, "learning_rate": 5e-05, "loss": 0.2687, "loss/crossentropy": 1.8196269273757935, "loss/dist_ce": 0.0, "loss/hidden": 0.22265625, "loss/idx": 0.0, "loss/logits": 0.04601012170314789, "step": 530 }, { "epoch": 0.004376241377321015, "grad_norm": 5.96875, "grad_norm_var": 10.102638498942058, "learning_rate": 5e-05, "loss": 0.4053, "loss/crossentropy": 2.056028127670288, "loss/dist_ce": 0.0, "loss/hidden": 0.291015625, "loss/idx": 0.0, "loss/logits": 0.11430098116397858, "step": 531 }, { "epoch": 0.004384482886506177, "grad_norm": 2.828125, "grad_norm_var": 10.236140696207682, "learning_rate": 5e-05, "loss": 0.1589, "loss/crossentropy": 1.7495375871658325, "loss/dist_ce": 0.0, "loss/hidden": 0.1298828125, "loss/idx": 0.0, "loss/logits": 0.02899114228785038, "step": 532 }, { "epoch": 0.004392724395691339, "grad_norm": 2.65625, "grad_norm_var": 10.28472671508789, "learning_rate": 5e-05, "loss": 0.177, "loss/crossentropy": 1.6073510646820068, "loss/dist_ce": 0.0, "loss/hidden": 0.142578125, "loss/idx": 0.0, "loss/logits": 0.034396156668663025, "step": 533 }, { "epoch": 0.004400965904876501, "grad_norm": 2.484375, "grad_norm_var": 10.258341217041016, "learning_rate": 5e-05, "loss": 0.1938, "loss/crossentropy": 1.503987193107605, "loss/dist_ce": 0.0, "loss/hidden": 0.154296875, "loss/idx": 0.0, "loss/logits": 0.03948179632425308, "step": 534 }, { "epoch": 0.004409207414061663, "grad_norm": 1.140625, "grad_norm_var": 10.733182525634765, "learning_rate": 5e-05, "loss": 0.1132, "loss/crossentropy": 1.3690646886825562, "loss/dist_ce": 0.0, "loss/hidden": 0.09375, "loss/idx": 0.0, "loss/logits": 0.019410330802202225, "step": 535 }, { "epoch": 0.004417448923246825, "grad_norm": 1.3671875, "grad_norm_var": 11.098802693684895, "learning_rate": 5e-05, "loss": 0.1406, "loss/crossentropy": 1.3874551057815552, "loss/dist_ce": 0.0, "loss/hidden": 0.1171875, "loss/idx": 0.0, "loss/logits": 0.023373104631900787, "step": 536 }, { "epoch": 0.004425690432431987, "grad_norm": 2.078125, "grad_norm_var": 10.911236317952474, "learning_rate": 5e-05, "loss": 0.1453, "loss/crossentropy": 1.0550464391708374, "loss/dist_ce": 0.0, "loss/hidden": 0.12353515625, "loss/idx": 0.0, "loss/logits": 0.021797355264425278, "step": 537 }, { "epoch": 0.004433931941617149, "grad_norm": 4.65625, "grad_norm_var": 10.28226318359375, "learning_rate": 5e-05, "loss": 0.229, "loss/crossentropy": 2.7873692512512207, "loss/dist_ce": 0.0, "loss/hidden": 0.177734375, "loss/idx": 0.0, "loss/logits": 0.05122203379869461, "step": 538 }, { "epoch": 0.004442173450802311, "grad_norm": 4.40625, "grad_norm_var": 8.965958658854166, "learning_rate": 5e-05, "loss": 0.3081, "loss/crossentropy": 2.410203695297241, "loss/dist_ce": 0.0, "loss/hidden": 0.25, "loss/idx": 0.0, "loss/logits": 0.058051109313964844, "step": 539 }, { "epoch": 0.004450414959987473, "grad_norm": 2.671875, "grad_norm_var": 6.152814737955729, "learning_rate": 5e-05, "loss": 0.1837, "loss/crossentropy": 1.0460572242736816, "loss/dist_ce": 0.0, "loss/hidden": 0.1591796875, "loss/idx": 0.0, "loss/logits": 0.024545256048440933, "step": 540 }, { "epoch": 0.004458656469172635, "grad_norm": 2.25, "grad_norm_var": 6.065093739827474, "learning_rate": 5e-05, "loss": 0.1368, "loss/crossentropy": 0.29145750403404236, "loss/dist_ce": 0.0, "loss/hidden": 0.126953125, "loss/idx": 0.0, "loss/logits": 0.00982777401804924, "step": 541 }, { "epoch": 0.004466897978357797, "grad_norm": 3.53125, "grad_norm_var": 2.513854726155599, "learning_rate": 5e-05, "loss": 0.1929, "loss/crossentropy": 1.3329319953918457, "loss/dist_ce": 0.0, "loss/hidden": 0.162109375, "loss/idx": 0.0, "loss/logits": 0.030750418081879616, "step": 542 }, { "epoch": 0.004475139487542959, "grad_norm": 3.265625, "grad_norm_var": 2.457928212483724, "learning_rate": 5e-05, "loss": 0.2152, "loss/crossentropy": 2.0753843784332275, "loss/dist_ce": 0.0, "loss/hidden": 0.169921875, "loss/idx": 0.0, "loss/logits": 0.045312896370887756, "step": 543 }, { "epoch": 0.0044833809967281205, "grad_norm": 3.1875, "grad_norm_var": 2.114135487874349, "learning_rate": 5e-05, "loss": 0.1793, "loss/crossentropy": 1.0670427083969116, "loss/dist_ce": 0.0, "loss/hidden": 0.1455078125, "loss/idx": 0.0, "loss/logits": 0.033770665526390076, "step": 544 }, { "epoch": 0.004491622505913282, "grad_norm": 1.6328125, "grad_norm_var": 1.6834879557291667, "learning_rate": 5e-05, "loss": 0.1622, "loss/crossentropy": 1.4867587089538574, "loss/dist_ce": 0.0, "loss/hidden": 0.1337890625, "loss/idx": 0.0, "loss/logits": 0.02845672518014908, "step": 545 }, { "epoch": 0.004499864015098445, "grad_norm": 2.859375, "grad_norm_var": 1.5869049072265624, "learning_rate": 5e-05, "loss": 0.1723, "loss/crossentropy": 1.544826865196228, "loss/dist_ce": 0.0, "loss/hidden": 0.146484375, "loss/idx": 0.0, "loss/logits": 0.025784984230995178, "step": 546 }, { "epoch": 0.004508105524283607, "grad_norm": 2.1875, "grad_norm_var": 0.9517730712890625, "learning_rate": 5e-05, "loss": 0.2222, "loss/crossentropy": 2.5712733268737793, "loss/dist_ce": 0.0, "loss/hidden": 0.16796875, "loss/idx": 0.0, "loss/logits": 0.05422845110297203, "step": 547 }, { "epoch": 0.004516347033468769, "grad_norm": 1.6875, "grad_norm_var": 1.0136311848958333, "learning_rate": 5e-05, "loss": 0.1554, "loss/crossentropy": 1.6466069221496582, "loss/dist_ce": 0.0, "loss/hidden": 0.1279296875, "loss/idx": 0.0, "loss/logits": 0.027493983507156372, "step": 548 }, { "epoch": 0.004524588542653931, "grad_norm": 1.5546875, "grad_norm_var": 1.085455067952474, "learning_rate": 5e-05, "loss": 0.1578, "loss/crossentropy": 2.487321615219116, "loss/dist_ce": 0.0, "loss/hidden": 0.12451171875, "loss/idx": 0.0, "loss/logits": 0.03331330791115761, "step": 549 }, { "epoch": 0.004532830051839093, "grad_norm": 2.6875, "grad_norm_var": 1.0859840393066407, "learning_rate": 5e-05, "loss": 0.2285, "loss/crossentropy": 2.7870802879333496, "loss/dist_ce": 0.0, "loss/hidden": 0.173828125, "loss/idx": 0.0, "loss/logits": 0.054664455354213715, "step": 550 }, { "epoch": 0.004541071561024255, "grad_norm": 1.6875, "grad_norm_var": 1.000249989827474, "learning_rate": 5e-05, "loss": 0.1342, "loss/crossentropy": 1.667282223701477, "loss/dist_ce": 0.0, "loss/hidden": 0.1123046875, "loss/idx": 0.0, "loss/logits": 0.02186622843146324, "step": 551 }, { "epoch": 0.0045493130702094165, "grad_norm": 1.265625, "grad_norm_var": 1.0176829020182292, "learning_rate": 5e-05, "loss": 0.1424, "loss/crossentropy": 2.4660463333129883, "loss/dist_ce": 0.0, "loss/hidden": 0.11328125, "loss/idx": 0.0, "loss/logits": 0.029071927070617676, "step": 552 }, { "epoch": 0.0045575545793945784, "grad_norm": 1.4609375, "grad_norm_var": 1.084484608968099, "learning_rate": 5e-05, "loss": 0.1525, "loss/crossentropy": 2.228982925415039, "loss/dist_ce": 0.0, "loss/hidden": 0.1220703125, "loss/idx": 0.0, "loss/logits": 0.030380506068468094, "step": 553 }, { "epoch": 0.00456579608857974, "grad_norm": 1.4296875, "grad_norm_var": 0.8341949462890625, "learning_rate": 5e-05, "loss": 0.1534, "loss/crossentropy": 2.0212230682373047, "loss/dist_ce": 0.0, "loss/hidden": 0.1171875, "loss/idx": 0.0, "loss/logits": 0.0362619124352932, "step": 554 }, { "epoch": 0.004574037597764903, "grad_norm": 2.890625, "grad_norm_var": 0.5643229166666667, "learning_rate": 5e-05, "loss": 0.171, "loss/crossentropy": 1.549071192741394, "loss/dist_ce": 0.0, "loss/hidden": 0.1455078125, "loss/idx": 0.0, "loss/logits": 0.02545047551393509, "step": 555 }, { "epoch": 0.004582279106950065, "grad_norm": 4.3125, "grad_norm_var": 0.8214182535807292, "learning_rate": 5e-05, "loss": 0.3118, "loss/crossentropy": 2.652635097503662, "loss/dist_ce": 0.0, "loss/hidden": 0.25390625, "loss/idx": 0.0, "loss/logits": 0.057920753955841064, "step": 556 }, { "epoch": 0.004590520616135227, "grad_norm": 1.140625, "grad_norm_var": 0.9158162434895833, "learning_rate": 5e-05, "loss": 0.1287, "loss/crossentropy": 1.6453478336334229, "loss/dist_ce": 0.0, "loss/hidden": 0.1064453125, "loss/idx": 0.0, "loss/logits": 0.022242678329348564, "step": 557 }, { "epoch": 0.004598762125320389, "grad_norm": 3.265625, "grad_norm_var": 0.8765777587890625, "learning_rate": 5e-05, "loss": 0.2774, "loss/crossentropy": 1.622141718864441, "loss/dist_ce": 0.0, "loss/hidden": 0.2373046875, "loss/idx": 0.0, "loss/logits": 0.04007745534181595, "step": 558 }, { "epoch": 0.004607003634505551, "grad_norm": 1.859375, "grad_norm_var": 0.8157867431640625, "learning_rate": 5e-05, "loss": 0.1513, "loss/crossentropy": 1.737197995185852, "loss/dist_ce": 0.0, "loss/hidden": 0.123046875, "loss/idx": 0.0, "loss/logits": 0.0283003281801939, "step": 559 }, { "epoch": 0.0046152451436907126, "grad_norm": 2.328125, "grad_norm_var": 0.74814453125, "learning_rate": 5e-05, "loss": 0.1046, "loss/crossentropy": 0.2462574690580368, "loss/dist_ce": 0.0, "loss/hidden": 0.0966796875, "loss/idx": 0.0, "loss/logits": 0.00796109065413475, "step": 560 }, { "epoch": 0.0046234866528758745, "grad_norm": 3.421875, "grad_norm_var": 0.8270566304524739, "learning_rate": 5e-05, "loss": 0.2059, "loss/crossentropy": 2.6141371726989746, "loss/dist_ce": 0.0, "loss/hidden": 0.162109375, "loss/idx": 0.0, "loss/logits": 0.043790802359580994, "step": 561 }, { "epoch": 0.004631728162061036, "grad_norm": 3.421875, "grad_norm_var": 0.892352040608724, "learning_rate": 5e-05, "loss": 0.1974, "loss/crossentropy": 1.422098994255066, "loss/dist_ce": 0.0, "loss/hidden": 0.15625, "loss/idx": 0.0, "loss/logits": 0.0411381721496582, "step": 562 }, { "epoch": 0.004639969671246198, "grad_norm": 6.125, "grad_norm_var": 1.808794911702474, "learning_rate": 5e-05, "loss": 0.3067, "loss/crossentropy": 2.6955533027648926, "loss/dist_ce": 0.0, "loss/hidden": 0.232421875, "loss/idx": 0.0, "loss/logits": 0.07427150756120682, "step": 563 }, { "epoch": 0.004648211180431361, "grad_norm": 3.28125, "grad_norm_var": 1.787731679280599, "learning_rate": 5e-05, "loss": 0.2236, "loss/crossentropy": 2.841552972793579, "loss/dist_ce": 0.0, "loss/hidden": 0.181640625, "loss/idx": 0.0, "loss/logits": 0.041970379650592804, "step": 564 }, { "epoch": 0.004656452689616523, "grad_norm": 3.453125, "grad_norm_var": 1.7399617513020833, "learning_rate": 5e-05, "loss": 0.1892, "loss/crossentropy": 2.6695666313171387, "loss/dist_ce": 0.0, "loss/hidden": 0.1494140625, "loss/idx": 0.0, "loss/logits": 0.039814580231904984, "step": 565 }, { "epoch": 0.004664694198801685, "grad_norm": 23.0, "grad_norm_var": 27.352754720052083, "learning_rate": 5e-05, "loss": 0.3797, "loss/crossentropy": 2.7561914920806885, "loss/dist_ce": 0.0, "loss/hidden": 0.306640625, "loss/idx": 0.0, "loss/logits": 0.0730535015463829, "step": 566 }, { "epoch": 0.004672935707986847, "grad_norm": 4.125, "grad_norm_var": 26.965547688802083, "learning_rate": 5e-05, "loss": 0.2569, "loss/crossentropy": 1.6983141899108887, "loss/dist_ce": 0.0, "loss/hidden": 0.212890625, "loss/idx": 0.0, "loss/logits": 0.04405267536640167, "step": 567 }, { "epoch": 0.004681177217172009, "grad_norm": 3.28125, "grad_norm_var": 26.437889607747397, "learning_rate": 5e-05, "loss": 0.1744, "loss/crossentropy": 1.50763738155365, "loss/dist_ce": 0.0, "loss/hidden": 0.14453125, "loss/idx": 0.0, "loss/logits": 0.029890574514865875, "step": 568 }, { "epoch": 0.0046894187263571705, "grad_norm": 4.1875, "grad_norm_var": 25.870477040608723, "learning_rate": 5e-05, "loss": 0.2786, "loss/crossentropy": 2.469428300857544, "loss/dist_ce": 0.0, "loss/hidden": 0.216796875, "loss/idx": 0.0, "loss/logits": 0.06181073188781738, "step": 569 }, { "epoch": 0.004697660235542332, "grad_norm": 1.4140625, "grad_norm_var": 25.87682673136393, "learning_rate": 5e-05, "loss": 0.1614, "loss/crossentropy": 1.521830439567566, "loss/dist_ce": 0.0, "loss/hidden": 0.1328125, "loss/idx": 0.0, "loss/logits": 0.02856556512415409, "step": 570 }, { "epoch": 0.004705901744727494, "grad_norm": 3.109375, "grad_norm_var": 25.83377456665039, "learning_rate": 5e-05, "loss": 0.1871, "loss/crossentropy": 2.5410780906677246, "loss/dist_ce": 0.0, "loss/hidden": 0.1484375, "loss/idx": 0.0, "loss/logits": 0.038688138127326965, "step": 571 }, { "epoch": 0.004714143253912656, "grad_norm": 20.25, "grad_norm_var": 41.34689712524414, "learning_rate": 5e-05, "loss": 0.3154, "loss/crossentropy": 0.9852694272994995, "loss/dist_ce": 0.0, "loss/hidden": 0.27734375, "loss/idx": 0.0, "loss/logits": 0.0380379781126976, "step": 572 }, { "epoch": 0.004722384763097818, "grad_norm": 6.46875, "grad_norm_var": 40.0391476949056, "learning_rate": 5e-05, "loss": 0.3102, "loss/crossentropy": 2.0467264652252197, "loss/dist_ce": 0.0, "loss/hidden": 0.25, "loss/idx": 0.0, "loss/logits": 0.06023257598280907, "step": 573 }, { "epoch": 0.004730626272282981, "grad_norm": 4.40625, "grad_norm_var": 39.73319880167643, "learning_rate": 5e-05, "loss": 0.2448, "loss/crossentropy": 2.768284320831299, "loss/dist_ce": 0.0, "loss/hidden": 0.1875, "loss/idx": 0.0, "loss/logits": 0.057292141020298004, "step": 574 }, { "epoch": 0.004738867781468143, "grad_norm": 3.25, "grad_norm_var": 39.10796076456706, "learning_rate": 5e-05, "loss": 0.1371, "loss/crossentropy": 0.4907649755477905, "loss/dist_ce": 0.0, "loss/hidden": 0.125, "loss/idx": 0.0, "loss/logits": 0.01206925604492426, "step": 575 }, { "epoch": 0.004747109290653305, "grad_norm": 1.1328125, "grad_norm_var": 39.77771708170573, "learning_rate": 5e-05, "loss": 0.1305, "loss/crossentropy": 1.4613217115402222, "loss/dist_ce": 0.0, "loss/hidden": 0.109375, "loss/idx": 0.0, "loss/logits": 0.021137792617082596, "step": 576 }, { "epoch": 0.0047553507998384665, "grad_norm": 1.4453125, "grad_norm_var": 40.673797353108725, "learning_rate": 5e-05, "loss": 0.1593, "loss/crossentropy": 2.2154600620269775, "loss/dist_ce": 0.0, "loss/hidden": 0.12890625, "loss/idx": 0.0, "loss/logits": 0.030367335304617882, "step": 577 }, { "epoch": 0.004763592309023628, "grad_norm": 2.40625, "grad_norm_var": 41.05650812784831, "learning_rate": 5e-05, "loss": 0.1951, "loss/crossentropy": 2.493523120880127, "loss/dist_ce": 0.0, "loss/hidden": 0.1513671875, "loss/idx": 0.0, "loss/logits": 0.04371439293026924, "step": 578 }, { "epoch": 0.00477183381820879, "grad_norm": 2.28125, "grad_norm_var": 41.76645075480143, "learning_rate": 5e-05, "loss": 0.1766, "loss/crossentropy": 2.818694829940796, "loss/dist_ce": 0.0, "loss/hidden": 0.134765625, "loss/idx": 0.0, "loss/logits": 0.04187324270606041, "step": 579 }, { "epoch": 0.004780075327393952, "grad_norm": 2.5625, "grad_norm_var": 42.00832697550456, "learning_rate": 5e-05, "loss": 0.213, "loss/crossentropy": 3.1531453132629395, "loss/dist_ce": 0.0, "loss/hidden": 0.1630859375, "loss/idx": 0.0, "loss/logits": 0.049924593418836594, "step": 580 }, { "epoch": 0.004788316836579114, "grad_norm": 2.359375, "grad_norm_var": 42.370418039957684, "learning_rate": 5e-05, "loss": 0.1859, "loss/crossentropy": 2.076953172683716, "loss/dist_ce": 0.0, "loss/hidden": 0.14453125, "loss/idx": 0.0, "loss/logits": 0.041379667818546295, "step": 581 }, { "epoch": 0.004796558345764276, "grad_norm": 2.328125, "grad_norm_var": 20.444233957926432, "learning_rate": 5e-05, "loss": 0.1769, "loss/crossentropy": 2.5741026401519775, "loss/dist_ce": 0.0, "loss/hidden": 0.1396484375, "loss/idx": 0.0, "loss/logits": 0.03728485107421875, "step": 582 }, { "epoch": 0.004804799854949439, "grad_norm": 2.875, "grad_norm_var": 20.5315549214681, "learning_rate": 5e-05, "loss": 0.1681, "loss/crossentropy": 2.4309182167053223, "loss/dist_ce": 0.0, "loss/hidden": 0.134765625, "loss/idx": 0.0, "loss/logits": 0.033366359770298004, "step": 583 }, { "epoch": 0.004813041364134601, "grad_norm": 0.94921875, "grad_norm_var": 21.090232785542806, "learning_rate": 5e-05, "loss": 0.1001, "loss/crossentropy": 0.35457542538642883, "loss/dist_ce": 0.0, "loss/hidden": 0.09228515625, "loss/idx": 0.0, "loss/logits": 0.007841967046260834, "step": 584 }, { "epoch": 0.0048212828733197625, "grad_norm": 6.71875, "grad_norm_var": 21.60826562245687, "learning_rate": 5e-05, "loss": 0.3594, "loss/crossentropy": 2.4798266887664795, "loss/dist_ce": 0.0, "loss/hidden": 0.28515625, "loss/idx": 0.0, "loss/logits": 0.07420751452445984, "step": 585 }, { "epoch": 0.004829524382504924, "grad_norm": 4.15625, "grad_norm_var": 21.133738644917806, "learning_rate": 5e-05, "loss": 0.2067, "loss/crossentropy": 2.589935064315796, "loss/dist_ce": 0.0, "loss/hidden": 0.166015625, "loss/idx": 0.0, "loss/logits": 0.04071066156029701, "step": 586 }, { "epoch": 0.004837765891690086, "grad_norm": 2.640625, "grad_norm_var": 21.213679440816243, "learning_rate": 5e-05, "loss": 0.1794, "loss/crossentropy": 2.3674118518829346, "loss/dist_ce": 0.0, "loss/hidden": 0.1396484375, "loss/idx": 0.0, "loss/logits": 0.03970456123352051, "step": 587 }, { "epoch": 0.004846007400875248, "grad_norm": 3.921875, "grad_norm_var": 2.8025491714477537, "learning_rate": 5e-05, "loss": 0.1419, "loss/crossentropy": 0.37833818793296814, "loss/dist_ce": 0.0, "loss/hidden": 0.126953125, "loss/idx": 0.0, "loss/logits": 0.01495468057692051, "step": 588 }, { "epoch": 0.00485424891006041, "grad_norm": 5.21875, "grad_norm_var": 2.3418965021769207, "learning_rate": 5e-05, "loss": 0.2142, "loss/crossentropy": 2.5115513801574707, "loss/dist_ce": 0.0, "loss/hidden": 0.171875, "loss/idx": 0.0, "loss/logits": 0.04236599802970886, "step": 589 }, { "epoch": 0.004862490419245572, "grad_norm": 1.421875, "grad_norm_var": 2.3552057266235353, "learning_rate": 5e-05, "loss": 0.1423, "loss/crossentropy": 1.5357518196105957, "loss/dist_ce": 0.0, "loss/hidden": 0.1162109375, "loss/idx": 0.0, "loss/logits": 0.026050515472888947, "step": 590 }, { "epoch": 0.004870731928430734, "grad_norm": 2.59375, "grad_norm_var": 2.3474939346313475, "learning_rate": 5e-05, "loss": 0.1597, "loss/crossentropy": 2.9540340900421143, "loss/dist_ce": 0.0, "loss/hidden": 0.125, "loss/idx": 0.0, "loss/logits": 0.03472492843866348, "step": 591 }, { "epoch": 0.004878973437615897, "grad_norm": 1.40625, "grad_norm_var": 2.2909016291300457, "learning_rate": 5e-05, "loss": 0.127, "loss/crossentropy": 2.3022918701171875, "loss/dist_ce": 0.0, "loss/hidden": 0.10302734375, "loss/idx": 0.0, "loss/logits": 0.024019837379455566, "step": 592 }, { "epoch": 0.0048872149468010585, "grad_norm": 1.6171875, "grad_norm_var": 2.261008135477702, "learning_rate": 5e-05, "loss": 0.1432, "loss/crossentropy": 0.7338389158248901, "loss/dist_ce": 0.0, "loss/hidden": 0.12451171875, "loss/idx": 0.0, "loss/logits": 0.018692631274461746, "step": 593 }, { "epoch": 0.00489545645598622, "grad_norm": 2.046875, "grad_norm_var": 2.2899148941040037, "learning_rate": 5e-05, "loss": 0.1657, "loss/crossentropy": 2.504016160964966, "loss/dist_ce": 0.0, "loss/hidden": 0.12890625, "loss/idx": 0.0, "loss/logits": 0.036834657192230225, "step": 594 }, { "epoch": 0.004903697965171382, "grad_norm": 3.953125, "grad_norm_var": 2.344827715555827, "learning_rate": 5e-05, "loss": 0.1572, "loss/crossentropy": 1.7626408338546753, "loss/dist_ce": 0.0, "loss/hidden": 0.1279296875, "loss/idx": 0.0, "loss/logits": 0.029316924512386322, "step": 595 }, { "epoch": 0.004911939474356544, "grad_norm": 1.953125, "grad_norm_var": 2.3973347345987954, "learning_rate": 5e-05, "loss": 0.141, "loss/crossentropy": 1.9684767723083496, "loss/dist_ce": 0.0, "loss/hidden": 0.1142578125, "loss/idx": 0.0, "loss/logits": 0.026772357523441315, "step": 596 }, { "epoch": 0.004920180983541706, "grad_norm": 10.625, "grad_norm_var": 6.088076210021972, "learning_rate": 5e-05, "loss": 0.2187, "loss/crossentropy": 1.9169155359268188, "loss/dist_ce": 0.0, "loss/hidden": 0.1865234375, "loss/idx": 0.0, "loss/logits": 0.03213420510292053, "step": 597 }, { "epoch": 0.004928422492726868, "grad_norm": 4.5625, "grad_norm_var": 6.080293718973795, "learning_rate": 5e-05, "loss": 0.1759, "loss/crossentropy": 1.367024540901184, "loss/dist_ce": 0.0, "loss/hidden": 0.142578125, "loss/idx": 0.0, "loss/logits": 0.03334027901291847, "step": 598 }, { "epoch": 0.00493666400191203, "grad_norm": 2.4375, "grad_norm_var": 6.131121762593588, "learning_rate": 5e-05, "loss": 0.2373, "loss/crossentropy": 2.452223300933838, "loss/dist_ce": 0.0, "loss/hidden": 0.19921875, "loss/idx": 0.0, "loss/logits": 0.038047999143600464, "step": 599 }, { "epoch": 0.004944905511097192, "grad_norm": 3.859375, "grad_norm_var": 5.665278879801432, "learning_rate": 5e-05, "loss": 0.2239, "loss/crossentropy": 1.2548903226852417, "loss/dist_ce": 0.0, "loss/hidden": 0.1982421875, "loss/idx": 0.0, "loss/logits": 0.025662653148174286, "step": 600 }, { "epoch": 0.004953147020282354, "grad_norm": 3.265625, "grad_norm_var": 5.018717193603516, "learning_rate": 5e-05, "loss": 0.2458, "loss/crossentropy": 2.334947347640991, "loss/dist_ce": 0.0, "loss/hidden": 0.1875, "loss/idx": 0.0, "loss/logits": 0.058346427977085114, "step": 601 }, { "epoch": 0.0049613885294675164, "grad_norm": 1.140625, "grad_norm_var": 5.315175120035807, "learning_rate": 5e-05, "loss": 0.0935, "loss/crossentropy": 0.5309077501296997, "loss/dist_ce": 0.0, "loss/hidden": 0.0849609375, "loss/idx": 0.0, "loss/logits": 0.008535758592188358, "step": 602 }, { "epoch": 0.004969630038652678, "grad_norm": 2.859375, "grad_norm_var": 5.299181874593099, "learning_rate": 5e-05, "loss": 0.2036, "loss/crossentropy": 2.4964077472686768, "loss/dist_ce": 0.0, "loss/hidden": 0.158203125, "loss/idx": 0.0, "loss/logits": 0.045372504740953445, "step": 603 }, { "epoch": 0.00497787154783784, "grad_norm": 1.515625, "grad_norm_var": 5.463201649983724, "learning_rate": 5e-05, "loss": 0.1444, "loss/crossentropy": 1.4457087516784668, "loss/dist_ce": 0.0, "loss/hidden": 0.119140625, "loss/idx": 0.0, "loss/logits": 0.025238193571567535, "step": 604 }, { "epoch": 0.004986113057023002, "grad_norm": 2.34375, "grad_norm_var": 5.188616689046224, "learning_rate": 5e-05, "loss": 0.164, "loss/crossentropy": 1.5325767993927002, "loss/dist_ce": 0.0, "loss/hidden": 0.130859375, "loss/idx": 0.0, "loss/logits": 0.033134825527668, "step": 605 }, { "epoch": 0.004994354566208164, "grad_norm": 2.875, "grad_norm_var": 5.019653065999349, "learning_rate": 5e-05, "loss": 0.2116, "loss/crossentropy": 1.964009404182434, "loss/dist_ce": 0.0, "loss/hidden": 0.1494140625, "loss/idx": 0.0, "loss/logits": 0.06218406930565834, "step": 606 }, { "epoch": 0.005002596075393326, "grad_norm": 1.796875, "grad_norm_var": 5.109509023030599, "learning_rate": 5e-05, "loss": 0.1477, "loss/crossentropy": 1.4914216995239258, "loss/dist_ce": 0.0, "loss/hidden": 0.12353515625, "loss/idx": 0.0, "loss/logits": 0.024151228368282318, "step": 607 }, { "epoch": 0.005010837584578488, "grad_norm": 2.6875, "grad_norm_var": 4.937090810139974, "learning_rate": 5e-05, "loss": 0.167, "loss/crossentropy": 1.5185096263885498, "loss/dist_ce": 0.0, "loss/hidden": 0.140625, "loss/idx": 0.0, "loss/logits": 0.02637227438390255, "step": 608 }, { "epoch": 0.00501907909376365, "grad_norm": 3.21875, "grad_norm_var": 4.781574503580729, "learning_rate": 5e-05, "loss": 0.2422, "loss/crossentropy": 2.671537160873413, "loss/dist_ce": 0.0, "loss/hidden": 0.1904296875, "loss/idx": 0.0, "loss/logits": 0.05179464817047119, "step": 609 }, { "epoch": 0.005027320602948812, "grad_norm": 2.640625, "grad_norm_var": 4.712612915039062, "learning_rate": 5e-05, "loss": 0.1652, "loss/crossentropy": 1.1440718173980713, "loss/dist_ce": 0.0, "loss/hidden": 0.14453125, "loss/idx": 0.0, "loss/logits": 0.020696066319942474, "step": 610 }, { "epoch": 0.005035562112133974, "grad_norm": 2.265625, "grad_norm_var": 4.7286529541015625, "learning_rate": 5e-05, "loss": 0.1605, "loss/crossentropy": 0.47643014788627625, "loss/dist_ce": 0.0, "loss/hidden": 0.14453125, "loss/idx": 0.0, "loss/logits": 0.016013499349355698, "step": 611 }, { "epoch": 0.005043803621319136, "grad_norm": 1.7421875, "grad_norm_var": 4.764475250244141, "learning_rate": 5e-05, "loss": 0.1235, "loss/crossentropy": 1.5909593105316162, "loss/dist_ce": 0.0, "loss/hidden": 0.10546875, "loss/idx": 0.0, "loss/logits": 0.018061984330415726, "step": 612 }, { "epoch": 0.005052045130504298, "grad_norm": 1.4765625, "grad_norm_var": 0.8343994140625, "learning_rate": 5e-05, "loss": 0.1189, "loss/crossentropy": 2.336538076400757, "loss/dist_ce": 0.0, "loss/hidden": 0.0966796875, "loss/idx": 0.0, "loss/logits": 0.022249765694141388, "step": 613 }, { "epoch": 0.00506028663968946, "grad_norm": 4.9375, "grad_norm_var": 0.9441650390625, "learning_rate": 5e-05, "loss": 0.2263, "loss/crossentropy": 1.665325403213501, "loss/dist_ce": 0.0, "loss/hidden": 0.189453125, "loss/idx": 0.0, "loss/logits": 0.036880407482385635, "step": 614 }, { "epoch": 0.005068528148874622, "grad_norm": 3.765625, "grad_norm_var": 1.0315826416015625, "learning_rate": 5e-05, "loss": 0.2465, "loss/crossentropy": 1.6349374055862427, "loss/dist_ce": 0.0, "loss/hidden": 0.185546875, "loss/idx": 0.0, "loss/logits": 0.06092265248298645, "step": 615 }, { "epoch": 0.005076769658059784, "grad_norm": 3.8125, "grad_norm_var": 1.02415771484375, "learning_rate": 5e-05, "loss": 0.2903, "loss/crossentropy": 2.2787890434265137, "loss/dist_ce": 0.0, "loss/hidden": 0.2177734375, "loss/idx": 0.0, "loss/logits": 0.07248455286026001, "step": 616 }, { "epoch": 0.005085011167244946, "grad_norm": 1.421875, "grad_norm_var": 1.0844156901041666, "learning_rate": 5e-05, "loss": 0.1222, "loss/crossentropy": 0.9886749982833862, "loss/dist_ce": 0.0, "loss/hidden": 0.10595703125, "loss/idx": 0.0, "loss/logits": 0.016225244849920273, "step": 617 }, { "epoch": 0.005093252676430108, "grad_norm": 2.703125, "grad_norm_var": 0.9472900390625, "learning_rate": 5e-05, "loss": 0.217, "loss/crossentropy": 2.012622594833374, "loss/dist_ce": 0.0, "loss/hidden": 0.171875, "loss/idx": 0.0, "loss/logits": 0.0451560840010643, "step": 618 }, { "epoch": 0.0051014941856152695, "grad_norm": 1.9375, "grad_norm_var": 0.9720774332682292, "learning_rate": 5e-05, "loss": 0.1792, "loss/crossentropy": 2.2141380310058594, "loss/dist_ce": 0.0, "loss/hidden": 0.1435546875, "loss/idx": 0.0, "loss/logits": 0.03566785901784897, "step": 619 }, { "epoch": 0.005109735694800432, "grad_norm": 1.9765625, "grad_norm_var": 0.9204770406087239, "learning_rate": 5e-05, "loss": 0.1819, "loss/crossentropy": 2.5171334743499756, "loss/dist_ce": 0.0, "loss/hidden": 0.1474609375, "loss/idx": 0.0, "loss/logits": 0.03444764018058777, "step": 620 }, { "epoch": 0.005117977203985594, "grad_norm": 2.171875, "grad_norm_var": 0.9281979878743489, "learning_rate": 5e-05, "loss": 0.1252, "loss/crossentropy": 1.3747243881225586, "loss/dist_ce": 0.0, "loss/hidden": 0.1025390625, "loss/idx": 0.0, "loss/logits": 0.022622188553214073, "step": 621 }, { "epoch": 0.005126218713170756, "grad_norm": 3.5625, "grad_norm_var": 0.9839230855305989, "learning_rate": 5e-05, "loss": 0.1941, "loss/crossentropy": 1.6743167638778687, "loss/dist_ce": 0.0, "loss/hidden": 0.15625, "loss/idx": 0.0, "loss/logits": 0.03780867159366608, "step": 622 }, { "epoch": 0.005134460222355918, "grad_norm": 1.5703125, "grad_norm_var": 1.0123687744140626, "learning_rate": 5e-05, "loss": 0.1265, "loss/crossentropy": 2.2040176391601562, "loss/dist_ce": 0.0, "loss/hidden": 0.09814453125, "loss/idx": 0.0, "loss/logits": 0.028337322175502777, "step": 623 }, { "epoch": 0.00514270173154108, "grad_norm": 2.5625, "grad_norm_var": 1.0121897379557292, "learning_rate": 5e-05, "loss": 0.1402, "loss/crossentropy": 0.7447776794433594, "loss/dist_ce": 0.0, "loss/hidden": 0.1240234375, "loss/idx": 0.0, "loss/logits": 0.016176920384168625, "step": 624 }, { "epoch": 0.005150943240726242, "grad_norm": 1.6953125, "grad_norm_var": 1.0336626688639323, "learning_rate": 5e-05, "loss": 0.1657, "loss/crossentropy": 1.4739638566970825, "loss/dist_ce": 0.0, "loss/hidden": 0.134765625, "loss/idx": 0.0, "loss/logits": 0.03092530183494091, "step": 625 }, { "epoch": 0.005159184749911404, "grad_norm": 2.484375, "grad_norm_var": 1.032574208577474, "learning_rate": 5e-05, "loss": 0.1703, "loss/crossentropy": 2.6135871410369873, "loss/dist_ce": 0.0, "loss/hidden": 0.1318359375, "loss/idx": 0.0, "loss/logits": 0.03844151645898819, "step": 626 }, { "epoch": 0.0051674262590965655, "grad_norm": 2.75, "grad_norm_var": 1.0317543029785157, "learning_rate": 5e-05, "loss": 0.1908, "loss/crossentropy": 2.5848896503448486, "loss/dist_ce": 0.0, "loss/hidden": 0.15234375, "loss/idx": 0.0, "loss/logits": 0.03841109946370125, "step": 627 }, { "epoch": 0.005175667768281727, "grad_norm": 1.9375, "grad_norm_var": 1.0134755452473958, "learning_rate": 5e-05, "loss": 0.1499, "loss/crossentropy": 0.32518985867500305, "loss/dist_ce": 0.0, "loss/hidden": 0.1396484375, "loss/idx": 0.0, "loss/logits": 0.010256130248308182, "step": 628 }, { "epoch": 0.005183909277466889, "grad_norm": 3.296875, "grad_norm_var": 0.9605608622233073, "learning_rate": 5e-05, "loss": 0.1996, "loss/crossentropy": 2.3065407276153564, "loss/dist_ce": 0.0, "loss/hidden": 0.1640625, "loss/idx": 0.0, "loss/logits": 0.035511888563632965, "step": 629 }, { "epoch": 0.005192150786652052, "grad_norm": 1.828125, "grad_norm_var": 0.6212827046712239, "learning_rate": 5e-05, "loss": 0.1813, "loss/crossentropy": 2.5903186798095703, "loss/dist_ce": 0.0, "loss/hidden": 0.140625, "loss/idx": 0.0, "loss/logits": 0.040679510682821274, "step": 630 }, { "epoch": 0.005200392295837214, "grad_norm": 2.828125, "grad_norm_var": 0.5139218648274739, "learning_rate": 5e-05, "loss": 0.1537, "loss/crossentropy": 1.5172139406204224, "loss/dist_ce": 0.0, "loss/hidden": 0.1240234375, "loss/idx": 0.0, "loss/logits": 0.029673364013433456, "step": 631 }, { "epoch": 0.005208633805022376, "grad_norm": 2.765625, "grad_norm_var": 0.3864702860514323, "learning_rate": 5e-05, "loss": 0.1728, "loss/crossentropy": 1.9679391384124756, "loss/dist_ce": 0.0, "loss/hidden": 0.1396484375, "loss/idx": 0.0, "loss/logits": 0.03311806917190552, "step": 632 }, { "epoch": 0.005216875314207538, "grad_norm": 1.453125, "grad_norm_var": 0.38269220987955727, "learning_rate": 5e-05, "loss": 0.1564, "loss/crossentropy": 1.8976471424102783, "loss/dist_ce": 0.0, "loss/hidden": 0.1259765625, "loss/idx": 0.0, "loss/logits": 0.030387457460165024, "step": 633 }, { "epoch": 0.0052251168233927, "grad_norm": 1.4375, "grad_norm_var": 0.4224077860514323, "learning_rate": 5e-05, "loss": 0.1537, "loss/crossentropy": 2.396242618560791, "loss/dist_ce": 0.0, "loss/hidden": 0.1220703125, "loss/idx": 0.0, "loss/logits": 0.03165833652019501, "step": 634 }, { "epoch": 0.0052333583325778615, "grad_norm": 3.296875, "grad_norm_var": 0.4783404032389323, "learning_rate": 5e-05, "loss": 0.2148, "loss/crossentropy": 2.8331291675567627, "loss/dist_ce": 0.0, "loss/hidden": 0.1669921875, "loss/idx": 0.0, "loss/logits": 0.04783923923969269, "step": 635 }, { "epoch": 0.005241599841763023, "grad_norm": 2.640625, "grad_norm_var": 0.47274169921875, "learning_rate": 5e-05, "loss": 0.1847, "loss/crossentropy": 0.8493193984031677, "loss/dist_ce": 0.0, "loss/hidden": 0.15625, "loss/idx": 0.0, "loss/logits": 0.028483962640166283, "step": 636 }, { "epoch": 0.005249841350948185, "grad_norm": 2.203125, "grad_norm_var": 0.4718831380208333, "learning_rate": 5e-05, "loss": 0.157, "loss/crossentropy": 2.3299150466918945, "loss/dist_ce": 0.0, "loss/hidden": 0.125, "loss/idx": 0.0, "loss/logits": 0.032000426203012466, "step": 637 }, { "epoch": 0.005258082860133347, "grad_norm": 2.796875, "grad_norm_var": 0.3892893473307292, "learning_rate": 5e-05, "loss": 0.181, "loss/crossentropy": 1.8806333541870117, "loss/dist_ce": 0.0, "loss/hidden": 0.14453125, "loss/idx": 0.0, "loss/logits": 0.03642831742763519, "step": 638 }, { "epoch": 0.00526632436931851, "grad_norm": 3.828125, "grad_norm_var": 0.4741778055826823, "learning_rate": 5e-05, "loss": 0.2628, "loss/crossentropy": 2.58074951171875, "loss/dist_ce": 0.0, "loss/hidden": 0.2021484375, "loss/idx": 0.0, "loss/logits": 0.06061544269323349, "step": 639 }, { "epoch": 0.005274565878503672, "grad_norm": 3.0625, "grad_norm_var": 0.49478327433268227, "learning_rate": 5e-05, "loss": 0.2085, "loss/crossentropy": 2.6464812755584717, "loss/dist_ce": 0.0, "loss/hidden": 0.1630859375, "loss/idx": 0.0, "loss/logits": 0.04545789211988449, "step": 640 }, { "epoch": 0.005282807387688834, "grad_norm": 32.0, "grad_norm_var": 54.56477762858073, "learning_rate": 5e-05, "loss": 0.8256, "loss/crossentropy": 3.0567638874053955, "loss/dist_ce": 0.0, "loss/hidden": 0.546875, "loss/idx": 0.0, "loss/logits": 0.27873265743255615, "step": 641 }, { "epoch": 0.005291048896873996, "grad_norm": 3.09375, "grad_norm_var": 54.431278483072916, "learning_rate": 5e-05, "loss": 0.2481, "loss/crossentropy": 1.814937949180603, "loss/dist_ce": 0.0, "loss/hidden": 0.197265625, "loss/idx": 0.0, "loss/logits": 0.050825513899326324, "step": 642 }, { "epoch": 0.0052992904060591576, "grad_norm": 3.5, "grad_norm_var": 54.29631754557292, "learning_rate": 5e-05, "loss": 0.1537, "loss/crossentropy": 1.2851777076721191, "loss/dist_ce": 0.0, "loss/hidden": 0.1337890625, "loss/idx": 0.0, "loss/logits": 0.019936833530664444, "step": 643 }, { "epoch": 0.0053075319152443194, "grad_norm": 7.25, "grad_norm_var": 54.24651285807292, "learning_rate": 5e-05, "loss": 0.2315, "loss/crossentropy": 1.3410425186157227, "loss/dist_ce": 0.0, "loss/hidden": 0.2001953125, "loss/idx": 0.0, "loss/logits": 0.03128086030483246, "step": 644 }, { "epoch": 0.005315773424429481, "grad_norm": 2.34375, "grad_norm_var": 54.49813537597656, "learning_rate": 5e-05, "loss": 0.1844, "loss/crossentropy": 2.3179078102111816, "loss/dist_ce": 0.0, "loss/hidden": 0.1435546875, "loss/idx": 0.0, "loss/logits": 0.04083427041769028, "step": 645 }, { "epoch": 0.005324014933614643, "grad_norm": 3.90625, "grad_norm_var": 53.952762858072916, "learning_rate": 5e-05, "loss": 0.2196, "loss/crossentropy": 2.614328384399414, "loss/dist_ce": 0.0, "loss/hidden": 0.1689453125, "loss/idx": 0.0, "loss/logits": 0.05064046010375023, "step": 646 }, { "epoch": 0.005332256442799805, "grad_norm": 3088.0, "grad_norm_var": 594094.3569895426, "learning_rate": 5e-05, "loss": 74.0435, "loss/crossentropy": 5.280629634857178, "loss/dist_ce": 0.0, "loss/hidden": 70.5, "loss/idx": 0.0, "loss/logits": 3.543522834777832, "step": 647 }, { "epoch": 0.005340497951984968, "grad_norm": 6.65625, "grad_norm_var": 593994.1685831706, "learning_rate": 5e-05, "loss": 0.3181, "loss/crossentropy": 2.1775877475738525, "loss/dist_ce": 0.0, "loss/hidden": 0.26171875, "loss/idx": 0.0, "loss/logits": 0.05634221434593201, "step": 648 }, { "epoch": 0.00534873946117013, "grad_norm": 3.375, "grad_norm_var": 593944.0428049724, "learning_rate": 5e-05, "loss": 0.2034, "loss/crossentropy": 1.4645721912384033, "loss/dist_ce": 0.0, "loss/hidden": 0.169921875, "loss/idx": 0.0, "loss/logits": 0.03352264687418938, "step": 649 }, { "epoch": 0.005356980970355292, "grad_norm": 1.15625, "grad_norm_var": 593951.4221018474, "learning_rate": 5e-05, "loss": 0.1173, "loss/crossentropy": 1.4656540155410767, "loss/dist_ce": 0.0, "loss/hidden": 0.0986328125, "loss/idx": 0.0, "loss/logits": 0.018643483519554138, "step": 650 }, { "epoch": 0.005365222479540454, "grad_norm": 1.1953125, "grad_norm_var": 594006.2750038147, "learning_rate": 5e-05, "loss": 0.1105, "loss/crossentropy": 0.5469728708267212, "loss/dist_ce": 0.0, "loss/hidden": 0.095703125, "loss/idx": 0.0, "loss/logits": 0.014785300940275192, "step": 651 }, { "epoch": 0.0053734639887256155, "grad_norm": 1.796875, "grad_norm_var": 594028.2904518128, "learning_rate": 5e-05, "loss": 0.1277, "loss/crossentropy": 1.5997320413589478, "loss/dist_ce": 0.0, "loss/hidden": 0.103515625, "loss/idx": 0.0, "loss/logits": 0.024179790169000626, "step": 652 }, { "epoch": 0.005381705497910777, "grad_norm": 2.546875, "grad_norm_var": 594019.3290728251, "learning_rate": 5e-05, "loss": 0.2144, "loss/crossentropy": 1.897312045097351, "loss/dist_ce": 0.0, "loss/hidden": 0.16796875, "loss/idx": 0.0, "loss/logits": 0.04644346237182617, "step": 653 }, { "epoch": 0.005389947007095939, "grad_norm": 7.25, "grad_norm_var": 593904.7219866435, "learning_rate": 5e-05, "loss": 0.1896, "loss/crossentropy": 0.3107888996601105, "loss/dist_ce": 0.0, "loss/hidden": 0.16796875, "loss/idx": 0.0, "loss/logits": 0.021600116044282913, "step": 654 }, { "epoch": 0.005398188516281101, "grad_norm": 3.40625, "grad_norm_var": 593915.6656878154, "learning_rate": 5e-05, "loss": 0.2583, "loss/crossentropy": 2.793339967727661, "loss/dist_ce": 0.0, "loss/hidden": 0.201171875, "loss/idx": 0.0, "loss/logits": 0.057117462158203125, "step": 655 }, { "epoch": 0.005406430025466263, "grad_norm": 2.328125, "grad_norm_var": 593934.8025632222, "learning_rate": 5e-05, "loss": 0.18, "loss/crossentropy": 1.4796594381332397, "loss/dist_ce": 0.0, "loss/hidden": 0.15234375, "loss/idx": 0.0, "loss/logits": 0.027638476341962814, "step": 656 }, { "epoch": 0.005414671534651425, "grad_norm": 1.6875, "grad_norm_var": 594663.6030799865, "learning_rate": 5e-05, "loss": 0.1305, "loss/crossentropy": 1.4301519393920898, "loss/dist_ce": 0.0, "loss/hidden": 0.109375, "loss/idx": 0.0, "loss/logits": 0.021160051226615906, "step": 657 }, { "epoch": 0.005422913043836588, "grad_norm": 1.8203125, "grad_norm_var": 594696.4953653972, "learning_rate": 5e-05, "loss": 0.1742, "loss/crossentropy": 1.6321804523468018, "loss/dist_ce": 0.0, "loss/hidden": 0.1376953125, "loss/idx": 0.0, "loss/logits": 0.03655288740992546, "step": 658 }, { "epoch": 0.00543115455302175, "grad_norm": 3.46875, "grad_norm_var": 594697.2980875651, "learning_rate": 5e-05, "loss": 0.263, "loss/crossentropy": 2.723104476928711, "loss/dist_ce": 0.0, "loss/hidden": 0.19921875, "loss/idx": 0.0, "loss/logits": 0.0637696385383606, "step": 659 }, { "epoch": 0.0054393960622069115, "grad_norm": 3.140625, "grad_norm_var": 594801.8477040608, "learning_rate": 5e-05, "loss": 0.1395, "loss/crossentropy": 1.4695775508880615, "loss/dist_ce": 0.0, "loss/hidden": 0.1181640625, "loss/idx": 0.0, "loss/logits": 0.021351780742406845, "step": 660 }, { "epoch": 0.005447637571392073, "grad_norm": 1.9375, "grad_norm_var": 594812.3412261963, "learning_rate": 5e-05, "loss": 0.1788, "loss/crossentropy": 2.657424211502075, "loss/dist_ce": 0.0, "loss/hidden": 0.138671875, "loss/idx": 0.0, "loss/logits": 0.040120575577020645, "step": 661 }, { "epoch": 0.005455879080577235, "grad_norm": 5.0, "grad_norm_var": 594784.4235422771, "learning_rate": 5e-05, "loss": 0.1576, "loss/crossentropy": 0.4258406162261963, "loss/dist_ce": 0.0, "loss/hidden": 0.1396484375, "loss/idx": 0.0, "loss/logits": 0.01790551282465458, "step": 662 }, { "epoch": 0.005464120589762397, "grad_norm": 3.109375, "grad_norm_var": 3.251456705729167, "learning_rate": 5e-05, "loss": 0.2469, "loss/crossentropy": 1.6977794170379639, "loss/dist_ce": 0.0, "loss/hidden": 0.208984375, "loss/idx": 0.0, "loss/logits": 0.03788114711642265, "step": 663 }, { "epoch": 0.005472362098947559, "grad_norm": 1.8828125, "grad_norm_var": 2.4230974833170573, "learning_rate": 5e-05, "loss": 0.1297, "loss/crossentropy": 1.3648561239242554, "loss/dist_ce": 0.0, "loss/hidden": 0.1083984375, "loss/idx": 0.0, "loss/logits": 0.021297637373209, "step": 664 }, { "epoch": 0.005480603608132721, "grad_norm": 1.828125, "grad_norm_var": 2.4579424540201824, "learning_rate": 5e-05, "loss": 0.1722, "loss/crossentropy": 2.6582846641540527, "loss/dist_ce": 0.0, "loss/hidden": 0.1337890625, "loss/idx": 0.0, "loss/logits": 0.03840293735265732, "step": 665 }, { "epoch": 0.005488845117317883, "grad_norm": 1.703125, "grad_norm_var": 2.3624529520670574, "learning_rate": 5e-05, "loss": 0.1233, "loss/crossentropy": 1.4073052406311035, "loss/dist_ce": 0.0, "loss/hidden": 0.103515625, "loss/idx": 0.0, "loss/logits": 0.019805913791060448, "step": 666 }, { "epoch": 0.005497086626503046, "grad_norm": 3.015625, "grad_norm_var": 2.1906728108723956, "learning_rate": 5e-05, "loss": 0.1421, "loss/crossentropy": 1.2232915163040161, "loss/dist_ce": 0.0, "loss/hidden": 0.1181640625, "loss/idx": 0.0, "loss/logits": 0.02389139123260975, "step": 667 }, { "epoch": 0.0055053281356882075, "grad_norm": 2.078125, "grad_norm_var": 2.155370076497396, "learning_rate": 5e-05, "loss": 0.1769, "loss/crossentropy": 2.633976697921753, "loss/dist_ce": 0.0, "loss/hidden": 0.138671875, "loss/idx": 0.0, "loss/logits": 0.03825639933347702, "step": 668 }, { "epoch": 0.005513569644873369, "grad_norm": 1.3984375, "grad_norm_var": 2.289989980061849, "learning_rate": 5e-05, "loss": 0.1432, "loss/crossentropy": 2.7594661712646484, "loss/dist_ce": 0.0, "loss/hidden": 0.1123046875, "loss/idx": 0.0, "loss/logits": 0.030905555933713913, "step": 669 }, { "epoch": 0.005521811154058531, "grad_norm": 2.875, "grad_norm_var": 0.899731190999349, "learning_rate": 5e-05, "loss": 0.2301, "loss/crossentropy": 2.9316701889038086, "loss/dist_ce": 0.0, "loss/hidden": 0.171875, "loss/idx": 0.0, "loss/logits": 0.058252155780792236, "step": 670 }, { "epoch": 0.005530052663243693, "grad_norm": 2.109375, "grad_norm_var": 0.8554888407389323, "learning_rate": 5e-05, "loss": 0.1341, "loss/crossentropy": 1.2458611726760864, "loss/dist_ce": 0.0, "loss/hidden": 0.111328125, "loss/idx": 0.0, "loss/logits": 0.022815225645899773, "step": 671 }, { "epoch": 0.005538294172428855, "grad_norm": 1.859375, "grad_norm_var": 0.8775530497233073, "learning_rate": 5e-05, "loss": 0.1773, "loss/crossentropy": 1.787272572517395, "loss/dist_ce": 0.0, "loss/hidden": 0.138671875, "loss/idx": 0.0, "loss/logits": 0.03861699998378754, "step": 672 }, { "epoch": 0.005546535681614017, "grad_norm": 5.40625, "grad_norm_var": 1.3726600646972655, "learning_rate": 5e-05, "loss": 0.2194, "loss/crossentropy": 1.330421805381775, "loss/dist_ce": 0.0, "loss/hidden": 0.1943359375, "loss/idx": 0.0, "loss/logits": 0.025075137615203857, "step": 673 }, { "epoch": 0.005554777190799179, "grad_norm": 0.9609375, "grad_norm_var": 1.5155535380045573, "learning_rate": 5e-05, "loss": 0.1263, "loss/crossentropy": 2.6272873878479004, "loss/dist_ce": 0.0, "loss/hidden": 0.1015625, "loss/idx": 0.0, "loss/logits": 0.024767953902482986, "step": 674 }, { "epoch": 0.005563018699984341, "grad_norm": 5.375, "grad_norm_var": 1.9607175191243489, "learning_rate": 5e-05, "loss": 0.3144, "loss/crossentropy": 3.5015265941619873, "loss/dist_ce": 0.0, "loss/hidden": 0.23828125, "loss/idx": 0.0, "loss/logits": 0.07607264816761017, "step": 675 }, { "epoch": 0.0055712602091695035, "grad_norm": 2.546875, "grad_norm_var": 1.9502418518066407, "learning_rate": 5e-05, "loss": 0.2386, "loss/crossentropy": 2.711198091506958, "loss/dist_ce": 0.0, "loss/hidden": 0.1875, "loss/idx": 0.0, "loss/logits": 0.05106600373983383, "step": 676 }, { "epoch": 0.005579501718354665, "grad_norm": 2.765625, "grad_norm_var": 1.909698232014974, "learning_rate": 5e-05, "loss": 0.203, "loss/crossentropy": 1.9062882661819458, "loss/dist_ce": 0.0, "loss/hidden": 0.1640625, "loss/idx": 0.0, "loss/logits": 0.03898348659276962, "step": 677 }, { "epoch": 0.005587743227539827, "grad_norm": 1.796875, "grad_norm_var": 1.5877174377441405, "learning_rate": 5e-05, "loss": 0.1617, "loss/crossentropy": 2.5185117721557617, "loss/dist_ce": 0.0, "loss/hidden": 0.126953125, "loss/idx": 0.0, "loss/logits": 0.03473159298300743, "step": 678 }, { "epoch": 0.005595984736724989, "grad_norm": 3.71875, "grad_norm_var": 1.6568275451660157, "learning_rate": 5e-05, "loss": 0.1797, "loss/crossentropy": 1.0520906448364258, "loss/dist_ce": 0.0, "loss/hidden": 0.1640625, "loss/idx": 0.0, "loss/logits": 0.015656642615795135, "step": 679 }, { "epoch": 0.005604226245910151, "grad_norm": 3.34375, "grad_norm_var": 1.6539265950520834, "learning_rate": 5e-05, "loss": 0.1532, "loss/crossentropy": 1.6271475553512573, "loss/dist_ce": 0.0, "loss/hidden": 0.130859375, "loss/idx": 0.0, "loss/logits": 0.022353362292051315, "step": 680 }, { "epoch": 0.005612467755095313, "grad_norm": 1.6796875, "grad_norm_var": 1.6720415751139324, "learning_rate": 5e-05, "loss": 0.1228, "loss/crossentropy": 0.8219252228736877, "loss/dist_ce": 0.0, "loss/hidden": 0.1083984375, "loss/idx": 0.0, "loss/logits": 0.014401828870177269, "step": 681 }, { "epoch": 0.005620709264280475, "grad_norm": 2.46875, "grad_norm_var": 1.6105323791503907, "learning_rate": 5e-05, "loss": 0.1697, "loss/crossentropy": 1.7836802005767822, "loss/dist_ce": 0.0, "loss/hidden": 0.1328125, "loss/idx": 0.0, "loss/logits": 0.03692241013050079, "step": 682 }, { "epoch": 0.005628950773465637, "grad_norm": 5.0, "grad_norm_var": 1.9368690490722655, "learning_rate": 5e-05, "loss": 0.2266, "loss/crossentropy": 2.947277784347534, "loss/dist_ce": 0.0, "loss/hidden": 0.1796875, "loss/idx": 0.0, "loss/logits": 0.04695526510477066, "step": 683 }, { "epoch": 0.005637192282650799, "grad_norm": 3.171875, "grad_norm_var": 1.9010515848795573, "learning_rate": 5e-05, "loss": 0.1932, "loss/crossentropy": 2.684128999710083, "loss/dist_ce": 0.0, "loss/hidden": 0.15234375, "loss/idx": 0.0, "loss/logits": 0.04086027294397354, "step": 684 }, { "epoch": 0.005645433791835961, "grad_norm": 3.90625, "grad_norm_var": 1.7904368082682292, "learning_rate": 5e-05, "loss": 0.3706, "loss/crossentropy": 2.2005116939544678, "loss/dist_ce": 0.0, "loss/hidden": 0.30078125, "loss/idx": 0.0, "loss/logits": 0.06984560191631317, "step": 685 }, { "epoch": 0.005653675301021123, "grad_norm": 4.5625, "grad_norm_var": 1.9264475504557292, "learning_rate": 5e-05, "loss": 0.2539, "loss/crossentropy": 2.7152457237243652, "loss/dist_ce": 0.0, "loss/hidden": 0.19921875, "loss/idx": 0.0, "loss/logits": 0.05464334040880203, "step": 686 }, { "epoch": 0.005661916810206285, "grad_norm": 3.6875, "grad_norm_var": 1.8595621744791666, "learning_rate": 5e-05, "loss": 0.2398, "loss/crossentropy": 3.029996633529663, "loss/dist_ce": 0.0, "loss/hidden": 0.1796875, "loss/idx": 0.0, "loss/logits": 0.060077205300331116, "step": 687 }, { "epoch": 0.005670158319391447, "grad_norm": 2.3125, "grad_norm_var": 1.787433878580729, "learning_rate": 5e-05, "loss": 0.1699, "loss/crossentropy": 2.006049156188965, "loss/dist_ce": 0.0, "loss/hidden": 0.1376953125, "loss/idx": 0.0, "loss/logits": 0.032181136310100555, "step": 688 }, { "epoch": 0.005678399828576609, "grad_norm": 3.296875, "grad_norm_var": 1.4714396158854166, "learning_rate": 5e-05, "loss": 0.176, "loss/crossentropy": 2.032977819442749, "loss/dist_ce": 0.0, "loss/hidden": 0.138671875, "loss/idx": 0.0, "loss/logits": 0.037300050258636475, "step": 689 }, { "epoch": 0.005686641337761771, "grad_norm": 2.0, "grad_norm_var": 1.2339637756347657, "learning_rate": 5e-05, "loss": 0.1712, "loss/crossentropy": 1.572161316871643, "loss/dist_ce": 0.0, "loss/hidden": 0.1474609375, "loss/idx": 0.0, "loss/logits": 0.023761317133903503, "step": 690 }, { "epoch": 0.005694882846946933, "grad_norm": 1.578125, "grad_norm_var": 1.0475807189941406, "learning_rate": 5e-05, "loss": 0.1608, "loss/crossentropy": 0.5517882704734802, "loss/dist_ce": 0.0, "loss/hidden": 0.138671875, "loss/idx": 0.0, "loss/logits": 0.02215992659330368, "step": 691 }, { "epoch": 0.005703124356132095, "grad_norm": 2.1875, "grad_norm_var": 1.076873524983724, "learning_rate": 5e-05, "loss": 0.1669, "loss/crossentropy": 2.8021109104156494, "loss/dist_ce": 0.0, "loss/hidden": 0.130859375, "loss/idx": 0.0, "loss/logits": 0.036080196499824524, "step": 692 }, { "epoch": 0.005711365865317257, "grad_norm": 1.140625, "grad_norm_var": 1.2856056213378906, "learning_rate": 5e-05, "loss": 0.1113, "loss/crossentropy": 0.46256670355796814, "loss/dist_ce": 0.0, "loss/hidden": 0.09716796875, "loss/idx": 0.0, "loss/logits": 0.014169261790812016, "step": 693 }, { "epoch": 0.0057196073745024185, "grad_norm": 1.8046875, "grad_norm_var": 1.284496053059896, "learning_rate": 5e-05, "loss": 0.145, "loss/crossentropy": 1.454952359199524, "loss/dist_ce": 0.0, "loss/hidden": 0.12109375, "loss/idx": 0.0, "loss/logits": 0.023933004587888718, "step": 694 }, { "epoch": 0.005727848883687581, "grad_norm": 1.34375, "grad_norm_var": 1.3670644124348958, "learning_rate": 5e-05, "loss": 0.1255, "loss/crossentropy": 1.4848099946975708, "loss/dist_ce": 0.0, "loss/hidden": 0.103515625, "loss/idx": 0.0, "loss/logits": 0.021940922364592552, "step": 695 }, { "epoch": 0.005736090392872743, "grad_norm": 4.125, "grad_norm_var": 1.470417277018229, "learning_rate": 5e-05, "loss": 0.2243, "loss/crossentropy": 2.6562836170196533, "loss/dist_ce": 0.0, "loss/hidden": 0.1748046875, "loss/idx": 0.0, "loss/logits": 0.04948100447654724, "step": 696 }, { "epoch": 0.005744331902057905, "grad_norm": 3.71875, "grad_norm_var": 1.4347735087076823, "learning_rate": 5e-05, "loss": 0.1446, "loss/crossentropy": 1.2862632274627686, "loss/dist_ce": 0.0, "loss/hidden": 0.125, "loss/idx": 0.0, "loss/logits": 0.019616402685642242, "step": 697 }, { "epoch": 0.005752573411243067, "grad_norm": 2.53125, "grad_norm_var": 1.4314735412597657, "learning_rate": 5e-05, "loss": 0.1774, "loss/crossentropy": 2.1273880004882812, "loss/dist_ce": 0.0, "loss/hidden": 0.140625, "loss/idx": 0.0, "loss/logits": 0.036726418882608414, "step": 698 }, { "epoch": 0.005760814920428229, "grad_norm": 1.7421875, "grad_norm_var": 1.1817291259765625, "learning_rate": 5e-05, "loss": 0.1739, "loss/crossentropy": 2.4990146160125732, "loss/dist_ce": 0.0, "loss/hidden": 0.1357421875, "loss/idx": 0.0, "loss/logits": 0.03811194375157356, "step": 699 }, { "epoch": 0.005769056429613391, "grad_norm": 4.15625, "grad_norm_var": 1.30496826171875, "learning_rate": 5e-05, "loss": 0.252, "loss/crossentropy": 2.5132620334625244, "loss/dist_ce": 0.0, "loss/hidden": 0.2041015625, "loss/idx": 0.0, "loss/logits": 0.04793284088373184, "step": 700 }, { "epoch": 0.005777297938798553, "grad_norm": 3.359375, "grad_norm_var": 1.2397776285807292, "learning_rate": 5e-05, "loss": 0.2582, "loss/crossentropy": 2.5892200469970703, "loss/dist_ce": 0.0, "loss/hidden": 0.203125, "loss/idx": 0.0, "loss/logits": 0.05511770024895668, "step": 701 }, { "epoch": 0.0057855394479837145, "grad_norm": 3.328125, "grad_norm_var": 1.0320393880208334, "learning_rate": 5e-05, "loss": 0.2048, "loss/crossentropy": 1.6278858184814453, "loss/dist_ce": 0.0, "loss/hidden": 0.1650390625, "loss/idx": 0.0, "loss/logits": 0.03974189609289169, "step": 702 }, { "epoch": 0.005793780957168876, "grad_norm": 5.4375, "grad_norm_var": 1.4668050130208334, "learning_rate": 5e-05, "loss": 0.2686, "loss/crossentropy": 1.9843369722366333, "loss/dist_ce": 0.0, "loss/hidden": 0.220703125, "loss/idx": 0.0, "loss/logits": 0.047865502536296844, "step": 703 }, { "epoch": 0.005802022466354039, "grad_norm": 1.1796875, "grad_norm_var": 1.6136797587076823, "learning_rate": 5e-05, "loss": 0.1149, "loss/crossentropy": 1.3343570232391357, "loss/dist_ce": 0.0, "loss/hidden": 0.095703125, "loss/idx": 0.0, "loss/logits": 0.019155774265527725, "step": 704 }, { "epoch": 0.005810263975539201, "grad_norm": 2.78125, "grad_norm_var": 1.5880999247233072, "learning_rate": 5e-05, "loss": 0.2146, "loss/crossentropy": 1.3166966438293457, "loss/dist_ce": 0.0, "loss/hidden": 0.1796875, "loss/idx": 0.0, "loss/logits": 0.03489375486969948, "step": 705 }, { "epoch": 0.005818505484724363, "grad_norm": 3.109375, "grad_norm_var": 1.568743642171224, "learning_rate": 5e-05, "loss": 0.2285, "loss/crossentropy": 1.6788283586502075, "loss/dist_ce": 0.0, "loss/hidden": 0.1845703125, "loss/idx": 0.0, "loss/logits": 0.043936047703027725, "step": 706 }, { "epoch": 0.005826746993909525, "grad_norm": 3.3125, "grad_norm_var": 1.4926389058430989, "learning_rate": 5e-05, "loss": 0.1728, "loss/crossentropy": 2.161801815032959, "loss/dist_ce": 0.0, "loss/hidden": 0.138671875, "loss/idx": 0.0, "loss/logits": 0.034114591777324677, "step": 707 }, { "epoch": 0.005834988503094687, "grad_norm": 3.015625, "grad_norm_var": 1.4647112528483073, "learning_rate": 5e-05, "loss": 0.1577, "loss/crossentropy": 2.5755579471588135, "loss/dist_ce": 0.0, "loss/hidden": 0.125, "loss/idx": 0.0, "loss/logits": 0.032723598182201385, "step": 708 }, { "epoch": 0.005843230012279849, "grad_norm": 3.015625, "grad_norm_var": 1.2495012919108073, "learning_rate": 5e-05, "loss": 0.2659, "loss/crossentropy": 2.3093197345733643, "loss/dist_ce": 0.0, "loss/hidden": 0.21484375, "loss/idx": 0.0, "loss/logits": 0.05105920881032944, "step": 709 }, { "epoch": 0.0058514715214650105, "grad_norm": 3.3125, "grad_norm_var": 1.1517781575520833, "learning_rate": 5e-05, "loss": 0.2048, "loss/crossentropy": 1.972427248954773, "loss/dist_ce": 0.0, "loss/hidden": 0.1640625, "loss/idx": 0.0, "loss/logits": 0.04072684049606323, "step": 710 }, { "epoch": 0.005859713030650172, "grad_norm": 1.375, "grad_norm_var": 1.1445556640625, "learning_rate": 5e-05, "loss": 0.1302, "loss/crossentropy": 1.6267015933990479, "loss/dist_ce": 0.0, "loss/hidden": 0.107421875, "loss/idx": 0.0, "loss/logits": 0.022783808410167694, "step": 711 }, { "epoch": 0.005867954539835334, "grad_norm": 3.046875, "grad_norm_var": 1.0689605712890624, "learning_rate": 5e-05, "loss": 0.1712, "loss/crossentropy": 2.76011323928833, "loss/dist_ce": 0.0, "loss/hidden": 0.1337890625, "loss/idx": 0.0, "loss/logits": 0.037429310381412506, "step": 712 }, { "epoch": 0.005876196049020497, "grad_norm": 1.8125, "grad_norm_var": 1.1200917561848958, "learning_rate": 5e-05, "loss": 0.1373, "loss/crossentropy": 1.5537292957305908, "loss/dist_ce": 0.0, "loss/hidden": 0.11572265625, "loss/idx": 0.0, "loss/logits": 0.021540062502026558, "step": 713 }, { "epoch": 0.005884437558205659, "grad_norm": 1.90625, "grad_norm_var": 1.1758371988932292, "learning_rate": 5e-05, "loss": 0.1585, "loss/crossentropy": 2.612987995147705, "loss/dist_ce": 0.0, "loss/hidden": 0.1220703125, "loss/idx": 0.0, "loss/logits": 0.036412760615348816, "step": 714 }, { "epoch": 0.005892679067390821, "grad_norm": 1.8671875, "grad_norm_var": 1.1580474853515625, "learning_rate": 5e-05, "loss": 0.1586, "loss/crossentropy": 2.5890583992004395, "loss/dist_ce": 0.0, "loss/hidden": 0.125, "loss/idx": 0.0, "loss/logits": 0.033603958785533905, "step": 715 }, { "epoch": 0.005900920576575983, "grad_norm": 1.671875, "grad_norm_var": 1.11971435546875, "learning_rate": 5e-05, "loss": 0.1545, "loss/crossentropy": 2.612929344177246, "loss/dist_ce": 0.0, "loss/hidden": 0.1181640625, "loss/idx": 0.0, "loss/logits": 0.0363757461309433, "step": 716 }, { "epoch": 0.005909162085761145, "grad_norm": 3.078125, "grad_norm_var": 1.1007080078125, "learning_rate": 5e-05, "loss": 0.1632, "loss/crossentropy": 0.4507390856742859, "loss/dist_ce": 0.0, "loss/hidden": 0.1416015625, "loss/idx": 0.0, "loss/logits": 0.021595872938632965, "step": 717 }, { "epoch": 0.0059174035949463065, "grad_norm": 8.25, "grad_norm_var": 3.0249176025390625, "learning_rate": 5e-05, "loss": 0.1985, "loss/crossentropy": 0.8891280889511108, "loss/dist_ce": 0.0, "loss/hidden": 0.1748046875, "loss/idx": 0.0, "loss/logits": 0.023697488009929657, "step": 718 }, { "epoch": 0.005925645104131468, "grad_norm": 0.90625, "grad_norm_var": 2.842015584309896, "learning_rate": 5e-05, "loss": 0.1113, "loss/crossentropy": 1.2778152227401733, "loss/dist_ce": 0.0, "loss/hidden": 0.09423828125, "loss/idx": 0.0, "loss/logits": 0.017108086496591568, "step": 719 }, { "epoch": 0.00593388661331663, "grad_norm": 3.625, "grad_norm_var": 2.711073557535807, "learning_rate": 5e-05, "loss": 0.1612, "loss/crossentropy": 2.4469552040100098, "loss/dist_ce": 0.0, "loss/hidden": 0.1318359375, "loss/idx": 0.0, "loss/logits": 0.029361439868807793, "step": 720 }, { "epoch": 0.005942128122501792, "grad_norm": 1.4453125, "grad_norm_var": 2.8402750651041666, "learning_rate": 5e-05, "loss": 0.1502, "loss/crossentropy": 2.1400833129882812, "loss/dist_ce": 0.0, "loss/hidden": 0.11669921875, "loss/idx": 0.0, "loss/logits": 0.033529091626405716, "step": 721 }, { "epoch": 0.005950369631686954, "grad_norm": 2.109375, "grad_norm_var": 2.8611083984375, "learning_rate": 5e-05, "loss": 0.1749, "loss/crossentropy": 1.3967795372009277, "loss/dist_ce": 0.0, "loss/hidden": 0.142578125, "loss/idx": 0.0, "loss/logits": 0.03236193209886551, "step": 722 }, { "epoch": 0.005958611140872117, "grad_norm": 2.15625, "grad_norm_var": 2.855537923177083, "learning_rate": 5e-05, "loss": 0.2143, "loss/crossentropy": 2.475792646408081, "loss/dist_ce": 0.0, "loss/hidden": 0.166015625, "loss/idx": 0.0, "loss/logits": 0.048240065574645996, "step": 723 }, { "epoch": 0.005966852650057279, "grad_norm": 2.359375, "grad_norm_var": 2.8515218098958335, "learning_rate": 5e-05, "loss": 0.1735, "loss/crossentropy": 2.5644333362579346, "loss/dist_ce": 0.0, "loss/hidden": 0.1328125, "loss/idx": 0.0, "loss/logits": 0.04069886356592178, "step": 724 }, { "epoch": 0.005975094159242441, "grad_norm": 2.0625, "grad_norm_var": 2.8581614176432293, "learning_rate": 5e-05, "loss": 0.1677, "loss/crossentropy": 2.2065787315368652, "loss/dist_ce": 0.0, "loss/hidden": 0.130859375, "loss/idx": 0.0, "loss/logits": 0.0367942750453949, "step": 725 }, { "epoch": 0.0059833356684276025, "grad_norm": 2.734375, "grad_norm_var": 2.8211629231770834, "learning_rate": 5e-05, "loss": 0.2124, "loss/crossentropy": 2.876722574234009, "loss/dist_ce": 0.0, "loss/hidden": 0.1630859375, "loss/idx": 0.0, "loss/logits": 0.049362972378730774, "step": 726 }, { "epoch": 0.0059915771776127644, "grad_norm": 8.0, "grad_norm_var": 4.548148600260417, "learning_rate": 5e-05, "loss": 0.1838, "loss/crossentropy": 1.2283939123153687, "loss/dist_ce": 0.0, "loss/hidden": 0.15234375, "loss/idx": 0.0, "loss/logits": 0.03141150623559952, "step": 727 }, { "epoch": 0.005999818686797926, "grad_norm": 1.8671875, "grad_norm_var": 4.618230946858724, "learning_rate": 5e-05, "loss": 0.1644, "loss/crossentropy": 2.0428686141967773, "loss/dist_ce": 0.0, "loss/hidden": 0.1328125, "loss/idx": 0.0, "loss/logits": 0.03161695599555969, "step": 728 }, { "epoch": 0.006008060195983088, "grad_norm": 7.65625, "grad_norm_var": 5.931933339436849, "learning_rate": 5e-05, "loss": 0.3075, "loss/crossentropy": 2.5533790588378906, "loss/dist_ce": 0.0, "loss/hidden": 0.240234375, "loss/idx": 0.0, "loss/logits": 0.0672769546508789, "step": 729 }, { "epoch": 0.00601630170516825, "grad_norm": 2.46875, "grad_norm_var": 5.852355702718099, "learning_rate": 5e-05, "loss": 0.1774, "loss/crossentropy": 1.7351176738739014, "loss/dist_ce": 0.0, "loss/hidden": 0.150390625, "loss/idx": 0.0, "loss/logits": 0.027032926678657532, "step": 730 }, { "epoch": 0.006024543214353412, "grad_norm": 1.9453125, "grad_norm_var": 5.838165028889974, "learning_rate": 5e-05, "loss": 0.1815, "loss/crossentropy": 2.6725094318389893, "loss/dist_ce": 0.0, "loss/hidden": 0.142578125, "loss/idx": 0.0, "loss/logits": 0.038890693336725235, "step": 731 }, { "epoch": 0.006032784723538575, "grad_norm": 2.28125, "grad_norm_var": 5.731445058186849, "learning_rate": 5e-05, "loss": 0.1753, "loss/crossentropy": 1.6290442943572998, "loss/dist_ce": 0.0, "loss/hidden": 0.1435546875, "loss/idx": 0.0, "loss/logits": 0.031768545508384705, "step": 732 }, { "epoch": 0.006041026232723737, "grad_norm": 5.59375, "grad_norm_var": 6.049501291910807, "learning_rate": 5e-05, "loss": 0.2555, "loss/crossentropy": 2.4887194633483887, "loss/dist_ce": 0.0, "loss/hidden": 0.205078125, "loss/idx": 0.0, "loss/logits": 0.05038648098707199, "step": 733 }, { "epoch": 0.0060492677419088986, "grad_norm": 1.859375, "grad_norm_var": 4.525903065999349, "learning_rate": 5e-05, "loss": 0.1672, "loss/crossentropy": 1.4916491508483887, "loss/dist_ce": 0.0, "loss/hidden": 0.134765625, "loss/idx": 0.0, "loss/logits": 0.03243900462985039, "step": 734 }, { "epoch": 0.0060575092510940605, "grad_norm": 6.28125, "grad_norm_var": 4.783105214436849, "learning_rate": 5e-05, "loss": 0.2469, "loss/crossentropy": 1.9904972314834595, "loss/dist_ce": 0.0, "loss/hidden": 0.193359375, "loss/idx": 0.0, "loss/logits": 0.05358533933758736, "step": 735 }, { "epoch": 0.006065750760279222, "grad_norm": 4.90625, "grad_norm_var": 4.923659006754558, "learning_rate": 5e-05, "loss": 0.3124, "loss/crossentropy": 3.1595237255096436, "loss/dist_ce": 0.0, "loss/hidden": 0.240234375, "loss/idx": 0.0, "loss/logits": 0.07215666770935059, "step": 736 }, { "epoch": 0.006073992269464384, "grad_norm": 2.296875, "grad_norm_var": 4.737629191080729, "learning_rate": 5e-05, "loss": 0.1351, "loss/crossentropy": 0.5982239842414856, "loss/dist_ce": 0.0, "loss/hidden": 0.11767578125, "loss/idx": 0.0, "loss/logits": 0.017444239929318428, "step": 737 }, { "epoch": 0.006082233778649546, "grad_norm": 2.1875, "grad_norm_var": 4.723148600260417, "learning_rate": 5e-05, "loss": 0.1614, "loss/crossentropy": 1.076785922050476, "loss/dist_ce": 0.0, "loss/hidden": 0.13671875, "loss/idx": 0.0, "loss/logits": 0.02469494380056858, "step": 738 }, { "epoch": 0.006090475287834708, "grad_norm": 2.609375, "grad_norm_var": 4.6523183186848955, "learning_rate": 5e-05, "loss": 0.2057, "loss/crossentropy": 2.2849977016448975, "loss/dist_ce": 0.0, "loss/hidden": 0.1513671875, "loss/idx": 0.0, "loss/logits": 0.05430486798286438, "step": 739 }, { "epoch": 0.00609871679701987, "grad_norm": 2.578125, "grad_norm_var": 4.620018513997396, "learning_rate": 5e-05, "loss": 0.2088, "loss/crossentropy": 1.5881438255310059, "loss/dist_ce": 0.0, "loss/hidden": 0.1708984375, "loss/idx": 0.0, "loss/logits": 0.037879034876823425, "step": 740 }, { "epoch": 0.006106958306205033, "grad_norm": 1.671875, "grad_norm_var": 4.708748372395833, "learning_rate": 5e-05, "loss": 0.1337, "loss/crossentropy": 1.5729397535324097, "loss/dist_ce": 0.0, "loss/hidden": 0.1142578125, "loss/idx": 0.0, "loss/logits": 0.01942865364253521, "step": 741 }, { "epoch": 0.006115199815390195, "grad_norm": 1.7578125, "grad_norm_var": 4.875673166910807, "learning_rate": 5e-05, "loss": 0.1473, "loss/crossentropy": 2.5460681915283203, "loss/dist_ce": 0.0, "loss/hidden": 0.11865234375, "loss/idx": 0.0, "loss/logits": 0.028684455901384354, "step": 742 }, { "epoch": 0.0061234413245753565, "grad_norm": 3.484375, "grad_norm_var": 3.4392575581868488, "learning_rate": 5e-05, "loss": 0.197, "loss/crossentropy": 2.502234697341919, "loss/dist_ce": 0.0, "loss/hidden": 0.158203125, "loss/idx": 0.0, "loss/logits": 0.03875020891427994, "step": 743 }, { "epoch": 0.006131682833760518, "grad_norm": 1.5625, "grad_norm_var": 3.499828084309896, "learning_rate": 5e-05, "loss": 0.1167, "loss/crossentropy": 0.5112316608428955, "loss/dist_ce": 0.0, "loss/hidden": 0.10546875, "loss/idx": 0.0, "loss/logits": 0.011273887008428574, "step": 744 }, { "epoch": 0.00613992434294568, "grad_norm": 1.734375, "grad_norm_var": 2.17010498046875, "learning_rate": 5e-05, "loss": 0.1642, "loss/crossentropy": 1.5730743408203125, "loss/dist_ce": 0.0, "loss/hidden": 0.134765625, "loss/idx": 0.0, "loss/logits": 0.029403435066342354, "step": 745 }, { "epoch": 0.006148165852130842, "grad_norm": 5.875, "grad_norm_var": 2.7329345703125, "learning_rate": 5e-05, "loss": 0.2765, "loss/crossentropy": 2.3317244052886963, "loss/dist_ce": 0.0, "loss/hidden": 0.2236328125, "loss/idx": 0.0, "loss/logits": 0.05287738889455795, "step": 746 }, { "epoch": 0.006156407361316004, "grad_norm": 4.65625, "grad_norm_var": 2.7969134012858072, "learning_rate": 5e-05, "loss": 0.3948, "loss/crossentropy": 2.6969289779663086, "loss/dist_ce": 0.0, "loss/hidden": 0.287109375, "loss/idx": 0.0, "loss/logits": 0.10772477090358734, "step": 747 }, { "epoch": 0.006164648870501166, "grad_norm": 2.34375, "grad_norm_var": 2.7894304911295573, "learning_rate": 5e-05, "loss": 0.1563, "loss/crossentropy": 2.7726356983184814, "loss/dist_ce": 0.0, "loss/hidden": 0.1240234375, "loss/idx": 0.0, "loss/logits": 0.03231953829526901, "step": 748 }, { "epoch": 0.006172890379686328, "grad_norm": 2.3125, "grad_norm_var": 2.4205034891764323, "learning_rate": 5e-05, "loss": 0.1669, "loss/crossentropy": 2.8912436962127686, "loss/dist_ce": 0.0, "loss/hidden": 0.130859375, "loss/idx": 0.0, "loss/logits": 0.03605188801884651, "step": 749 }, { "epoch": 0.00618113188887149, "grad_norm": 1.6796875, "grad_norm_var": 2.4500244140625, "learning_rate": 5e-05, "loss": 0.1556, "loss/crossentropy": 1.5787498950958252, "loss/dist_ce": 0.0, "loss/hidden": 0.1318359375, "loss/idx": 0.0, "loss/logits": 0.02375878393650055, "step": 750 }, { "epoch": 0.0061893733980566525, "grad_norm": 6.1875, "grad_norm_var": 2.40950927734375, "learning_rate": 5e-05, "loss": 0.2622, "loss/crossentropy": 1.5021111965179443, "loss/dist_ce": 0.0, "loss/hidden": 0.212890625, "loss/idx": 0.0, "loss/logits": 0.04928001016378403, "step": 751 }, { "epoch": 0.006197614907241814, "grad_norm": 1.828125, "grad_norm_var": 2.2153228759765624, "learning_rate": 5e-05, "loss": 0.1812, "loss/crossentropy": 2.771916627883911, "loss/dist_ce": 0.0, "loss/hidden": 0.13671875, "loss/idx": 0.0, "loss/logits": 0.044479530304670334, "step": 752 }, { "epoch": 0.006205856416426976, "grad_norm": 2.265625, "grad_norm_var": 2.2174713134765627, "learning_rate": 5e-05, "loss": 0.1752, "loss/crossentropy": 1.6487168073654175, "loss/dist_ce": 0.0, "loss/hidden": 0.142578125, "loss/idx": 0.0, "loss/logits": 0.03264114260673523, "step": 753 }, { "epoch": 0.006214097925612138, "grad_norm": 2.90625, "grad_norm_var": 2.191454060872396, "learning_rate": 5e-05, "loss": 0.2076, "loss/crossentropy": 3.0564393997192383, "loss/dist_ce": 0.0, "loss/hidden": 0.158203125, "loss/idx": 0.0, "loss/logits": 0.04940104857087135, "step": 754 }, { "epoch": 0.0062223394347973, "grad_norm": 1.7109375, "grad_norm_var": 2.269628651936849, "learning_rate": 5e-05, "loss": 0.1733, "loss/crossentropy": 2.259493112564087, "loss/dist_ce": 0.0, "loss/hidden": 0.1337890625, "loss/idx": 0.0, "loss/logits": 0.03946584463119507, "step": 755 }, { "epoch": 0.006230580943982462, "grad_norm": 1.8046875, "grad_norm_var": 2.328316243489583, "learning_rate": 5e-05, "loss": 0.1575, "loss/crossentropy": 2.8051905632019043, "loss/dist_ce": 0.0, "loss/hidden": 0.1201171875, "loss/idx": 0.0, "loss/logits": 0.037370190024375916, "step": 756 }, { "epoch": 0.006238822453167624, "grad_norm": 2.65625, "grad_norm_var": 2.2491689046223957, "learning_rate": 5e-05, "loss": 0.193, "loss/crossentropy": 2.3121681213378906, "loss/dist_ce": 0.0, "loss/hidden": 0.1494140625, "loss/idx": 0.0, "loss/logits": 0.043607860803604126, "step": 757 }, { "epoch": 0.006247063962352786, "grad_norm": 1.7265625, "grad_norm_var": 2.2535634358723957, "learning_rate": 5e-05, "loss": 0.1707, "loss/crossentropy": 2.660825490951538, "loss/dist_ce": 0.0, "loss/hidden": 0.130859375, "loss/idx": 0.0, "loss/logits": 0.03984237462282181, "step": 758 }, { "epoch": 0.006255305471537948, "grad_norm": 2.46875, "grad_norm_var": 2.224800618489583, "learning_rate": 5e-05, "loss": 0.1356, "loss/crossentropy": 1.1409167051315308, "loss/dist_ce": 0.0, "loss/hidden": 0.11279296875, "loss/idx": 0.0, "loss/logits": 0.02284255623817444, "step": 759 }, { "epoch": 0.00626354698072311, "grad_norm": 1.96875, "grad_norm_var": 2.171744791666667, "learning_rate": 5e-05, "loss": 0.1432, "loss/crossentropy": 1.4278790950775146, "loss/dist_ce": 0.0, "loss/hidden": 0.115234375, "loss/idx": 0.0, "loss/logits": 0.02792993187904358, "step": 760 }, { "epoch": 0.006271788489908272, "grad_norm": 2.875, "grad_norm_var": 2.0974110921223956, "learning_rate": 5e-05, "loss": 0.1644, "loss/crossentropy": 1.693648099899292, "loss/dist_ce": 0.0, "loss/hidden": 0.1357421875, "loss/idx": 0.0, "loss/logits": 0.028608174994587898, "step": 761 }, { "epoch": 0.006280029999093434, "grad_norm": 1.4140625, "grad_norm_var": 1.5294837951660156, "learning_rate": 5e-05, "loss": 0.1372, "loss/crossentropy": 1.6515194177627563, "loss/dist_ce": 0.0, "loss/hidden": 0.1171875, "loss/idx": 0.0, "loss/logits": 0.020010514184832573, "step": 762 }, { "epoch": 0.006288271508278596, "grad_norm": 3.578125, "grad_norm_var": 1.2993995666503906, "learning_rate": 5e-05, "loss": 0.228, "loss/crossentropy": 2.8086752891540527, "loss/dist_ce": 0.0, "loss/hidden": 0.17578125, "loss/idx": 0.0, "loss/logits": 0.052195869386196136, "step": 763 }, { "epoch": 0.006296513017463758, "grad_norm": 2.515625, "grad_norm_var": 1.2980567932128906, "learning_rate": 5e-05, "loss": 0.1333, "loss/crossentropy": 2.0333292484283447, "loss/dist_ce": 0.0, "loss/hidden": 0.109375, "loss/idx": 0.0, "loss/logits": 0.023911556228995323, "step": 764 }, { "epoch": 0.00630475452664892, "grad_norm": 1.703125, "grad_norm_var": 1.3359840393066407, "learning_rate": 5e-05, "loss": 0.1194, "loss/crossentropy": 1.3590047359466553, "loss/dist_ce": 0.0, "loss/hidden": 0.10302734375, "loss/idx": 0.0, "loss/logits": 0.016341259703040123, "step": 765 }, { "epoch": 0.006312996035834082, "grad_norm": 1.875, "grad_norm_var": 1.3181630452473958, "learning_rate": 5e-05, "loss": 0.1641, "loss/crossentropy": 2.541020631790161, "loss/dist_ce": 0.0, "loss/hidden": 0.12890625, "loss/idx": 0.0, "loss/logits": 0.03523167222738266, "step": 766 }, { "epoch": 0.006321237545019244, "grad_norm": 2.15625, "grad_norm_var": 0.3344960530598958, "learning_rate": 5e-05, "loss": 0.1642, "loss/crossentropy": 3.1116065979003906, "loss/dist_ce": 0.0, "loss/hidden": 0.1259765625, "loss/idx": 0.0, "loss/logits": 0.038257598876953125, "step": 767 }, { "epoch": 0.0063294790542044056, "grad_norm": 3.984375, "grad_norm_var": 0.5136220296223958, "learning_rate": 5e-05, "loss": 0.2127, "loss/crossentropy": 0.4452613890171051, "loss/dist_ce": 0.0, "loss/hidden": 0.1884765625, "loss/idx": 0.0, "loss/logits": 0.024267811328172684, "step": 768 }, { "epoch": 0.006337720563389568, "grad_norm": 2.421875, "grad_norm_var": 0.5133778889973958, "learning_rate": 5e-05, "loss": 0.1991, "loss/crossentropy": 2.1372740268707275, "loss/dist_ce": 0.0, "loss/hidden": 0.1572265625, "loss/idx": 0.0, "loss/logits": 0.04187183082103729, "step": 769 }, { "epoch": 0.00634596207257473, "grad_norm": 2.1875, "grad_norm_var": 0.4933502197265625, "learning_rate": 5e-05, "loss": 0.147, "loss/crossentropy": 1.2034099102020264, "loss/dist_ce": 0.0, "loss/hidden": 0.1220703125, "loss/idx": 0.0, "loss/logits": 0.024930456653237343, "step": 770 }, { "epoch": 0.006354203581759892, "grad_norm": 1.640625, "grad_norm_var": 0.4993263244628906, "learning_rate": 5e-05, "loss": 0.154, "loss/crossentropy": 1.4210480451583862, "loss/dist_ce": 0.0, "loss/hidden": 0.125, "loss/idx": 0.0, "loss/logits": 0.0289776474237442, "step": 771 }, { "epoch": 0.006362445090945054, "grad_norm": 2.90625, "grad_norm_var": 0.5007965087890625, "learning_rate": 5e-05, "loss": 0.1915, "loss/crossentropy": 2.461935520172119, "loss/dist_ce": 0.0, "loss/hidden": 0.1494140625, "loss/idx": 0.0, "loss/logits": 0.042080432176589966, "step": 772 }, { "epoch": 0.006370686600130216, "grad_norm": 3.984375, "grad_norm_var": 0.6599812825520833, "learning_rate": 5e-05, "loss": 0.2646, "loss/crossentropy": 2.870908260345459, "loss/dist_ce": 0.0, "loss/hidden": 0.2041015625, "loss/idx": 0.0, "loss/logits": 0.06047297269105911, "step": 773 }, { "epoch": 0.006378928109315378, "grad_norm": 2.015625, "grad_norm_var": 0.6368242899576823, "learning_rate": 5e-05, "loss": 0.1692, "loss/crossentropy": 2.4187848567962646, "loss/dist_ce": 0.0, "loss/hidden": 0.134765625, "loss/idx": 0.0, "loss/logits": 0.03446546941995621, "step": 774 }, { "epoch": 0.00638716961850054, "grad_norm": 3.140625, "grad_norm_var": 0.6639442443847656, "learning_rate": 5e-05, "loss": 0.2993, "loss/crossentropy": 2.4693610668182373, "loss/dist_ce": 0.0, "loss/hidden": 0.21484375, "loss/idx": 0.0, "loss/logits": 0.08449941873550415, "step": 775 }, { "epoch": 0.006395411127685702, "grad_norm": 1.7578125, "grad_norm_var": 0.68231201171875, "learning_rate": 5e-05, "loss": 0.1579, "loss/crossentropy": 2.635270833969116, "loss/dist_ce": 0.0, "loss/hidden": 0.1259765625, "loss/idx": 0.0, "loss/logits": 0.031969744712114334, "step": 776 }, { "epoch": 0.0064036526368708635, "grad_norm": 1.203125, "grad_norm_var": 0.7755930582682292, "learning_rate": 5e-05, "loss": 0.1124, "loss/crossentropy": 1.4390558004379272, "loss/dist_ce": 0.0, "loss/hidden": 0.0966796875, "loss/idx": 0.0, "loss/logits": 0.015761706978082657, "step": 777 }, { "epoch": 0.006411894146056025, "grad_norm": 4.09375, "grad_norm_var": 0.8702369689941406, "learning_rate": 5e-05, "loss": 0.3235, "loss/crossentropy": 1.998376727104187, "loss/dist_ce": 0.0, "loss/hidden": 0.263671875, "loss/idx": 0.0, "loss/logits": 0.05982797592878342, "step": 778 }, { "epoch": 0.006420135655241188, "grad_norm": 2.015625, "grad_norm_var": 0.813372548421224, "learning_rate": 5e-05, "loss": 0.2049, "loss/crossentropy": 1.550020694732666, "loss/dist_ce": 0.0, "loss/hidden": 0.16015625, "loss/idx": 0.0, "loss/logits": 0.04472289979457855, "step": 779 }, { "epoch": 0.00642837716442635, "grad_norm": 1.2578125, "grad_norm_var": 0.90545654296875, "learning_rate": 5e-05, "loss": 0.1262, "loss/crossentropy": 2.6197104454040527, "loss/dist_ce": 0.0, "loss/hidden": 0.099609375, "loss/idx": 0.0, "loss/logits": 0.02659156545996666, "step": 780 }, { "epoch": 0.006436618673611512, "grad_norm": 2.25, "grad_norm_var": 0.8735911051432291, "learning_rate": 5e-05, "loss": 0.1781, "loss/crossentropy": 2.597325563430786, "loss/dist_ce": 0.0, "loss/hidden": 0.1328125, "loss/idx": 0.0, "loss/logits": 0.045299261808395386, "step": 781 }, { "epoch": 0.006444860182796674, "grad_norm": 2.09375, "grad_norm_var": 0.8603749593098958, "learning_rate": 5e-05, "loss": 0.1826, "loss/crossentropy": 1.9669688940048218, "loss/dist_ce": 0.0, "loss/hidden": 0.1455078125, "loss/idx": 0.0, "loss/logits": 0.03709650784730911, "step": 782 }, { "epoch": 0.006453101691981836, "grad_norm": 1.703125, "grad_norm_var": 0.89061279296875, "learning_rate": 5e-05, "loss": 0.1355, "loss/crossentropy": 1.5600085258483887, "loss/dist_ce": 0.0, "loss/hidden": 0.111328125, "loss/idx": 0.0, "loss/logits": 0.02414374053478241, "step": 783 }, { "epoch": 0.006461343201166998, "grad_norm": 3.65625, "grad_norm_var": 0.8287261962890625, "learning_rate": 5e-05, "loss": 0.2038, "loss/crossentropy": 2.052290678024292, "loss/dist_ce": 0.0, "loss/hidden": 0.169921875, "loss/idx": 0.0, "loss/logits": 0.033876098692417145, "step": 784 }, { "epoch": 0.0064695847103521595, "grad_norm": 1.78125, "grad_norm_var": 0.8521240234375, "learning_rate": 5e-05, "loss": 0.1581, "loss/crossentropy": 0.9798577427864075, "loss/dist_ce": 0.0, "loss/hidden": 0.134765625, "loss/idx": 0.0, "loss/logits": 0.023312989622354507, "step": 785 }, { "epoch": 0.006477826219537321, "grad_norm": 2.296875, "grad_norm_var": 0.8504221598307292, "learning_rate": 5e-05, "loss": 0.147, "loss/crossentropy": 2.4323980808258057, "loss/dist_ce": 0.0, "loss/hidden": 0.11865234375, "loss/idx": 0.0, "loss/logits": 0.028353385627269745, "step": 786 }, { "epoch": 0.006486067728722483, "grad_norm": 3.484375, "grad_norm_var": 0.8854726155598959, "learning_rate": 5e-05, "loss": 0.2295, "loss/crossentropy": 2.9406378269195557, "loss/dist_ce": 0.0, "loss/hidden": 0.17578125, "loss/idx": 0.0, "loss/logits": 0.05376865714788437, "step": 787 }, { "epoch": 0.006494309237907646, "grad_norm": 1.71875, "grad_norm_var": 0.9057281494140625, "learning_rate": 5e-05, "loss": 0.1408, "loss/crossentropy": 2.3078272342681885, "loss/dist_ce": 0.0, "loss/hidden": 0.111328125, "loss/idx": 0.0, "loss/logits": 0.029444556683301926, "step": 788 }, { "epoch": 0.006502550747092808, "grad_norm": 1.125, "grad_norm_var": 0.81395263671875, "learning_rate": 5e-05, "loss": 0.1168, "loss/crossentropy": 1.5063586235046387, "loss/dist_ce": 0.0, "loss/hidden": 0.095703125, "loss/idx": 0.0, "loss/logits": 0.021064041182398796, "step": 789 }, { "epoch": 0.00651079225627797, "grad_norm": 2.09375, "grad_norm_var": 0.8121571858723958, "learning_rate": 5e-05, "loss": 0.1659, "loss/crossentropy": 2.5850398540496826, "loss/dist_ce": 0.0, "loss/hidden": 0.1337890625, "loss/idx": 0.0, "loss/logits": 0.03208760544657707, "step": 790 }, { "epoch": 0.006519033765463132, "grad_norm": 3.03125, "grad_norm_var": 0.7996175130208333, "learning_rate": 5e-05, "loss": 0.1852, "loss/crossentropy": 1.5392999649047852, "loss/dist_ce": 0.0, "loss/hidden": 0.146484375, "loss/idx": 0.0, "loss/logits": 0.038750022649765015, "step": 791 }, { "epoch": 0.006527275274648294, "grad_norm": 1.7890625, "grad_norm_var": 0.79774169921875, "learning_rate": 5e-05, "loss": 0.1902, "loss/crossentropy": 2.854396343231201, "loss/dist_ce": 0.0, "loss/hidden": 0.146484375, "loss/idx": 0.0, "loss/logits": 0.043708011507987976, "step": 792 }, { "epoch": 0.0065355167838334555, "grad_norm": 3.671875, "grad_norm_var": 0.8424235026041667, "learning_rate": 5e-05, "loss": 0.2264, "loss/crossentropy": 2.2643284797668457, "loss/dist_ce": 0.0, "loss/hidden": 0.18359375, "loss/idx": 0.0, "loss/logits": 0.04277587682008743, "step": 793 }, { "epoch": 0.006543758293018617, "grad_norm": 2.96875, "grad_norm_var": 0.6642985026041667, "learning_rate": 5e-05, "loss": 0.161, "loss/crossentropy": 2.5523831844329834, "loss/dist_ce": 0.0, "loss/hidden": 0.1279296875, "loss/idx": 0.0, "loss/logits": 0.0331122949719429, "step": 794 }, { "epoch": 0.006551999802203779, "grad_norm": 2.328125, "grad_norm_var": 0.6581949869791667, "learning_rate": 5e-05, "loss": 0.177, "loss/crossentropy": 3.067322015762329, "loss/dist_ce": 0.0, "loss/hidden": 0.1357421875, "loss/idx": 0.0, "loss/logits": 0.041216038167476654, "step": 795 }, { "epoch": 0.006560241311388941, "grad_norm": 1.171875, "grad_norm_var": 0.6709205627441406, "learning_rate": 5e-05, "loss": 0.1258, "loss/crossentropy": 1.477942943572998, "loss/dist_ce": 0.0, "loss/hidden": 0.1044921875, "loss/idx": 0.0, "loss/logits": 0.021268734708428383, "step": 796 }, { "epoch": 0.006568482820574104, "grad_norm": 7.375, "grad_norm_var": 2.2628069559733075, "learning_rate": 5e-05, "loss": 0.2407, "loss/crossentropy": 2.4961607456207275, "loss/dist_ce": 0.0, "loss/hidden": 0.19140625, "loss/idx": 0.0, "loss/logits": 0.049283482134342194, "step": 797 }, { "epoch": 0.006576724329759266, "grad_norm": 2.828125, "grad_norm_var": 2.2427263895670575, "learning_rate": 5e-05, "loss": 0.2416, "loss/crossentropy": 2.435840129852295, "loss/dist_ce": 0.0, "loss/hidden": 0.1826171875, "loss/idx": 0.0, "loss/logits": 0.05896880850195885, "step": 798 }, { "epoch": 0.006584965838944428, "grad_norm": 1.5703125, "grad_norm_var": 2.261286417643229, "learning_rate": 5e-05, "loss": 0.147, "loss/crossentropy": 1.4684247970581055, "loss/dist_ce": 0.0, "loss/hidden": 0.119140625, "loss/idx": 0.0, "loss/logits": 0.027891069650650024, "step": 799 }, { "epoch": 0.00659320734812959, "grad_norm": 1.296875, "grad_norm_var": 2.302298990885417, "learning_rate": 5e-05, "loss": 0.122, "loss/crossentropy": 1.3427636623382568, "loss/dist_ce": 0.0, "loss/hidden": 0.1015625, "loss/idx": 0.0, "loss/logits": 0.020389681681990623, "step": 800 }, { "epoch": 0.0066014488573147515, "grad_norm": 2.46875, "grad_norm_var": 2.2629109700520833, "learning_rate": 5e-05, "loss": 0.0932, "loss/crossentropy": 0.27775296568870544, "loss/dist_ce": 0.0, "loss/hidden": 0.08642578125, "loss/idx": 0.0, "loss/logits": 0.006820976734161377, "step": 801 }, { "epoch": 0.006609690366499913, "grad_norm": 1.4140625, "grad_norm_var": 2.344496409098307, "learning_rate": 5e-05, "loss": 0.1262, "loss/crossentropy": 0.8798704147338867, "loss/dist_ce": 0.0, "loss/hidden": 0.10791015625, "loss/idx": 0.0, "loss/logits": 0.018274936825037003, "step": 802 }, { "epoch": 0.006617931875685075, "grad_norm": 1.625, "grad_norm_var": 2.321738433837891, "learning_rate": 5e-05, "loss": 0.1389, "loss/crossentropy": 2.3500454425811768, "loss/dist_ce": 0.0, "loss/hidden": 0.1103515625, "loss/idx": 0.0, "loss/logits": 0.02855532616376877, "step": 803 }, { "epoch": 0.006626173384870237, "grad_norm": 3.21875, "grad_norm_var": 2.325156402587891, "learning_rate": 5e-05, "loss": 0.2325, "loss/crossentropy": 2.58161997795105, "loss/dist_ce": 0.0, "loss/hidden": 0.177734375, "loss/idx": 0.0, "loss/logits": 0.05475683510303497, "step": 804 }, { "epoch": 0.006634414894055399, "grad_norm": 2.265625, "grad_norm_var": 2.1975786844889322, "learning_rate": 5e-05, "loss": 0.2116, "loss/crossentropy": 2.6687753200531006, "loss/dist_ce": 0.0, "loss/hidden": 0.16015625, "loss/idx": 0.0, "loss/logits": 0.051458559930324554, "step": 805 }, { "epoch": 0.006642656403240561, "grad_norm": 3.078125, "grad_norm_var": 2.1956560770670572, "learning_rate": 5e-05, "loss": 0.1836, "loss/crossentropy": 2.341993808746338, "loss/dist_ce": 0.0, "loss/hidden": 0.150390625, "loss/idx": 0.0, "loss/logits": 0.033185191452503204, "step": 806 }, { "epoch": 0.006650897912425724, "grad_norm": 1.8515625, "grad_norm_var": 2.2197336832682293, "learning_rate": 5e-05, "loss": 0.1828, "loss/crossentropy": 2.465346336364746, "loss/dist_ce": 0.0, "loss/hidden": 0.1416015625, "loss/idx": 0.0, "loss/logits": 0.041189346462488174, "step": 807 }, { "epoch": 0.006659139421610886, "grad_norm": 1.09375, "grad_norm_var": 2.3212013244628906, "learning_rate": 5e-05, "loss": 0.1117, "loss/crossentropy": 1.3634965419769287, "loss/dist_ce": 0.0, "loss/hidden": 0.0927734375, "loss/idx": 0.0, "loss/logits": 0.01888800971210003, "step": 808 }, { "epoch": 0.0066673809307960475, "grad_norm": 1.046875, "grad_norm_var": 2.3466651916503904, "learning_rate": 5e-05, "loss": 0.095, "loss/crossentropy": 1.5679908990859985, "loss/dist_ce": 0.0, "loss/hidden": 0.08251953125, "loss/idx": 0.0, "loss/logits": 0.012483851984143257, "step": 809 }, { "epoch": 0.006675622439981209, "grad_norm": 2.046875, "grad_norm_var": 2.3237383524576822, "learning_rate": 5e-05, "loss": 0.1209, "loss/crossentropy": 1.3377238512039185, "loss/dist_ce": 0.0, "loss/hidden": 0.10107421875, "loss/idx": 0.0, "loss/logits": 0.01983119174838066, "step": 810 }, { "epoch": 0.006683863949166371, "grad_norm": 2.875, "grad_norm_var": 2.3450294494628907, "learning_rate": 5e-05, "loss": 0.2099, "loss/crossentropy": 1.367640495300293, "loss/dist_ce": 0.0, "loss/hidden": 0.1826171875, "loss/idx": 0.0, "loss/logits": 0.027279119938611984, "step": 811 }, { "epoch": 0.006692105458351533, "grad_norm": 2.453125, "grad_norm_var": 2.2503537495930988, "learning_rate": 5e-05, "loss": 0.2034, "loss/crossentropy": 2.2119548320770264, "loss/dist_ce": 0.0, "loss/hidden": 0.158203125, "loss/idx": 0.0, "loss/logits": 0.045244939625263214, "step": 812 }, { "epoch": 0.006700346967536695, "grad_norm": 2.140625, "grad_norm_var": 0.4953386942545573, "learning_rate": 5e-05, "loss": 0.1415, "loss/crossentropy": 1.21071457862854, "loss/dist_ce": 0.0, "loss/hidden": 0.12109375, "loss/idx": 0.0, "loss/logits": 0.020365629345178604, "step": 813 }, { "epoch": 0.006708588476721857, "grad_norm": 3.8125, "grad_norm_var": 0.6541460673014323, "learning_rate": 5e-05, "loss": 0.2487, "loss/crossentropy": 2.8512344360351562, "loss/dist_ce": 0.0, "loss/hidden": 0.193359375, "loss/idx": 0.0, "loss/logits": 0.055374931544065475, "step": 814 }, { "epoch": 0.006716829985907019, "grad_norm": 2.671875, "grad_norm_var": 0.6461496988932292, "learning_rate": 5e-05, "loss": 0.1403, "loss/crossentropy": 0.623081624507904, "loss/dist_ce": 0.0, "loss/hidden": 0.12158203125, "loss/idx": 0.0, "loss/logits": 0.01867399737238884, "step": 815 }, { "epoch": 0.006725071495092182, "grad_norm": 1.2421875, "grad_norm_var": 0.6529945373535156, "learning_rate": 5e-05, "loss": 0.1044, "loss/crossentropy": 0.35599735379219055, "loss/dist_ce": 0.0, "loss/hidden": 0.0966796875, "loss/idx": 0.0, "loss/logits": 0.007739436347037554, "step": 816 }, { "epoch": 0.0067333130042773436, "grad_norm": 2.0625, "grad_norm_var": 0.6491065979003906, "learning_rate": 5e-05, "loss": 0.1802, "loss/crossentropy": 2.3499081134796143, "loss/dist_ce": 0.0, "loss/hidden": 0.140625, "loss/idx": 0.0, "loss/logits": 0.039534226059913635, "step": 817 }, { "epoch": 0.0067415545134625054, "grad_norm": 2.953125, "grad_norm_var": 0.6397379557291667, "learning_rate": 5e-05, "loss": 0.1885, "loss/crossentropy": 1.8850847482681274, "loss/dist_ce": 0.0, "loss/hidden": 0.16015625, "loss/idx": 0.0, "loss/logits": 0.02836015075445175, "step": 818 }, { "epoch": 0.006749796022647667, "grad_norm": 2.703125, "grad_norm_var": 0.6186106363932292, "learning_rate": 5e-05, "loss": 0.216, "loss/crossentropy": 1.4330132007598877, "loss/dist_ce": 0.0, "loss/hidden": 0.171875, "loss/idx": 0.0, "loss/logits": 0.04408085718750954, "step": 819 }, { "epoch": 0.006758037531832829, "grad_norm": 3.296875, "grad_norm_var": 0.6280965169270833, "learning_rate": 5e-05, "loss": 0.2694, "loss/crossentropy": 1.9407634735107422, "loss/dist_ce": 0.0, "loss/hidden": 0.20703125, "loss/idx": 0.0, "loss/logits": 0.062371157109737396, "step": 820 }, { "epoch": 0.006766279041017991, "grad_norm": 1.53125, "grad_norm_var": 0.6700266520182292, "learning_rate": 5e-05, "loss": 0.1509, "loss/crossentropy": 1.891184687614441, "loss/dist_ce": 0.0, "loss/hidden": 0.1181640625, "loss/idx": 0.0, "loss/logits": 0.032700151205062866, "step": 821 }, { "epoch": 0.006774520550203153, "grad_norm": 2.921875, "grad_norm_var": 0.6554189046223958, "learning_rate": 5e-05, "loss": 0.212, "loss/crossentropy": 2.192823648452759, "loss/dist_ce": 0.0, "loss/hidden": 0.1650390625, "loss/idx": 0.0, "loss/logits": 0.04695805162191391, "step": 822 }, { "epoch": 0.006782762059388315, "grad_norm": 1.2734375, "grad_norm_var": 0.7104085286458334, "learning_rate": 5e-05, "loss": 0.1329, "loss/crossentropy": 1.4268293380737305, "loss/dist_ce": 0.0, "loss/hidden": 0.109375, "loss/idx": 0.0, "loss/logits": 0.023486068472266197, "step": 823 }, { "epoch": 0.006791003568573477, "grad_norm": 5.03125, "grad_norm_var": 1.0682698567708333, "learning_rate": 5e-05, "loss": 0.4061, "loss/crossentropy": 2.2408840656280518, "loss/dist_ce": 0.0, "loss/hidden": 0.30078125, "loss/idx": 0.0, "loss/logits": 0.10528542101383209, "step": 824 }, { "epoch": 0.00679924507775864, "grad_norm": 1.6328125, "grad_norm_var": 0.9758969624837239, "learning_rate": 5e-05, "loss": 0.1375, "loss/crossentropy": 2.6388261318206787, "loss/dist_ce": 0.0, "loss/hidden": 0.1083984375, "loss/idx": 0.0, "loss/logits": 0.029141269624233246, "step": 825 }, { "epoch": 0.0068074865869438015, "grad_norm": 1.328125, "grad_norm_var": 1.055492909749349, "learning_rate": 5e-05, "loss": 0.1284, "loss/crossentropy": 1.3783127069473267, "loss/dist_ce": 0.0, "loss/hidden": 0.1103515625, "loss/idx": 0.0, "loss/logits": 0.01809128187596798, "step": 826 }, { "epoch": 0.006815728096128963, "grad_norm": 1.140625, "grad_norm_var": 1.155761464436849, "learning_rate": 5e-05, "loss": 0.1161, "loss/crossentropy": 1.5458354949951172, "loss/dist_ce": 0.0, "loss/hidden": 0.0966796875, "loss/idx": 0.0, "loss/logits": 0.01938330940902233, "step": 827 }, { "epoch": 0.006823969605314125, "grad_norm": 3.234375, "grad_norm_var": 1.200774892171224, "learning_rate": 5e-05, "loss": 0.2329, "loss/crossentropy": 1.8702012300491333, "loss/dist_ce": 0.0, "loss/hidden": 0.1875, "loss/idx": 0.0, "loss/logits": 0.045410916209220886, "step": 828 }, { "epoch": 0.006832211114499287, "grad_norm": 5.4375, "grad_norm_var": 1.7502540588378905, "learning_rate": 5e-05, "loss": 0.3512, "loss/crossentropy": 2.7246875762939453, "loss/dist_ce": 0.0, "loss/hidden": 0.28125, "loss/idx": 0.0, "loss/logits": 0.0699634999036789, "step": 829 }, { "epoch": 0.006840452623684449, "grad_norm": 2.265625, "grad_norm_var": 1.6584083557128906, "learning_rate": 5e-05, "loss": 0.1294, "loss/crossentropy": 1.506727695465088, "loss/dist_ce": 0.0, "loss/hidden": 0.10791015625, "loss/idx": 0.0, "loss/logits": 0.021486353129148483, "step": 830 }, { "epoch": 0.006848694132869611, "grad_norm": 2.25, "grad_norm_var": 1.6624183654785156, "learning_rate": 5e-05, "loss": 0.1725, "loss/crossentropy": 2.507237434387207, "loss/dist_ce": 0.0, "loss/hidden": 0.1357421875, "loss/idx": 0.0, "loss/logits": 0.03672666847705841, "step": 831 }, { "epoch": 0.006856935642054773, "grad_norm": 1.96875, "grad_norm_var": 1.57171630859375, "learning_rate": 5e-05, "loss": 0.1601, "loss/crossentropy": 1.883087158203125, "loss/dist_ce": 0.0, "loss/hidden": 0.13671875, "loss/idx": 0.0, "loss/logits": 0.023375539109110832, "step": 832 }, { "epoch": 0.006865177151239935, "grad_norm": 1.296875, "grad_norm_var": 1.6595937093098958, "learning_rate": 5e-05, "loss": 0.1151, "loss/crossentropy": 2.2904350757598877, "loss/dist_ce": 0.0, "loss/hidden": 0.09375, "loss/idx": 0.0, "loss/logits": 0.02137480303645134, "step": 833 }, { "epoch": 0.006873418660425097, "grad_norm": 1.546875, "grad_norm_var": 1.7013417561848958, "learning_rate": 5e-05, "loss": 0.1588, "loss/crossentropy": 2.104323625564575, "loss/dist_ce": 0.0, "loss/hidden": 0.1279296875, "loss/idx": 0.0, "loss/logits": 0.03088623285293579, "step": 834 }, { "epoch": 0.006881660169610259, "grad_norm": 1.40625, "grad_norm_var": 1.7590087890625, "learning_rate": 5e-05, "loss": 0.1215, "loss/crossentropy": 1.4084559679031372, "loss/dist_ce": 0.0, "loss/hidden": 0.103515625, "loss/idx": 0.0, "loss/logits": 0.017994871363043785, "step": 835 }, { "epoch": 0.006889901678795421, "grad_norm": 1.5, "grad_norm_var": 1.7333892822265624, "learning_rate": 5e-05, "loss": 0.1111, "loss/crossentropy": 1.1881444454193115, "loss/dist_ce": 0.0, "loss/hidden": 0.095703125, "loss/idx": 0.0, "loss/logits": 0.01541186310350895, "step": 836 }, { "epoch": 0.006898143187980583, "grad_norm": 3.4375, "grad_norm_var": 1.7815419514973958, "learning_rate": 5e-05, "loss": 0.1738, "loss/crossentropy": 2.4373316764831543, "loss/dist_ce": 0.0, "loss/hidden": 0.1357421875, "loss/idx": 0.0, "loss/logits": 0.038025081157684326, "step": 837 }, { "epoch": 0.006906384697165745, "grad_norm": 1.34375, "grad_norm_var": 1.81781005859375, "learning_rate": 5e-05, "loss": 0.1448, "loss/crossentropy": 2.4372975826263428, "loss/dist_ce": 0.0, "loss/hidden": 0.1142578125, "loss/idx": 0.0, "loss/logits": 0.030569259077310562, "step": 838 }, { "epoch": 0.006914626206350907, "grad_norm": 2.171875, "grad_norm_var": 1.7505734761555989, "learning_rate": 5e-05, "loss": 0.1729, "loss/crossentropy": 1.551772117614746, "loss/dist_ce": 0.0, "loss/hidden": 0.14453125, "loss/idx": 0.0, "loss/logits": 0.028406476601958275, "step": 839 }, { "epoch": 0.006922867715536069, "grad_norm": 1.78125, "grad_norm_var": 1.2323931376139323, "learning_rate": 5e-05, "loss": 0.1473, "loss/crossentropy": 2.1337997913360596, "loss/dist_ce": 0.0, "loss/hidden": 0.1142578125, "loss/idx": 0.0, "loss/logits": 0.033026814460754395, "step": 840 }, { "epoch": 0.006931109224721231, "grad_norm": 2.453125, "grad_norm_var": 1.2223795572916667, "learning_rate": 5e-05, "loss": 0.1839, "loss/crossentropy": 2.638575792312622, "loss/dist_ce": 0.0, "loss/hidden": 0.1416015625, "loss/idx": 0.0, "loss/logits": 0.04230286926031113, "step": 841 }, { "epoch": 0.006939350733906393, "grad_norm": 2.390625, "grad_norm_var": 1.1750651041666667, "learning_rate": 5e-05, "loss": 0.2205, "loss/crossentropy": 2.267352342605591, "loss/dist_ce": 0.0, "loss/hidden": 0.169921875, "loss/idx": 0.0, "loss/logits": 0.05056004598736763, "step": 842 }, { "epoch": 0.0069475922430915545, "grad_norm": 4.34375, "grad_norm_var": 1.3525299072265624, "learning_rate": 5e-05, "loss": 0.3381, "loss/crossentropy": 2.6636710166931152, "loss/dist_ce": 0.0, "loss/hidden": 0.248046875, "loss/idx": 0.0, "loss/logits": 0.09001342952251434, "step": 843 }, { "epoch": 0.006955833752276717, "grad_norm": 3.109375, "grad_norm_var": 1.340046183268229, "learning_rate": 5e-05, "loss": 0.1769, "loss/crossentropy": 1.5471034049987793, "loss/dist_ce": 0.0, "loss/hidden": 0.1484375, "loss/idx": 0.0, "loss/logits": 0.02849499136209488, "step": 844 }, { "epoch": 0.006964075261461879, "grad_norm": 2.546875, "grad_norm_var": 0.698876953125, "learning_rate": 5e-05, "loss": 0.1365, "loss/crossentropy": 2.5193700790405273, "loss/dist_ce": 0.0, "loss/hidden": 0.107421875, "loss/idx": 0.0, "loss/logits": 0.029105795547366142, "step": 845 }, { "epoch": 0.006972316770647041, "grad_norm": 2.28125, "grad_norm_var": 0.6989491780598959, "learning_rate": 5e-05, "loss": 0.1387, "loss/crossentropy": 1.5674251317977905, "loss/dist_ce": 0.0, "loss/hidden": 0.11376953125, "loss/idx": 0.0, "loss/logits": 0.024912243708968163, "step": 846 }, { "epoch": 0.006980558279832203, "grad_norm": 3.28125, "grad_norm_var": 0.7668935139973958, "learning_rate": 5e-05, "loss": 0.1924, "loss/crossentropy": 1.3732967376708984, "loss/dist_ce": 0.0, "loss/hidden": 0.15625, "loss/idx": 0.0, "loss/logits": 0.036177463829517365, "step": 847 }, { "epoch": 0.006988799789017365, "grad_norm": 4.0625, "grad_norm_var": 0.9473704020182292, "learning_rate": 5e-05, "loss": 0.2466, "loss/crossentropy": 1.4407217502593994, "loss/dist_ce": 0.0, "loss/hidden": 0.19921875, "loss/idx": 0.0, "loss/logits": 0.04741185903549194, "step": 848 }, { "epoch": 0.006997041298202527, "grad_norm": 1.5078125, "grad_norm_var": 0.9181536356608073, "learning_rate": 5e-05, "loss": 0.1338, "loss/crossentropy": 2.713207721710205, "loss/dist_ce": 0.0, "loss/hidden": 0.1064453125, "loss/idx": 0.0, "loss/logits": 0.027391444891691208, "step": 849 }, { "epoch": 0.007005282807387689, "grad_norm": 2.5, "grad_norm_var": 0.8604448954264323, "learning_rate": 5e-05, "loss": 0.2414, "loss/crossentropy": 1.8647682666778564, "loss/dist_ce": 0.0, "loss/hidden": 0.203125, "loss/idx": 0.0, "loss/logits": 0.038255929946899414, "step": 850 }, { "epoch": 0.0070135243165728505, "grad_norm": 3.640625, "grad_norm_var": 0.8444435119628906, "learning_rate": 5e-05, "loss": 0.1953, "loss/crossentropy": 3.0968027114868164, "loss/dist_ce": 0.0, "loss/hidden": 0.1513671875, "loss/idx": 0.0, "loss/logits": 0.04395551607012749, "step": 851 }, { "epoch": 0.0070217658257580124, "grad_norm": 1.1015625, "grad_norm_var": 0.9152984619140625, "learning_rate": 5e-05, "loss": 0.1179, "loss/crossentropy": 1.364888072013855, "loss/dist_ce": 0.0, "loss/hidden": 0.09765625, "loss/idx": 0.0, "loss/logits": 0.020214572548866272, "step": 852 }, { "epoch": 0.007030007334943175, "grad_norm": 4.03125, "grad_norm_var": 1.0018870035807292, "learning_rate": 5e-05, "loss": 0.2262, "loss/crossentropy": 1.5738481283187866, "loss/dist_ce": 0.0, "loss/hidden": 0.1923828125, "loss/idx": 0.0, "loss/logits": 0.03383617848157883, "step": 853 }, { "epoch": 0.007038248844128337, "grad_norm": 2.375, "grad_norm_var": 0.8874827067057292, "learning_rate": 5e-05, "loss": 0.147, "loss/crossentropy": 1.7854039669036865, "loss/dist_ce": 0.0, "loss/hidden": 0.125, "loss/idx": 0.0, "loss/logits": 0.02201111428439617, "step": 854 }, { "epoch": 0.007046490353313499, "grad_norm": 1.078125, "grad_norm_var": 1.0427154541015624, "learning_rate": 5e-05, "loss": 0.1042, "loss/crossentropy": 0.8679842352867126, "loss/dist_ce": 0.0, "loss/hidden": 0.0908203125, "loss/idx": 0.0, "loss/logits": 0.013427493162453175, "step": 855 }, { "epoch": 0.007054731862498661, "grad_norm": 2.0, "grad_norm_var": 1.020213826497396, "learning_rate": 5e-05, "loss": 0.1693, "loss/crossentropy": 1.9278184175491333, "loss/dist_ce": 0.0, "loss/hidden": 0.1328125, "loss/idx": 0.0, "loss/logits": 0.03644675761461258, "step": 856 }, { "epoch": 0.007062973371683823, "grad_norm": 2.265625, "grad_norm_var": 1.0278065999348958, "learning_rate": 5e-05, "loss": 0.1806, "loss/crossentropy": 1.801297664642334, "loss/dist_ce": 0.0, "loss/hidden": 0.142578125, "loss/idx": 0.0, "loss/logits": 0.03802645206451416, "step": 857 }, { "epoch": 0.007071214880868985, "grad_norm": 2.390625, "grad_norm_var": 1.0278065999348958, "learning_rate": 5e-05, "loss": 0.1861, "loss/crossentropy": 2.6730916500091553, "loss/dist_ce": 0.0, "loss/hidden": 0.142578125, "loss/idx": 0.0, "loss/logits": 0.04350658878684044, "step": 858 }, { "epoch": 0.0070794563900541466, "grad_norm": 1.625, "grad_norm_var": 0.8784169514973958, "learning_rate": 5e-05, "loss": 0.1474, "loss/crossentropy": 2.8773488998413086, "loss/dist_ce": 0.0, "loss/hidden": 0.1142578125, "loss/idx": 0.0, "loss/logits": 0.033148590475320816, "step": 859 }, { "epoch": 0.0070876978992393085, "grad_norm": 2.171875, "grad_norm_var": 0.8555898030598958, "learning_rate": 5e-05, "loss": 0.1824, "loss/crossentropy": 2.271367311477661, "loss/dist_ce": 0.0, "loss/hidden": 0.1435546875, "loss/idx": 0.0, "loss/logits": 0.03879944607615471, "step": 860 }, { "epoch": 0.00709593940842447, "grad_norm": 2.078125, "grad_norm_var": 0.8619374593098958, "learning_rate": 5e-05, "loss": 0.1606, "loss/crossentropy": 2.637540340423584, "loss/dist_ce": 0.0, "loss/hidden": 0.126953125, "loss/idx": 0.0, "loss/logits": 0.033611979335546494, "step": 861 }, { "epoch": 0.007104180917609632, "grad_norm": 1.375, "grad_norm_var": 0.9275461832682291, "learning_rate": 5e-05, "loss": 0.1324, "loss/crossentropy": 1.5838335752487183, "loss/dist_ce": 0.0, "loss/hidden": 0.109375, "loss/idx": 0.0, "loss/logits": 0.02299325168132782, "step": 862 }, { "epoch": 0.007112422426794795, "grad_norm": 1.1484375, "grad_norm_var": 0.9449724833170573, "learning_rate": 5e-05, "loss": 0.1209, "loss/crossentropy": 1.4155738353729248, "loss/dist_ce": 0.0, "loss/hidden": 0.1015625, "loss/idx": 0.0, "loss/logits": 0.019340351223945618, "step": 863 }, { "epoch": 0.007120663935979957, "grad_norm": 29.625, "grad_norm_var": 48.10079523722331, "learning_rate": 5e-05, "loss": 0.374, "loss/crossentropy": 2.0414719581604004, "loss/dist_ce": 0.0, "loss/hidden": 0.32421875, "loss/idx": 0.0, "loss/logits": 0.049798041582107544, "step": 864 }, { "epoch": 0.007128905445165119, "grad_norm": 1.3515625, "grad_norm_var": 48.15022354125976, "learning_rate": 5e-05, "loss": 0.1175, "loss/crossentropy": 1.4641659259796143, "loss/dist_ce": 0.0, "loss/hidden": 0.0986328125, "loss/idx": 0.0, "loss/logits": 0.018840216100215912, "step": 865 }, { "epoch": 0.007137146954350281, "grad_norm": 1.2890625, "grad_norm_var": 48.451341756184895, "learning_rate": 5e-05, "loss": 0.1129, "loss/crossentropy": 0.8852246403694153, "loss/dist_ce": 0.0, "loss/hidden": 0.09619140625, "loss/idx": 0.0, "loss/logits": 0.01674000360071659, "step": 866 }, { "epoch": 0.007145388463535443, "grad_norm": 1.4296875, "grad_norm_var": 48.78075129191081, "learning_rate": 5e-05, "loss": 0.1627, "loss/crossentropy": 2.618283748626709, "loss/dist_ce": 0.0, "loss/hidden": 0.1240234375, "loss/idx": 0.0, "loss/logits": 0.03862760215997696, "step": 867 }, { "epoch": 0.0071536299727206045, "grad_norm": 2.390625, "grad_norm_var": 48.45802408854167, "learning_rate": 5e-05, "loss": 0.1756, "loss/crossentropy": 1.5870869159698486, "loss/dist_ce": 0.0, "loss/hidden": 0.15234375, "loss/idx": 0.0, "loss/logits": 0.0232391357421875, "step": 868 }, { "epoch": 0.007161871481905766, "grad_norm": 2.375, "grad_norm_var": 48.54838460286458, "learning_rate": 5e-05, "loss": 0.1535, "loss/crossentropy": 1.9420498609542847, "loss/dist_ce": 0.0, "loss/hidden": 0.1220703125, "loss/idx": 0.0, "loss/logits": 0.03145188093185425, "step": 869 }, { "epoch": 0.007170112991090928, "grad_norm": 1.375, "grad_norm_var": 48.76895751953125, "learning_rate": 5e-05, "loss": 0.1167, "loss/crossentropy": 1.6243377923965454, "loss/dist_ce": 0.0, "loss/hidden": 0.0947265625, "loss/idx": 0.0, "loss/logits": 0.021999340504407883, "step": 870 }, { "epoch": 0.00717835450027609, "grad_norm": 1.6484375, "grad_norm_var": 48.60527114868164, "learning_rate": 5e-05, "loss": 0.1465, "loss/crossentropy": 0.5207220911979675, "loss/dist_ce": 0.0, "loss/hidden": 0.130859375, "loss/idx": 0.0, "loss/logits": 0.015599271282553673, "step": 871 }, { "epoch": 0.007186596009461253, "grad_norm": 1.6015625, "grad_norm_var": 48.69667053222656, "learning_rate": 5e-05, "loss": 0.1485, "loss/crossentropy": 2.410845994949341, "loss/dist_ce": 0.0, "loss/hidden": 0.1181640625, "loss/idx": 0.0, "loss/logits": 0.030334439128637314, "step": 872 }, { "epoch": 0.007194837518646415, "grad_norm": 2.46875, "grad_norm_var": 48.66558024088542, "learning_rate": 5e-05, "loss": 0.1478, "loss/crossentropy": 1.4278953075408936, "loss/dist_ce": 0.0, "loss/hidden": 0.12890625, "loss/idx": 0.0, "loss/logits": 0.01892838627099991, "step": 873 }, { "epoch": 0.007203079027831577, "grad_norm": 2.015625, "grad_norm_var": 48.73091227213542, "learning_rate": 5e-05, "loss": 0.1412, "loss/crossentropy": 2.3847408294677734, "loss/dist_ce": 0.0, "loss/hidden": 0.111328125, "loss/idx": 0.0, "loss/logits": 0.029860273003578186, "step": 874 }, { "epoch": 0.007211320537016739, "grad_norm": 2.4375, "grad_norm_var": 48.56925862630208, "learning_rate": 5e-05, "loss": 0.2361, "loss/crossentropy": 2.355764627456665, "loss/dist_ce": 0.0, "loss/hidden": 0.185546875, "loss/idx": 0.0, "loss/logits": 0.05050516501069069, "step": 875 }, { "epoch": 0.0072195620462019005, "grad_norm": 1.6875, "grad_norm_var": 48.67285054524739, "learning_rate": 5e-05, "loss": 0.2005, "loss/crossentropy": 2.284627676010132, "loss/dist_ce": 0.0, "loss/hidden": 0.158203125, "loss/idx": 0.0, "loss/logits": 0.04234454780817032, "step": 876 }, { "epoch": 0.007227803555387062, "grad_norm": 3.296875, "grad_norm_var": 48.53161519368489, "learning_rate": 5e-05, "loss": 0.2251, "loss/crossentropy": 2.510892152786255, "loss/dist_ce": 0.0, "loss/hidden": 0.18359375, "loss/idx": 0.0, "loss/logits": 0.041470736265182495, "step": 877 }, { "epoch": 0.007236045064572224, "grad_norm": 1.90625, "grad_norm_var": 48.39202372233073, "learning_rate": 5e-05, "loss": 0.1637, "loss/crossentropy": 1.8948522806167603, "loss/dist_ce": 0.0, "loss/hidden": 0.1328125, "loss/idx": 0.0, "loss/logits": 0.030879024416208267, "step": 878 }, { "epoch": 0.007244286573757386, "grad_norm": 2.078125, "grad_norm_var": 48.13868993123372, "learning_rate": 5e-05, "loss": 0.1593, "loss/crossentropy": 1.3858805894851685, "loss/dist_ce": 0.0, "loss/hidden": 0.1259765625, "loss/idx": 0.0, "loss/logits": 0.03331441059708595, "step": 879 }, { "epoch": 0.007252528082942548, "grad_norm": 2.171875, "grad_norm_var": 0.29590021769205727, "learning_rate": 5e-05, "loss": 0.1446, "loss/crossentropy": 1.4740588665008545, "loss/dist_ce": 0.0, "loss/hidden": 0.1181640625, "loss/idx": 0.0, "loss/logits": 0.02642858400940895, "step": 880 }, { "epoch": 0.007260769592127711, "grad_norm": 3.1875, "grad_norm_var": 0.3551259358723958, "learning_rate": 5e-05, "loss": 0.1482, "loss/crossentropy": 1.0553547143936157, "loss/dist_ce": 0.0, "loss/hidden": 0.12890625, "loss/idx": 0.0, "loss/logits": 0.01929531991481781, "step": 881 }, { "epoch": 0.007269011101312873, "grad_norm": 2.375, "grad_norm_var": 0.31359024047851564, "learning_rate": 5e-05, "loss": 0.1372, "loss/crossentropy": 1.9523429870605469, "loss/dist_ce": 0.0, "loss/hidden": 0.109375, "loss/idx": 0.0, "loss/logits": 0.027855783700942993, "step": 882 }, { "epoch": 0.007277252610498035, "grad_norm": 1.390625, "grad_norm_var": 0.3174519856770833, "learning_rate": 5e-05, "loss": 0.0988, "loss/crossentropy": 0.5503354072570801, "loss/dist_ce": 0.0, "loss/hidden": 0.0888671875, "loss/idx": 0.0, "loss/logits": 0.009944621473550797, "step": 883 }, { "epoch": 0.0072854941196831965, "grad_norm": 1.2578125, "grad_norm_var": 0.36137059529622395, "learning_rate": 5e-05, "loss": 0.1208, "loss/crossentropy": 1.7299476861953735, "loss/dist_ce": 0.0, "loss/hidden": 0.10009765625, "loss/idx": 0.0, "loss/logits": 0.020662667229771614, "step": 884 }, { "epoch": 0.007293735628868358, "grad_norm": 3.65625, "grad_norm_var": 0.5144365946451823, "learning_rate": 5e-05, "loss": 0.1408, "loss/crossentropy": 1.2476385831832886, "loss/dist_ce": 0.0, "loss/hidden": 0.115234375, "loss/idx": 0.0, "loss/logits": 0.025581957772374153, "step": 885 }, { "epoch": 0.00730197713805352, "grad_norm": 1.078125, "grad_norm_var": 0.5510047912597656, "learning_rate": 5e-05, "loss": 0.103, "loss/crossentropy": 1.68153715133667, "loss/dist_ce": 0.0, "loss/hidden": 0.0849609375, "loss/idx": 0.0, "loss/logits": 0.018041210249066353, "step": 886 }, { "epoch": 0.007310218647238682, "grad_norm": 2.4375, "grad_norm_var": 0.5380849202473958, "learning_rate": 5e-05, "loss": 0.1718, "loss/crossentropy": 2.690136671066284, "loss/dist_ce": 0.0, "loss/hidden": 0.1318359375, "loss/idx": 0.0, "loss/logits": 0.03993324190378189, "step": 887 }, { "epoch": 0.007318460156423844, "grad_norm": 1.59375, "grad_norm_var": 0.5387021382649739, "learning_rate": 5e-05, "loss": 0.1553, "loss/crossentropy": 1.3326576948165894, "loss/dist_ce": 0.0, "loss/hidden": 0.1240234375, "loss/idx": 0.0, "loss/logits": 0.03129717335104942, "step": 888 }, { "epoch": 0.007326701665609006, "grad_norm": 2.109375, "grad_norm_var": 0.5334144592285156, "learning_rate": 5e-05, "loss": 0.1613, "loss/crossentropy": 2.293314218521118, "loss/dist_ce": 0.0, "loss/hidden": 0.12890625, "loss/idx": 0.0, "loss/logits": 0.03244052827358246, "step": 889 }, { "epoch": 0.007334943174794169, "grad_norm": 2.3125, "grad_norm_var": 0.5329119364420573, "learning_rate": 5e-05, "loss": 0.1668, "loss/crossentropy": 2.6757304668426514, "loss/dist_ce": 0.0, "loss/hidden": 0.12890625, "loss/idx": 0.0, "loss/logits": 0.03789503872394562, "step": 890 }, { "epoch": 0.007343184683979331, "grad_norm": 2.265625, "grad_norm_var": 0.5289955139160156, "learning_rate": 5e-05, "loss": 0.172, "loss/crossentropy": 1.9997798204421997, "loss/dist_ce": 0.0, "loss/hidden": 0.134765625, "loss/idx": 0.0, "loss/logits": 0.03719984367489815, "step": 891 }, { "epoch": 0.0073514261931644925, "grad_norm": 1.5703125, "grad_norm_var": 0.5374755859375, "learning_rate": 5e-05, "loss": 0.1209, "loss/crossentropy": 0.4429371654987335, "loss/dist_ce": 0.0, "loss/hidden": 0.1015625, "loss/idx": 0.0, "loss/logits": 0.019379278644919395, "step": 892 }, { "epoch": 0.007359667702349654, "grad_norm": 1.3515625, "grad_norm_var": 0.48118057250976565, "learning_rate": 5e-05, "loss": 0.0994, "loss/crossentropy": 0.2325073629617691, "loss/dist_ce": 0.0, "loss/hidden": 0.09375, "loss/idx": 0.0, "loss/logits": 0.005671404767781496, "step": 893 }, { "epoch": 0.007367909211534816, "grad_norm": 1.328125, "grad_norm_var": 0.512872060139974, "learning_rate": 5e-05, "loss": 0.1444, "loss/crossentropy": 2.507219076156616, "loss/dist_ce": 0.0, "loss/hidden": 0.109375, "loss/idx": 0.0, "loss/logits": 0.035071056336164474, "step": 894 }, { "epoch": 0.007376150720719978, "grad_norm": 2.234375, "grad_norm_var": 0.5158119201660156, "learning_rate": 5e-05, "loss": 0.1701, "loss/crossentropy": 2.556119918823242, "loss/dist_ce": 0.0, "loss/hidden": 0.1328125, "loss/idx": 0.0, "loss/logits": 0.037296053022146225, "step": 895 }, { "epoch": 0.00738439222990514, "grad_norm": 2.3125, "grad_norm_var": 0.5198951721191406, "learning_rate": 5e-05, "loss": 0.2003, "loss/crossentropy": 2.8151919841766357, "loss/dist_ce": 0.0, "loss/hidden": 0.15234375, "loss/idx": 0.0, "loss/logits": 0.04793722182512283, "step": 896 }, { "epoch": 0.007392633739090302, "grad_norm": 1.9296875, "grad_norm_var": 0.4244537353515625, "learning_rate": 5e-05, "loss": 0.2205, "loss/crossentropy": 2.4031951427459717, "loss/dist_ce": 0.0, "loss/hidden": 0.16796875, "loss/idx": 0.0, "loss/logits": 0.05256333947181702, "step": 897 }, { "epoch": 0.007400875248275464, "grad_norm": 2.21875, "grad_norm_var": 0.4171295166015625, "learning_rate": 5e-05, "loss": 0.1722, "loss/crossentropy": 3.179619312286377, "loss/dist_ce": 0.0, "loss/hidden": 0.1328125, "loss/idx": 0.0, "loss/logits": 0.0393570140004158, "step": 898 }, { "epoch": 0.007409116757460626, "grad_norm": 4.625, "grad_norm_var": 0.83385009765625, "learning_rate": 5e-05, "loss": 0.2282, "loss/crossentropy": 1.429465651512146, "loss/dist_ce": 0.0, "loss/hidden": 0.19921875, "loss/idx": 0.0, "loss/logits": 0.028934892266988754, "step": 899 }, { "epoch": 0.0074173582666457885, "grad_norm": 1.515625, "grad_norm_var": 0.8075904846191406, "learning_rate": 5e-05, "loss": 0.1484, "loss/crossentropy": 1.4451292753219604, "loss/dist_ce": 0.0, "loss/hidden": 0.1240234375, "loss/idx": 0.0, "loss/logits": 0.024369023740291595, "step": 900 }, { "epoch": 0.0074255997758309504, "grad_norm": 103.5, "grad_norm_var": 643.7922401428223, "learning_rate": 5e-05, "loss": 0.5508, "loss/crossentropy": 0.928047239780426, "loss/dist_ce": 0.0, "loss/hidden": 0.50390625, "loss/idx": 0.0, "loss/logits": 0.04687977582216263, "step": 901 }, { "epoch": 0.007433841285016112, "grad_norm": 1.0078125, "grad_norm_var": 643.861181640625, "learning_rate": 5e-05, "loss": 0.1275, "loss/crossentropy": 2.235487461090088, "loss/dist_ce": 0.0, "loss/hidden": 0.1044921875, "loss/idx": 0.0, "loss/logits": 0.022974800318479538, "step": 902 }, { "epoch": 0.007442082794201274, "grad_norm": 2.421875, "grad_norm_var": 643.8736073811849, "learning_rate": 5e-05, "loss": 0.1372, "loss/crossentropy": 1.5220298767089844, "loss/dist_ce": 0.0, "loss/hidden": 0.11328125, "loss/idx": 0.0, "loss/logits": 0.02395622618496418, "step": 903 }, { "epoch": 0.007450324303386436, "grad_norm": 2.34375, "grad_norm_var": 643.2287831624349, "learning_rate": 5e-05, "loss": 0.1307, "loss/crossentropy": 1.4191033840179443, "loss/dist_ce": 0.0, "loss/hidden": 0.10595703125, "loss/idx": 0.0, "loss/logits": 0.024734167382121086, "step": 904 }, { "epoch": 0.007458565812571598, "grad_norm": 5.125, "grad_norm_var": 641.2515462239584, "learning_rate": 5e-05, "loss": 0.3705, "loss/crossentropy": 1.2650396823883057, "loss/dist_ce": 0.0, "loss/hidden": 0.26953125, "loss/idx": 0.0, "loss/logits": 0.10096491873264313, "step": 905 }, { "epoch": 0.00746680732175676, "grad_norm": 3.234375, "grad_norm_var": 640.5282704671224, "learning_rate": 5e-05, "loss": 0.1926, "loss/crossentropy": 1.6492811441421509, "loss/dist_ce": 0.0, "loss/hidden": 0.154296875, "loss/idx": 0.0, "loss/logits": 0.0383470356464386, "step": 906 }, { "epoch": 0.007475048830941922, "grad_norm": 2.671875, "grad_norm_var": 640.190786743164, "learning_rate": 5e-05, "loss": 0.1486, "loss/crossentropy": 1.6101174354553223, "loss/dist_ce": 0.0, "loss/hidden": 0.12353515625, "loss/idx": 0.0, "loss/logits": 0.02505500242114067, "step": 907 }, { "epoch": 0.007483290340127084, "grad_norm": 3.21875, "grad_norm_var": 638.7909563700358, "learning_rate": 5e-05, "loss": 0.1805, "loss/crossentropy": 1.6961467266082764, "loss/dist_ce": 0.0, "loss/hidden": 0.150390625, "loss/idx": 0.0, "loss/logits": 0.030135734006762505, "step": 908 }, { "epoch": 0.0074915318493122465, "grad_norm": 3.28125, "grad_norm_var": 637.1034220377604, "learning_rate": 5e-05, "loss": 0.2268, "loss/crossentropy": 2.749086618423462, "loss/dist_ce": 0.0, "loss/hidden": 0.1787109375, "loss/idx": 0.0, "loss/logits": 0.048118021339178085, "step": 909 }, { "epoch": 0.007499773358497408, "grad_norm": 1.3515625, "grad_norm_var": 637.0796831766764, "learning_rate": 5e-05, "loss": 0.1485, "loss/crossentropy": 1.4836238622665405, "loss/dist_ce": 0.0, "loss/hidden": 0.1220703125, "loss/idx": 0.0, "loss/logits": 0.02646125853061676, "step": 910 }, { "epoch": 0.00750801486768257, "grad_norm": 2.21875, "grad_norm_var": 637.0936622619629, "learning_rate": 5e-05, "loss": 0.16, "loss/crossentropy": 1.5685328245162964, "loss/dist_ce": 0.0, "loss/hidden": 0.126953125, "loss/idx": 0.0, "loss/logits": 0.033031269907951355, "step": 911 }, { "epoch": 0.007516256376867732, "grad_norm": 1.7890625, "grad_norm_var": 637.5730539957682, "learning_rate": 5e-05, "loss": 0.1701, "loss/crossentropy": 0.7678622007369995, "loss/dist_ce": 0.0, "loss/hidden": 0.1435546875, "loss/idx": 0.0, "loss/logits": 0.02650710754096508, "step": 912 }, { "epoch": 0.007524497886052894, "grad_norm": 2.5625, "grad_norm_var": 637.0096819559733, "learning_rate": 5e-05, "loss": 0.1833, "loss/crossentropy": 2.773200273513794, "loss/dist_ce": 0.0, "loss/hidden": 0.138671875, "loss/idx": 0.0, "loss/logits": 0.04464181140065193, "step": 913 }, { "epoch": 0.007532739395238056, "grad_norm": 1.703125, "grad_norm_var": 637.4885821024577, "learning_rate": 5e-05, "loss": 0.1809, "loss/crossentropy": 2.4580299854278564, "loss/dist_ce": 0.0, "loss/hidden": 0.138671875, "loss/idx": 0.0, "loss/logits": 0.04220724105834961, "step": 914 }, { "epoch": 0.007540980904423218, "grad_norm": 1.6796875, "grad_norm_var": 639.71376953125, "learning_rate": 5e-05, "loss": 0.1522, "loss/crossentropy": 2.1130568981170654, "loss/dist_ce": 0.0, "loss/hidden": 0.1201171875, "loss/idx": 0.0, "loss/logits": 0.03203842043876648, "step": 915 }, { "epoch": 0.00754922241360838, "grad_norm": 1.84375, "grad_norm_var": 639.4050201416015, "learning_rate": 5e-05, "loss": 0.1293, "loss/crossentropy": 1.8496575355529785, "loss/dist_ce": 0.0, "loss/hidden": 0.103515625, "loss/idx": 0.0, "loss/logits": 0.025790153071284294, "step": 916 }, { "epoch": 0.007557463922793542, "grad_norm": 3.078125, "grad_norm_var": 0.9873331705729167, "learning_rate": 5e-05, "loss": 0.1361, "loss/crossentropy": 1.5800864696502686, "loss/dist_ce": 0.0, "loss/hidden": 0.1123046875, "loss/idx": 0.0, "loss/logits": 0.023767339065670967, "step": 917 }, { "epoch": 0.007565705431978704, "grad_norm": 1.8359375, "grad_norm_var": 0.8686676025390625, "learning_rate": 5e-05, "loss": 0.1117, "loss/crossentropy": 1.1425081491470337, "loss/dist_ce": 0.0, "loss/hidden": 0.09375, "loss/idx": 0.0, "loss/logits": 0.01797986589372158, "step": 918 }, { "epoch": 0.007573946941163866, "grad_norm": 3.578125, "grad_norm_var": 0.9367177327473958, "learning_rate": 5e-05, "loss": 0.1796, "loss/crossentropy": 2.826726198196411, "loss/dist_ce": 0.0, "loss/hidden": 0.142578125, "loss/idx": 0.0, "loss/logits": 0.036978572607040405, "step": 919 }, { "epoch": 0.007582188450349028, "grad_norm": 1.8671875, "grad_norm_var": 0.9668596903483073, "learning_rate": 5e-05, "loss": 0.1869, "loss/crossentropy": 2.1164968013763428, "loss/dist_ce": 0.0, "loss/hidden": 0.1484375, "loss/idx": 0.0, "loss/logits": 0.03847302123904228, "step": 920 }, { "epoch": 0.00759042995953419, "grad_norm": 4.40625, "grad_norm_var": 0.753808339436849, "learning_rate": 5e-05, "loss": 0.1586, "loss/crossentropy": 2.5811195373535156, "loss/dist_ce": 0.0, "loss/hidden": 0.1240234375, "loss/idx": 0.0, "loss/logits": 0.034622643142938614, "step": 921 }, { "epoch": 0.007598671468719352, "grad_norm": 1.65625, "grad_norm_var": 0.7591509501139323, "learning_rate": 5e-05, "loss": 0.1544, "loss/crossentropy": 2.408280611038208, "loss/dist_ce": 0.0, "loss/hidden": 0.1181640625, "loss/idx": 0.0, "loss/logits": 0.036218732595443726, "step": 922 }, { "epoch": 0.007606912977904514, "grad_norm": 1.78125, "grad_norm_var": 0.7789812723795573, "learning_rate": 5e-05, "loss": 0.1502, "loss/crossentropy": 1.7369745969772339, "loss/dist_ce": 0.0, "loss/hidden": 0.123046875, "loss/idx": 0.0, "loss/logits": 0.027189793065190315, "step": 923 }, { "epoch": 0.007615154487089676, "grad_norm": 4.75, "grad_norm_var": 1.0996864318847657, "learning_rate": 5e-05, "loss": 0.4286, "loss/crossentropy": 2.7182440757751465, "loss/dist_ce": 0.0, "loss/hidden": 0.3671875, "loss/idx": 0.0, "loss/logits": 0.061443451792001724, "step": 924 }, { "epoch": 0.007623395996274838, "grad_norm": 1.671875, "grad_norm_var": 1.0856463114420574, "learning_rate": 5e-05, "loss": 0.1476, "loss/crossentropy": 2.5658042430877686, "loss/dist_ce": 0.0, "loss/hidden": 0.1171875, "loss/idx": 0.0, "loss/logits": 0.030445091426372528, "step": 925 }, { "epoch": 0.0076316375054599995, "grad_norm": 1.578125, "grad_norm_var": 1.0583658854166667, "learning_rate": 5e-05, "loss": 0.1394, "loss/crossentropy": 1.2689893245697021, "loss/dist_ce": 0.0, "loss/hidden": 0.11279296875, "loss/idx": 0.0, "loss/logits": 0.02664242871105671, "step": 926 }, { "epoch": 0.007639879014645161, "grad_norm": 1.8125, "grad_norm_var": 1.0771443684895834, "learning_rate": 5e-05, "loss": 0.1498, "loss/crossentropy": 2.381871223449707, "loss/dist_ce": 0.0, "loss/hidden": 0.119140625, "loss/idx": 0.0, "loss/logits": 0.03070569597184658, "step": 927 }, { "epoch": 0.007648120523830324, "grad_norm": 2.25, "grad_norm_var": 1.0559730529785156, "learning_rate": 5e-05, "loss": 0.1529, "loss/crossentropy": 2.887047529220581, "loss/dist_ce": 0.0, "loss/hidden": 0.11962890625, "loss/idx": 0.0, "loss/logits": 0.033232178539037704, "step": 928 }, { "epoch": 0.007656362033015486, "grad_norm": 11.875, "grad_norm_var": 6.704707590738932, "learning_rate": 5e-05, "loss": 0.8791, "loss/crossentropy": 1.3170801401138306, "loss/dist_ce": 0.0, "loss/hidden": 0.734375, "loss/idx": 0.0, "loss/logits": 0.14475420117378235, "step": 929 }, { "epoch": 0.007664603542200648, "grad_norm": 1.6484375, "grad_norm_var": 6.7140625, "learning_rate": 5e-05, "loss": 0.127, "loss/crossentropy": 1.6071490049362183, "loss/dist_ce": 0.0, "loss/hidden": 0.10107421875, "loss/idx": 0.0, "loss/logits": 0.025960184633731842, "step": 930 }, { "epoch": 0.00767284505138581, "grad_norm": 1.8984375, "grad_norm_var": 6.67979736328125, "learning_rate": 5e-05, "loss": 0.1926, "loss/crossentropy": 2.840394973754883, "loss/dist_ce": 0.0, "loss/hidden": 0.1455078125, "loss/idx": 0.0, "loss/logits": 0.047091417014598846, "step": 931 }, { "epoch": 0.007681086560570972, "grad_norm": 1.9921875, "grad_norm_var": 6.658870188395182, "learning_rate": 5e-05, "loss": 0.155, "loss/crossentropy": 1.9618257284164429, "loss/dist_ce": 0.0, "loss/hidden": 0.1240234375, "loss/idx": 0.0, "loss/logits": 0.03101358562707901, "step": 932 }, { "epoch": 0.007689328069756134, "grad_norm": 1.6953125, "grad_norm_var": 6.760285441080729, "learning_rate": 5e-05, "loss": 0.1729, "loss/crossentropy": 0.5429065823554993, "loss/dist_ce": 0.0, "loss/hidden": 0.1552734375, "loss/idx": 0.0, "loss/logits": 0.017611898481845856, "step": 933 }, { "epoch": 0.0076975695789412955, "grad_norm": 1.953125, "grad_norm_var": 6.744618479410807, "learning_rate": 5e-05, "loss": 0.1526, "loss/crossentropy": 2.004976749420166, "loss/dist_ce": 0.0, "loss/hidden": 0.12255859375, "loss/idx": 0.0, "loss/logits": 0.030033178627490997, "step": 934 }, { "epoch": 0.007705811088126457, "grad_norm": 1.3046875, "grad_norm_var": 6.8623606363932295, "learning_rate": 5e-05, "loss": 0.106, "loss/crossentropy": 1.3220717906951904, "loss/dist_ce": 0.0, "loss/hidden": 0.08984375, "loss/idx": 0.0, "loss/logits": 0.01619834452867508, "step": 935 }, { "epoch": 0.007714052597311619, "grad_norm": 1.46875, "grad_norm_var": 6.919648996988932, "learning_rate": 5e-05, "loss": 0.1283, "loss/crossentropy": 1.6176024675369263, "loss/dist_ce": 0.0, "loss/hidden": 0.1044921875, "loss/idx": 0.0, "loss/logits": 0.023826373741030693, "step": 936 }, { "epoch": 0.007722294106496782, "grad_norm": 2.296875, "grad_norm_var": 6.727388254801432, "learning_rate": 5e-05, "loss": 0.1905, "loss/crossentropy": 2.7859342098236084, "loss/dist_ce": 0.0, "loss/hidden": 0.14453125, "loss/idx": 0.0, "loss/logits": 0.04592683166265488, "step": 937 }, { "epoch": 0.007730535615681944, "grad_norm": 2.28125, "grad_norm_var": 6.672985585530599, "learning_rate": 5e-05, "loss": 0.1824, "loss/crossentropy": 2.764125108718872, "loss/dist_ce": 0.0, "loss/hidden": 0.1416015625, "loss/idx": 0.0, "loss/logits": 0.04075000435113907, "step": 938 }, { "epoch": 0.007738777124867106, "grad_norm": 1.8203125, "grad_norm_var": 6.668602498372396, "learning_rate": 5e-05, "loss": 0.1234, "loss/crossentropy": 2.6147968769073486, "loss/dist_ce": 0.0, "loss/hidden": 0.099609375, "loss/idx": 0.0, "loss/logits": 0.023791346698999405, "step": 939 }, { "epoch": 0.007747018634052268, "grad_norm": 1.8828125, "grad_norm_var": 6.377123769124349, "learning_rate": 5e-05, "loss": 0.155, "loss/crossentropy": 1.4540939331054688, "loss/dist_ce": 0.0, "loss/hidden": 0.125, "loss/idx": 0.0, "loss/logits": 0.029985029250383377, "step": 940 }, { "epoch": 0.00775526014323743, "grad_norm": 2.0625, "grad_norm_var": 6.345385487874349, "learning_rate": 5e-05, "loss": 0.1772, "loss/crossentropy": 2.8355026245117188, "loss/dist_ce": 0.0, "loss/hidden": 0.138671875, "loss/idx": 0.0, "loss/logits": 0.038478825241327286, "step": 941 }, { "epoch": 0.0077635016524225916, "grad_norm": 2.6875, "grad_norm_var": 6.287605539957682, "learning_rate": 5e-05, "loss": 0.1997, "loss/crossentropy": 2.347745895385742, "loss/dist_ce": 0.0, "loss/hidden": 0.154296875, "loss/idx": 0.0, "loss/logits": 0.0454033724963665, "step": 942 }, { "epoch": 0.0077717431616077534, "grad_norm": 1.03125, "grad_norm_var": 6.403419748942057, "learning_rate": 5e-05, "loss": 0.098, "loss/crossentropy": 0.4349134564399719, "loss/dist_ce": 0.0, "loss/hidden": 0.087890625, "loss/idx": 0.0, "loss/logits": 0.010073849000036716, "step": 943 }, { "epoch": 0.007779984670792915, "grad_norm": 1.5, "grad_norm_var": 6.464503733317057, "learning_rate": 5e-05, "loss": 0.132, "loss/crossentropy": 2.370471239089966, "loss/dist_ce": 0.0, "loss/hidden": 0.10546875, "loss/idx": 0.0, "loss/logits": 0.02652416005730629, "step": 944 }, { "epoch": 0.007788226179978077, "grad_norm": 2.3125, "grad_norm_var": 0.1785296122233073, "learning_rate": 5e-05, "loss": 0.1954, "loss/crossentropy": 2.6951100826263428, "loss/dist_ce": 0.0, "loss/hidden": 0.1484375, "loss/idx": 0.0, "loss/logits": 0.04693574458360672, "step": 945 }, { "epoch": 0.00779646768916324, "grad_norm": 2.9375, "grad_norm_var": 0.24520670572916667, "learning_rate": 5e-05, "loss": 0.1673, "loss/crossentropy": 1.1609289646148682, "loss/dist_ce": 0.0, "loss/hidden": 0.134765625, "loss/idx": 0.0, "loss/logits": 0.03258271515369415, "step": 946 }, { "epoch": 0.007804709198348402, "grad_norm": 1.8125, "grad_norm_var": 0.2462053934733073, "learning_rate": 5e-05, "loss": 0.1578, "loss/crossentropy": 2.407763957977295, "loss/dist_ce": 0.0, "loss/hidden": 0.12353515625, "loss/idx": 0.0, "loss/logits": 0.034231819212436676, "step": 947 }, { "epoch": 0.007812950707533564, "grad_norm": 1.703125, "grad_norm_var": 0.2494140625, "learning_rate": 5e-05, "loss": 0.155, "loss/crossentropy": 1.5243741273880005, "loss/dist_ce": 0.0, "loss/hidden": 0.126953125, "loss/idx": 0.0, "loss/logits": 0.02800397202372551, "step": 948 }, { "epoch": 0.007821192216718726, "grad_norm": 5.0625, "grad_norm_var": 0.8563189188639323, "learning_rate": 5e-05, "loss": 0.1713, "loss/crossentropy": 1.9425218105316162, "loss/dist_ce": 0.0, "loss/hidden": 0.140625, "loss/idx": 0.0, "loss/logits": 0.03065253421664238, "step": 949 }, { "epoch": 0.007829433725903888, "grad_norm": 3.8125, "grad_norm_var": 1.027972157796224, "learning_rate": 5e-05, "loss": 0.2875, "loss/crossentropy": 2.2058985233306885, "loss/dist_ce": 0.0, "loss/hidden": 0.228515625, "loss/idx": 0.0, "loss/logits": 0.0590139701962471, "step": 950 }, { "epoch": 0.00783767523508905, "grad_norm": 2.171875, "grad_norm_var": 0.9658406575520834, "learning_rate": 5e-05, "loss": 0.2211, "loss/crossentropy": 2.6648082733154297, "loss/dist_ce": 0.0, "loss/hidden": 0.1630859375, "loss/idx": 0.0, "loss/logits": 0.05802769958972931, "step": 951 }, { "epoch": 0.007845916744274211, "grad_norm": 22.875, "grad_norm_var": 27.2247314453125, "learning_rate": 5e-05, "loss": 0.3126, "loss/crossentropy": 2.362283945083618, "loss/dist_ce": 0.0, "loss/hidden": 0.2578125, "loss/idx": 0.0, "loss/logits": 0.05479743331670761, "step": 952 }, { "epoch": 0.007854158253459373, "grad_norm": 1.3046875, "grad_norm_var": 27.4640256245931, "learning_rate": 5e-05, "loss": 0.1007, "loss/crossentropy": 0.38303616642951965, "loss/dist_ce": 0.0, "loss/hidden": 0.08935546875, "loss/idx": 0.0, "loss/logits": 0.011384121142327785, "step": 953 }, { "epoch": 0.007862399762644535, "grad_norm": 1.1484375, "grad_norm_var": 27.740185546875, "learning_rate": 5e-05, "loss": 0.1253, "loss/crossentropy": 1.6580368280410767, "loss/dist_ce": 0.0, "loss/hidden": 0.0986328125, "loss/idx": 0.0, "loss/logits": 0.026700211688876152, "step": 954 }, { "epoch": 0.007870641271829697, "grad_norm": 1.7734375, "grad_norm_var": 27.750869750976562, "learning_rate": 5e-05, "loss": 0.1519, "loss/crossentropy": 2.318824529647827, "loss/dist_ce": 0.0, "loss/hidden": 0.11767578125, "loss/idx": 0.0, "loss/logits": 0.03420557081699371, "step": 955 }, { "epoch": 0.007878882781014859, "grad_norm": 3.03125, "grad_norm_var": 27.58492202758789, "learning_rate": 5e-05, "loss": 0.1829, "loss/crossentropy": 2.9506826400756836, "loss/dist_ce": 0.0, "loss/hidden": 0.140625, "loss/idx": 0.0, "loss/logits": 0.042233243584632874, "step": 956 }, { "epoch": 0.00788712429020002, "grad_norm": 2.75, "grad_norm_var": 27.475665028889974, "learning_rate": 5e-05, "loss": 0.1622, "loss/crossentropy": 1.327090859413147, "loss/dist_ce": 0.0, "loss/hidden": 0.13671875, "loss/idx": 0.0, "loss/logits": 0.02552720718085766, "step": 957 }, { "epoch": 0.007895365799385183, "grad_norm": 2.046875, "grad_norm_var": 27.580934397379558, "learning_rate": 5e-05, "loss": 0.1882, "loss/crossentropy": 2.5473806858062744, "loss/dist_ce": 0.0, "loss/hidden": 0.1455078125, "loss/idx": 0.0, "loss/logits": 0.04271348565816879, "step": 958 }, { "epoch": 0.007903607308570345, "grad_norm": 1.4453125, "grad_norm_var": 27.450960286458333, "learning_rate": 5e-05, "loss": 0.1266, "loss/crossentropy": 1.5830978155136108, "loss/dist_ce": 0.0, "loss/hidden": 0.10400390625, "loss/idx": 0.0, "loss/logits": 0.022615976631641388, "step": 959 }, { "epoch": 0.007911848817755508, "grad_norm": 2.53125, "grad_norm_var": 27.227925618489582, "learning_rate": 5e-05, "loss": 0.1684, "loss/crossentropy": 2.8910608291625977, "loss/dist_ce": 0.0, "loss/hidden": 0.1298828125, "loss/idx": 0.0, "loss/logits": 0.03847365081310272, "step": 960 }, { "epoch": 0.00792009032694067, "grad_norm": 2.109375, "grad_norm_var": 27.26726786295573, "learning_rate": 5e-05, "loss": 0.1292, "loss/crossentropy": 1.506900429725647, "loss/dist_ce": 0.0, "loss/hidden": 0.111328125, "loss/idx": 0.0, "loss/logits": 0.017824511975049973, "step": 961 }, { "epoch": 0.007928331836125832, "grad_norm": 5.3125, "grad_norm_var": 27.391893513997395, "learning_rate": 5e-05, "loss": 0.3447, "loss/crossentropy": 2.996657133102417, "loss/dist_ce": 0.0, "loss/hidden": 0.2734375, "loss/idx": 0.0, "loss/logits": 0.07124556601047516, "step": 962 }, { "epoch": 0.007936573345310994, "grad_norm": 1.703125, "grad_norm_var": 27.421708170572916, "learning_rate": 5e-05, "loss": 0.1274, "loss/crossentropy": 1.4830890893936157, "loss/dist_ce": 0.0, "loss/hidden": 0.1064453125, "loss/idx": 0.0, "loss/logits": 0.02091062068939209, "step": 963 }, { "epoch": 0.007944814854496156, "grad_norm": 1.0390625, "grad_norm_var": 27.6348264058431, "learning_rate": 5e-05, "loss": 0.0988, "loss/crossentropy": 0.4731054902076721, "loss/dist_ce": 0.0, "loss/hidden": 0.08740234375, "loss/idx": 0.0, "loss/logits": 0.011356725357472897, "step": 964 }, { "epoch": 0.007953056363681318, "grad_norm": 3.203125, "grad_norm_var": 27.5273312886556, "learning_rate": 5e-05, "loss": 0.2465, "loss/crossentropy": 2.7651779651641846, "loss/dist_ce": 0.0, "loss/hidden": 0.203125, "loss/idx": 0.0, "loss/logits": 0.043382175266742706, "step": 965 }, { "epoch": 0.00796129787286648, "grad_norm": 2.953125, "grad_norm_var": 27.553851064046224, "learning_rate": 5e-05, "loss": 0.2593, "loss/crossentropy": 1.5127090215682983, "loss/dist_ce": 0.0, "loss/hidden": 0.212890625, "loss/idx": 0.0, "loss/logits": 0.04636671021580696, "step": 966 }, { "epoch": 0.007969539382051642, "grad_norm": 3.0, "grad_norm_var": 27.44041519165039, "learning_rate": 5e-05, "loss": 0.1922, "loss/crossentropy": 1.4316555261611938, "loss/dist_ce": 0.0, "loss/hidden": 0.16796875, "loss/idx": 0.0, "loss/logits": 0.024203313514590263, "step": 967 }, { "epoch": 0.007977780891236803, "grad_norm": 1.6875, "grad_norm_var": 1.1560523986816407, "learning_rate": 5e-05, "loss": 0.1196, "loss/crossentropy": 1.2071629762649536, "loss/dist_ce": 0.0, "loss/hidden": 0.1025390625, "loss/idx": 0.0, "loss/logits": 0.017063483595848083, "step": 968 }, { "epoch": 0.007986022400421965, "grad_norm": 2.375, "grad_norm_var": 1.0834788004557292, "learning_rate": 5e-05, "loss": 0.1698, "loss/crossentropy": 2.020332098007202, "loss/dist_ce": 0.0, "loss/hidden": 0.1396484375, "loss/idx": 0.0, "loss/logits": 0.030112620443105698, "step": 969 }, { "epoch": 0.007994263909607127, "grad_norm": 3.421875, "grad_norm_var": 1.0326372782389324, "learning_rate": 5e-05, "loss": 0.2121, "loss/crossentropy": 1.4469772577285767, "loss/dist_ce": 0.0, "loss/hidden": 0.1767578125, "loss/idx": 0.0, "loss/logits": 0.03534634783864021, "step": 970 }, { "epoch": 0.008002505418792289, "grad_norm": 2.8125, "grad_norm_var": 0.9961415608723958, "learning_rate": 5e-05, "loss": 0.198, "loss/crossentropy": 2.7077815532684326, "loss/dist_ce": 0.0, "loss/hidden": 0.15234375, "loss/idx": 0.0, "loss/logits": 0.045610323548316956, "step": 971 }, { "epoch": 0.008010746927977451, "grad_norm": 2.21875, "grad_norm_var": 0.9894765218098959, "learning_rate": 5e-05, "loss": 0.1874, "loss/crossentropy": 1.9022347927093506, "loss/dist_ce": 0.0, "loss/hidden": 0.154296875, "loss/idx": 0.0, "loss/logits": 0.033076584339141846, "step": 972 }, { "epoch": 0.008018988437162613, "grad_norm": 2.703125, "grad_norm_var": 0.9882893880208333, "learning_rate": 5e-05, "loss": 0.1256, "loss/crossentropy": 0.7915277481079102, "loss/dist_ce": 0.0, "loss/hidden": 0.11181640625, "loss/idx": 0.0, "loss/logits": 0.013769976794719696, "step": 973 }, { "epoch": 0.008027229946347775, "grad_norm": 4.875, "grad_norm_var": 1.3040598551432292, "learning_rate": 5e-05, "loss": 0.3439, "loss/crossentropy": 2.4664230346679688, "loss/dist_ce": 0.0, "loss/hidden": 0.28515625, "loss/idx": 0.0, "loss/logits": 0.05875328183174133, "step": 974 }, { "epoch": 0.008035471455532937, "grad_norm": 1.609375, "grad_norm_var": 1.2780352274576823, "learning_rate": 5e-05, "loss": 0.1361, "loss/crossentropy": 2.0170304775238037, "loss/dist_ce": 0.0, "loss/hidden": 0.111328125, "loss/idx": 0.0, "loss/logits": 0.024749569594860077, "step": 975 }, { "epoch": 0.008043712964718099, "grad_norm": 2.078125, "grad_norm_var": 1.3024024963378906, "learning_rate": 5e-05, "loss": 0.1647, "loss/crossentropy": 2.516977310180664, "loss/dist_ce": 0.0, "loss/hidden": 0.130859375, "loss/idx": 0.0, "loss/logits": 0.033810123801231384, "step": 976 }, { "epoch": 0.00805195447390326, "grad_norm": 2.421875, "grad_norm_var": 1.2841529846191406, "learning_rate": 5e-05, "loss": 0.1797, "loss/crossentropy": 1.6354763507843018, "loss/dist_ce": 0.0, "loss/hidden": 0.142578125, "loss/idx": 0.0, "loss/logits": 0.03713398799300194, "step": 977 }, { "epoch": 0.008060195983088422, "grad_norm": 2.609375, "grad_norm_var": 0.8040667215983073, "learning_rate": 5e-05, "loss": 0.1307, "loss/crossentropy": 2.52553653717041, "loss/dist_ce": 0.0, "loss/hidden": 0.10546875, "loss/idx": 0.0, "loss/logits": 0.025194775313138962, "step": 978 }, { "epoch": 0.008068437492273586, "grad_norm": 1.15625, "grad_norm_var": 0.8841041564941406, "learning_rate": 5e-05, "loss": 0.1438, "loss/crossentropy": 2.593212127685547, "loss/dist_ce": 0.0, "loss/hidden": 0.11328125, "loss/idx": 0.0, "loss/logits": 0.030498359352350235, "step": 979 }, { "epoch": 0.008076679001458748, "grad_norm": 1.3203125, "grad_norm_var": 0.8338783264160157, "learning_rate": 5e-05, "loss": 0.1281, "loss/crossentropy": 2.2054710388183594, "loss/dist_ce": 0.0, "loss/hidden": 0.10107421875, "loss/idx": 0.0, "loss/logits": 0.02705022320151329, "step": 980 }, { "epoch": 0.00808492051064391, "grad_norm": 1.390625, "grad_norm_var": 0.8760047912597656, "learning_rate": 5e-05, "loss": 0.1358, "loss/crossentropy": 1.5589760541915894, "loss/dist_ce": 0.0, "loss/hidden": 0.1103515625, "loss/idx": 0.0, "loss/logits": 0.025421902537345886, "step": 981 }, { "epoch": 0.008093162019829072, "grad_norm": 2.4375, "grad_norm_var": 0.8555946350097656, "learning_rate": 5e-05, "loss": 0.1685, "loss/crossentropy": 2.5976901054382324, "loss/dist_ce": 0.0, "loss/hidden": 0.130859375, "loss/idx": 0.0, "loss/logits": 0.03766857087612152, "step": 982 }, { "epoch": 0.008101403529014234, "grad_norm": 1.703125, "grad_norm_var": 0.853905995686849, "learning_rate": 5e-05, "loss": 0.1405, "loss/crossentropy": 1.4126673936843872, "loss/dist_ce": 0.0, "loss/hidden": 0.115234375, "loss/idx": 0.0, "loss/logits": 0.025234002619981766, "step": 983 }, { "epoch": 0.008109645038199395, "grad_norm": 3.0625, "grad_norm_var": 0.859545644124349, "learning_rate": 5e-05, "loss": 0.1731, "loss/crossentropy": 1.74605131149292, "loss/dist_ce": 0.0, "loss/hidden": 0.1376953125, "loss/idx": 0.0, "loss/logits": 0.035446591675281525, "step": 984 }, { "epoch": 0.008117886547384557, "grad_norm": 2.90625, "grad_norm_var": 0.8763201395670573, "learning_rate": 5e-05, "loss": 0.124, "loss/crossentropy": 1.0640939474105835, "loss/dist_ce": 0.0, "loss/hidden": 0.1044921875, "loss/idx": 0.0, "loss/logits": 0.019489990547299385, "step": 985 }, { "epoch": 0.00812612805656972, "grad_norm": 1.6171875, "grad_norm_var": 0.8388987223307292, "learning_rate": 5e-05, "loss": 0.142, "loss/crossentropy": 1.6699655055999756, "loss/dist_ce": 0.0, "loss/hidden": 0.11474609375, "loss/idx": 0.0, "loss/logits": 0.02729114145040512, "step": 986 }, { "epoch": 0.008134369565754881, "grad_norm": 2.40625, "grad_norm_var": 0.8218658447265625, "learning_rate": 5e-05, "loss": 0.1273, "loss/crossentropy": 0.9518370032310486, "loss/dist_ce": 0.0, "loss/hidden": 0.10693359375, "loss/idx": 0.0, "loss/logits": 0.020414654165506363, "step": 987 }, { "epoch": 0.008142611074940043, "grad_norm": 1.9453125, "grad_norm_var": 0.8288530985514323, "learning_rate": 5e-05, "loss": 0.1543, "loss/crossentropy": 2.4604508876800537, "loss/dist_ce": 0.0, "loss/hidden": 0.1220703125, "loss/idx": 0.0, "loss/logits": 0.03225576505064964, "step": 988 }, { "epoch": 0.008150852584125205, "grad_norm": 2.46875, "grad_norm_var": 0.8185991923014323, "learning_rate": 5e-05, "loss": 0.1825, "loss/crossentropy": 2.5151901245117188, "loss/dist_ce": 0.0, "loss/hidden": 0.140625, "loss/idx": 0.0, "loss/logits": 0.04188704490661621, "step": 989 }, { "epoch": 0.008159094093310367, "grad_norm": 3.8125, "grad_norm_var": 0.5173500061035157, "learning_rate": 5e-05, "loss": 0.1537, "loss/crossentropy": 2.6265015602111816, "loss/dist_ce": 0.0, "loss/hidden": 0.119140625, "loss/idx": 0.0, "loss/logits": 0.03460276871919632, "step": 990 }, { "epoch": 0.008167335602495529, "grad_norm": 1.4921875, "grad_norm_var": 0.5271881103515625, "learning_rate": 5e-05, "loss": 0.1395, "loss/crossentropy": 2.9253175258636475, "loss/dist_ce": 0.0, "loss/hidden": 0.109375, "loss/idx": 0.0, "loss/logits": 0.030113544315099716, "step": 991 }, { "epoch": 0.00817557711168069, "grad_norm": 1.3828125, "grad_norm_var": 0.5665484110514323, "learning_rate": 5e-05, "loss": 0.1456, "loss/crossentropy": 2.487765073776245, "loss/dist_ce": 0.0, "loss/hidden": 0.11328125, "loss/idx": 0.0, "loss/logits": 0.032366957515478134, "step": 992 }, { "epoch": 0.008183818620865852, "grad_norm": 1.796875, "grad_norm_var": 0.5669146219889323, "learning_rate": 5e-05, "loss": 0.1707, "loss/crossentropy": 2.329315185546875, "loss/dist_ce": 0.0, "loss/hidden": 0.13671875, "loss/idx": 0.0, "loss/logits": 0.03394667059183121, "step": 993 }, { "epoch": 0.008192060130051014, "grad_norm": 3.78125, "grad_norm_var": 0.7332354227701823, "learning_rate": 5e-05, "loss": 0.1519, "loss/crossentropy": 2.148042917251587, "loss/dist_ce": 0.0, "loss/hidden": 0.1220703125, "loss/idx": 0.0, "loss/logits": 0.029824528843164444, "step": 994 }, { "epoch": 0.008200301639236176, "grad_norm": 25.625, "grad_norm_var": 34.854078928629555, "learning_rate": 5e-05, "loss": 0.334, "loss/crossentropy": 1.8628984689712524, "loss/dist_ce": 0.0, "loss/hidden": 0.27734375, "loss/idx": 0.0, "loss/logits": 0.05670515447854996, "step": 995 }, { "epoch": 0.008208543148421338, "grad_norm": 1.4375, "grad_norm_var": 34.81780497233073, "learning_rate": 5e-05, "loss": 0.1181, "loss/crossentropy": 2.3790695667266846, "loss/dist_ce": 0.0, "loss/hidden": 0.0947265625, "loss/idx": 0.0, "loss/logits": 0.02332988940179348, "step": 996 }, { "epoch": 0.0082167846576065, "grad_norm": 1.2734375, "grad_norm_var": 34.854811350504555, "learning_rate": 5e-05, "loss": 0.1176, "loss/crossentropy": 1.283698320388794, "loss/dist_ce": 0.0, "loss/hidden": 0.1005859375, "loss/idx": 0.0, "loss/logits": 0.016972240060567856, "step": 997 }, { "epoch": 0.008225026166791664, "grad_norm": 2.640625, "grad_norm_var": 34.82328465779622, "learning_rate": 5e-05, "loss": 0.1779, "loss/crossentropy": 2.7694525718688965, "loss/dist_ce": 0.0, "loss/hidden": 0.1337890625, "loss/idx": 0.0, "loss/logits": 0.04413297772407532, "step": 998 }, { "epoch": 0.008233267675976826, "grad_norm": 1.8828125, "grad_norm_var": 34.77723388671875, "learning_rate": 5e-05, "loss": 0.1636, "loss/crossentropy": 2.530651330947876, "loss/dist_ce": 0.0, "loss/hidden": 0.1298828125, "loss/idx": 0.0, "loss/logits": 0.0337049663066864, "step": 999 }, { "epoch": 0.008241509185161987, "grad_norm": 5.5625, "grad_norm_var": 34.94845784505208, "learning_rate": 5e-05, "loss": 0.278, "loss/crossentropy": 1.1534559726715088, "loss/dist_ce": 0.0, "loss/hidden": 0.2333984375, "loss/idx": 0.0, "loss/logits": 0.04459930956363678, "step": 1000 } ], "logging_steps": 1, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.8956539674624e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }