| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.008241509185161987, |
| "eval_steps": 2000, |
| "global_step": 1000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 8.241509185161987e-06, |
| "grad_norm": 780.0, |
| "learning_rate": 5e-05, |
| "loss": 22.7489, |
| "loss/crossentropy": 8.68287467956543, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 16.25, |
| "loss/idx": 0.0, |
| "loss/logits": 6.498888969421387, |
| "step": 1 |
| }, |
| { |
| "epoch": 1.6483018370323974e-05, |
| "grad_norm": 824.0, |
| "learning_rate": 5e-05, |
| "loss": 18.5076, |
| "loss/crossentropy": 8.787271499633789, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 12.625, |
| "loss/idx": 0.0, |
| "loss/logits": 5.8825788497924805, |
| "step": 2 |
| }, |
| { |
| "epoch": 2.472452755548596e-05, |
| "grad_norm": 466.0, |
| "learning_rate": 5e-05, |
| "loss": 12.0241, |
| "loss/crossentropy": 7.810218334197998, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 7.21875, |
| "loss/idx": 0.0, |
| "loss/logits": 4.805373191833496, |
| "step": 3 |
| }, |
| { |
| "epoch": 3.296603674064795e-05, |
| "grad_norm": 215.0, |
| "learning_rate": 5e-05, |
| "loss": 9.4126, |
| "loss/crossentropy": 6.043552398681641, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 6.25, |
| "loss/idx": 0.0, |
| "loss/logits": 3.1625852584838867, |
| "step": 4 |
| }, |
| { |
| "epoch": 4.1207545925809937e-05, |
| "grad_norm": 468.0, |
| "learning_rate": 5e-05, |
| "loss": 6.2853, |
| "loss/crossentropy": 4.783352851867676, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 4.21875, |
| "loss/idx": 0.0, |
| "loss/logits": 2.066528558731079, |
| "step": 5 |
| }, |
| { |
| "epoch": 4.944905511097192e-05, |
| "grad_norm": 306.0, |
| "learning_rate": 5e-05, |
| "loss": 5.4625, |
| "loss/crossentropy": 1.6133296489715576, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 4.1875, |
| "loss/idx": 0.0, |
| "loss/logits": 1.2749497890472412, |
| "step": 6 |
| }, |
| { |
| "epoch": 5.769056429613391e-05, |
| "grad_norm": 217.0, |
| "learning_rate": 5e-05, |
| "loss": 8.1947, |
| "loss/crossentropy": 4.63270378112793, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 5.78125, |
| "loss/idx": 0.0, |
| "loss/logits": 2.4134607315063477, |
| "step": 7 |
| }, |
| { |
| "epoch": 6.59320734812959e-05, |
| "grad_norm": 404.0, |
| "learning_rate": 5e-05, |
| "loss": 5.0477, |
| "loss/crossentropy": 4.424153804779053, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 3.53125, |
| "loss/idx": 0.0, |
| "loss/logits": 1.5164613723754883, |
| "step": 8 |
| }, |
| { |
| "epoch": 7.417358266645788e-05, |
| "grad_norm": 83.5, |
| "learning_rate": 5e-05, |
| "loss": 3.1549, |
| "loss/crossentropy": 3.354282855987549, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 2.296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.8579829931259155, |
| "step": 9 |
| }, |
| { |
| "epoch": 8.241509185161987e-05, |
| "grad_norm": 115.5, |
| "learning_rate": 5e-05, |
| "loss": 3.1588, |
| "loss/crossentropy": 3.1871225833892822, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 2.375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.7837648391723633, |
| "step": 10 |
| }, |
| { |
| "epoch": 9.065660103678186e-05, |
| "grad_norm": 252.0, |
| "learning_rate": 5e-05, |
| "loss": 7.2603, |
| "loss/crossentropy": 4.682134628295898, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 5.0, |
| "loss/idx": 0.0, |
| "loss/logits": 2.2602720260620117, |
| "step": 11 |
| }, |
| { |
| "epoch": 9.889811022194384e-05, |
| "grad_norm": 109.0, |
| "learning_rate": 5e-05, |
| "loss": 3.1302, |
| "loss/crossentropy": 2.417746067047119, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 2.46875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.661416232585907, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.00010713961940710583, |
| "grad_norm": 68.5, |
| "learning_rate": 5e-05, |
| "loss": 2.4003, |
| "loss/crossentropy": 1.6968345642089844, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 1.984375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.4158973693847656, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.00011538112859226781, |
| "grad_norm": 454.0, |
| "learning_rate": 5e-05, |
| "loss": 7.3347, |
| "loss/crossentropy": 4.652151584625244, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 4.25, |
| "loss/idx": 0.0, |
| "loss/logits": 3.084686756134033, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.00012362263777742982, |
| "grad_norm": 126.5, |
| "learning_rate": 5e-05, |
| "loss": 2.4695, |
| "loss/crossentropy": 3.0716333389282227, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 1.875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.594476580619812, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.0001318641469625918, |
| "grad_norm": 306.0, |
| "grad_norm_var": 53540.9625, |
| "learning_rate": 5e-05, |
| "loss": 3.9132, |
| "loss/crossentropy": 2.430070638656616, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 2.9375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.9756777882575989, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.00014010565614775377, |
| "grad_norm": 68.5, |
| "grad_norm_var": 41986.49895833333, |
| "learning_rate": 5e-05, |
| "loss": 2.3829, |
| "loss/crossentropy": 1.8029091358184814, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 1.8828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.5000446438789368, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.00014834716533291577, |
| "grad_norm": 139.0, |
| "grad_norm_var": 21647.707291666666, |
| "learning_rate": 5e-05, |
| "loss": 2.5801, |
| "loss/crossentropy": 1.5956979990005493, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 2.15625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.4238685965538025, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.00015658867451807774, |
| "grad_norm": 28.625, |
| "grad_norm_var": 20272.937434895834, |
| "learning_rate": 5e-05, |
| "loss": 1.6796, |
| "loss/crossentropy": 2.664867401123047, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 1.296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.3827553689479828, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.00016483018370323975, |
| "grad_norm": 81.5, |
| "grad_norm_var": 21299.079622395835, |
| "learning_rate": 5e-05, |
| "loss": 1.8862, |
| "loss/crossentropy": 3.0564301013946533, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 1.4609375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.42524370551109314, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.00017307169288840172, |
| "grad_norm": 71.5, |
| "grad_norm_var": 17047.856184895834, |
| "learning_rate": 5e-05, |
| "loss": 1.8175, |
| "loss/crossentropy": 1.5220972299575806, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 1.578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.23937611281871796, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.00018131320207356372, |
| "grad_norm": 32.25, |
| "grad_norm_var": 17021.051497395834, |
| "learning_rate": 5e-05, |
| "loss": 1.636, |
| "loss/crossentropy": 1.8798402547836304, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 1.3125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.32345157861709595, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.0001895547112587257, |
| "grad_norm": 31.25, |
| "grad_norm_var": 17761.729622395833, |
| "learning_rate": 5e-05, |
| "loss": 1.5306, |
| "loss/crossentropy": 3.0712087154388428, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 1.171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.3587738275527954, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.00019779622044388767, |
| "grad_norm": 13.375, |
| "grad_norm_var": 13976.939583333333, |
| "learning_rate": 5e-05, |
| "loss": 1.0326, |
| "loss/crossentropy": 2.2200183868408203, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.8125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.2200760841369629, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.00020603772962904968, |
| "grad_norm": 28.125, |
| "grad_norm_var": 14466.229622395833, |
| "learning_rate": 5e-05, |
| "loss": 1.509, |
| "loss/crossentropy": 3.206345319747925, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 1.1484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.3605613112449646, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.00021427923881421165, |
| "grad_norm": 153.0, |
| "grad_norm_var": 14529.862434895833, |
| "learning_rate": 5e-05, |
| "loss": 2.1474, |
| "loss/crossentropy": 1.5313490629196167, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 1.765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.38178950548171997, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.00022252074799937365, |
| "grad_norm": 94.0, |
| "grad_norm_var": 13366.093684895834, |
| "learning_rate": 5e-05, |
| "loss": 3.417, |
| "loss/crossentropy": 1.551514744758606, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 2.75, |
| "loss/idx": 0.0, |
| "loss/logits": 0.6670438051223755, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.00023076225718453563, |
| "grad_norm": 268.0, |
| "grad_norm_var": 14865.165559895833, |
| "learning_rate": 5e-05, |
| "loss": 1.9003, |
| "loss/crossentropy": 3.108414649963379, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 1.421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.4784301221370697, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.00023900376636969763, |
| "grad_norm": 34.25, |
| "grad_norm_var": 15186.259309895833, |
| "learning_rate": 5e-05, |
| "loss": 1.0626, |
| "loss/crossentropy": 3.3259568214416504, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.81640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.24623815715312958, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.00024724527555485963, |
| "grad_norm": 167.0, |
| "grad_norm_var": 7576.8728515625, |
| "learning_rate": 5e-05, |
| "loss": 1.4178, |
| "loss/crossentropy": 1.5920592546463013, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 1.28125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.13655498623847961, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.0002554867847400216, |
| "grad_norm": 12.6875, |
| "grad_norm_var": 8024.979931640625, |
| "learning_rate": 5e-05, |
| "loss": 0.9625, |
| "loss/crossentropy": 2.868499517440796, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.7421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.2203603982925415, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.0002637282939251836, |
| "grad_norm": 49.5, |
| "grad_norm_var": 4940.166650390625, |
| "learning_rate": 5e-05, |
| "loss": 1.0668, |
| "loss/crossentropy": 2.660956859588623, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.8359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.23084667325019836, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.0002719698031103456, |
| "grad_norm": 18.875, |
| "grad_norm_var": 5167.097639973958, |
| "learning_rate": 5e-05, |
| "loss": 0.9463, |
| "loss/crossentropy": 1.6037225723266602, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.8125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.13378173112869263, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.00028021131229550753, |
| "grad_norm": 18.75, |
| "grad_norm_var": 5067.703499348959, |
| "learning_rate": 5e-05, |
| "loss": 1.0141, |
| "loss/crossentropy": 1.0409276485443115, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.84375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.1703900545835495, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.00028845282148066954, |
| "grad_norm": 16.5, |
| "grad_norm_var": 5142.032275390625, |
| "learning_rate": 5e-05, |
| "loss": 0.9889, |
| "loss/crossentropy": 1.5536582469940186, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.8359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.1529390513896942, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.00029669433066583154, |
| "grad_norm": 9.8125, |
| "grad_norm_var": 5335.719205729167, |
| "learning_rate": 5e-05, |
| "loss": 0.8687, |
| "loss/crossentropy": 2.7224836349487305, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.19684143364429474, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.00030493583985099354, |
| "grad_norm": 7.125, |
| "grad_norm_var": 5527.603645833334, |
| "learning_rate": 5e-05, |
| "loss": 0.5503, |
| "loss/crossentropy": 2.5596024990081787, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.4296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.12058012187480927, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.0003131773490361555, |
| "grad_norm": 13.1875, |
| "grad_norm_var": 5619.972379557292, |
| "learning_rate": 5e-05, |
| "loss": 0.8248, |
| "loss/crossentropy": 2.8074352741241455, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.62109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.2037278115749359, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.0003214188582213175, |
| "grad_norm": 37.75, |
| "grad_norm_var": 5599.026806640625, |
| "learning_rate": 5e-05, |
| "loss": 1.3462, |
| "loss/crossentropy": 1.5375018119812012, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 1.15625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.18997883796691895, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.0003296603674064795, |
| "grad_norm": 7.28125, |
| "grad_norm_var": 5638.313244628906, |
| "learning_rate": 5e-05, |
| "loss": 0.6618, |
| "loss/crossentropy": 2.4615395069122314, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.51171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.15010175108909607, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.00033790187659164144, |
| "grad_norm": 68.5, |
| "grad_norm_var": 5576.730855305989, |
| "learning_rate": 5e-05, |
| "loss": 0.7963, |
| "loss/crossentropy": 1.1309521198272705, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.6796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.11660157144069672, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.00034614338577680344, |
| "grad_norm": 10.0625, |
| "grad_norm_var": 5100.57030843099, |
| "learning_rate": 5e-05, |
| "loss": 0.6657, |
| "loss/crossentropy": 2.225135326385498, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.54296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.12272368371486664, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.00035438489496196544, |
| "grad_norm": 15.125, |
| "grad_norm_var": 5048.541564941406, |
| "learning_rate": 5e-05, |
| "loss": 0.7119, |
| "loss/crossentropy": 0.907244861125946, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.6328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07912808656692505, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.00036262640414712745, |
| "grad_norm": 11.25, |
| "grad_norm_var": 1608.158426920573, |
| "learning_rate": 5e-05, |
| "loss": 0.757, |
| "loss/crossentropy": 1.7073473930358887, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.62109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.1358700692653656, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.0003708679133322894, |
| "grad_norm": 8.3125, |
| "grad_norm_var": 1639.323954264323, |
| "learning_rate": 5e-05, |
| "loss": 0.7863, |
| "loss/crossentropy": 2.7458887100219727, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.62890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.15743763744831085, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.0003791094225174514, |
| "grad_norm": 7.15625, |
| "grad_norm_var": 305.3570963541667, |
| "learning_rate": 5e-05, |
| "loss": 0.7888, |
| "loss/crossentropy": 3.2708899974823, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.58984375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.19896197319030762, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.0003873509317026134, |
| "grad_norm": 95.5, |
| "grad_norm_var": 658.8413899739584, |
| "learning_rate": 5e-05, |
| "loss": 1.2412, |
| "loss/crossentropy": 2.0868113040924072, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 1.0, |
| "loss/idx": 0.0, |
| "loss/logits": 0.24115484952926636, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.00039559244088777535, |
| "grad_norm": 7.53125, |
| "grad_norm_var": 629.9714803059895, |
| "learning_rate": 5e-05, |
| "loss": 0.7909, |
| "loss/crossentropy": 2.5569632053375244, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.6171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.17373064160346985, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.00040383395007293735, |
| "grad_norm": 7.125, |
| "grad_norm_var": 643.5665974934896, |
| "learning_rate": 5e-05, |
| "loss": 0.5926, |
| "loss/crossentropy": 1.3575685024261475, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.486328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.10628513246774673, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.00041207545925809935, |
| "grad_norm": 14.0625, |
| "grad_norm_var": 646.5402303059896, |
| "learning_rate": 5e-05, |
| "loss": 0.5641, |
| "loss/crossentropy": 1.111220359802246, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.486328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07779324799776077, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.00042031696844326135, |
| "grad_norm": 5.375, |
| "grad_norm_var": 660.9766560872396, |
| "learning_rate": 5e-05, |
| "loss": 0.622, |
| "loss/crossentropy": 2.907522678375244, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.498046875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.12399697303771973, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.0004285584776284233, |
| "grad_norm": 9.75, |
| "grad_norm_var": 661.0644816080729, |
| "learning_rate": 5e-05, |
| "loss": 0.5746, |
| "loss/crossentropy": 2.72662353515625, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.4453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.12924730777740479, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.0004367999868135853, |
| "grad_norm": 4.9375, |
| "grad_norm_var": 665.2116170247396, |
| "learning_rate": 5e-05, |
| "loss": 0.6226, |
| "loss/crossentropy": 2.3917365074157715, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.498046875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.1245586946606636, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.0004450414959987473, |
| "grad_norm": 28.0, |
| "grad_norm_var": 665.1113240559896, |
| "learning_rate": 5e-05, |
| "loss": 0.8407, |
| "loss/crossentropy": 2.7228264808654785, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.65234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.18836885690689087, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.00045328300518390925, |
| "grad_norm": 32.5, |
| "grad_norm_var": 655.1841756184896, |
| "learning_rate": 5e-05, |
| "loss": 0.6963, |
| "loss/crossentropy": 2.543640375137329, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.56640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.12986385822296143, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.00046152451436907126, |
| "grad_norm": 16.875, |
| "grad_norm_var": 643.6704264322917, |
| "learning_rate": 5e-05, |
| "loss": 0.7344, |
| "loss/crossentropy": 1.649795413017273, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.609375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.12504054605960846, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.00046976602355423326, |
| "grad_norm": 28.0, |
| "grad_norm_var": 491.7321451822917, |
| "learning_rate": 5e-05, |
| "loss": 0.7995, |
| "loss/crossentropy": 1.5515227317810059, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.6796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.11983367055654526, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.00047800753273939526, |
| "grad_norm": 4.15625, |
| "grad_norm_var": 500.8306925455729, |
| "learning_rate": 5e-05, |
| "loss": 0.4793, |
| "loss/crossentropy": 1.7439237833023071, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.388671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.09059557318687439, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.0004862490419245572, |
| "grad_norm": 14.5, |
| "grad_norm_var": 501.1345662434896, |
| "learning_rate": 5e-05, |
| "loss": 1.0179, |
| "loss/crossentropy": 1.387863039970398, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.8828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.1351165473461151, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.0004944905511097193, |
| "grad_norm": 8.0625, |
| "grad_norm_var": 504.82509358723956, |
| "learning_rate": 5e-05, |
| "loss": 0.5969, |
| "loss/crossentropy": 1.6710844039916992, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.490234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.10665580630302429, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.0005027320602948812, |
| "grad_norm": 18.0, |
| "grad_norm_var": 497.86724853515625, |
| "learning_rate": 5e-05, |
| "loss": 0.5408, |
| "loss/crossentropy": 1.0266728401184082, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.47265625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.068178191781044, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.0005109735694800432, |
| "grad_norm": 7.46875, |
| "grad_norm_var": 497.38629150390625, |
| "learning_rate": 5e-05, |
| "loss": 0.8584, |
| "loss/crossentropy": 2.9015908241271973, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.6796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.1787503957748413, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.0005192150786652052, |
| "grad_norm": 7.84375, |
| "grad_norm_var": 81.943603515625, |
| "learning_rate": 5e-05, |
| "loss": 0.4871, |
| "loss/crossentropy": 1.605463981628418, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.40234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.08476820588111877, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.0005274565878503672, |
| "grad_norm": 6.3125, |
| "grad_norm_var": 82.98795166015626, |
| "learning_rate": 5e-05, |
| "loss": 0.4591, |
| "loss/crossentropy": 1.7012584209442139, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.373046875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0860566645860672, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.0005356980970355292, |
| "grad_norm": 8.625, |
| "grad_norm_var": 81.89146728515625, |
| "learning_rate": 5e-05, |
| "loss": 0.513, |
| "loss/crossentropy": 1.6209317445755005, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.416015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.09696009755134583, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.0005439396062206912, |
| "grad_norm": 5.0, |
| "grad_norm_var": 86.22919514973958, |
| "learning_rate": 5e-05, |
| "loss": 0.4936, |
| "loss/crossentropy": 2.992037773132324, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.37890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.114667147397995, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.0005521811154058532, |
| "grad_norm": 10.5625, |
| "grad_norm_var": 82.74924723307292, |
| "learning_rate": 5e-05, |
| "loss": 0.9616, |
| "loss/crossentropy": 2.2757253646850586, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.75, |
| "loss/idx": 0.0, |
| "loss/logits": 0.21162353456020355, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.0005604226245910151, |
| "grad_norm": 6.0625, |
| "grad_norm_var": 85.27672119140625, |
| "learning_rate": 5e-05, |
| "loss": 0.3899, |
| "loss/crossentropy": 0.7420970797538757, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.34765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04225603863596916, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.0005686641337761771, |
| "grad_norm": 4.78125, |
| "grad_norm_var": 85.44479166666666, |
| "learning_rate": 5e-05, |
| "loss": 0.3729, |
| "loss/crossentropy": 1.957132339477539, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07599128782749176, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.0005769056429613391, |
| "grad_norm": 8.875, |
| "grad_norm_var": 69.85592447916666, |
| "learning_rate": 5e-05, |
| "loss": 0.5605, |
| "loss/crossentropy": 2.908198356628418, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.4453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.1151949018239975, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.0005851471521465011, |
| "grad_norm": 6.125, |
| "grad_norm_var": 40.280208333333334, |
| "learning_rate": 5e-05, |
| "loss": 0.4273, |
| "loss/crossentropy": 1.8814876079559326, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.33984375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.08749841153621674, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.0005933886613316631, |
| "grad_norm": 3.625, |
| "grad_norm_var": 39.245052083333334, |
| "learning_rate": 5e-05, |
| "loss": 0.3375, |
| "loss/crossentropy": 1.5110681056976318, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.275390625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.062085069715976715, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.0006016301705168251, |
| "grad_norm": 4.09375, |
| "grad_norm_var": 15.198726399739583, |
| "learning_rate": 5e-05, |
| "loss": 0.3922, |
| "loss/crossentropy": 2.652179479598999, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.306640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.085569366812706, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.0006098716797019871, |
| "grad_norm": 12.0, |
| "grad_norm_var": 15.279410807291667, |
| "learning_rate": 5e-05, |
| "loss": 0.7232, |
| "loss/crossentropy": 1.3632968664169312, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.09823663532733917, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.000618113188887149, |
| "grad_norm": 3.15625, |
| "grad_norm_var": 13.862919108072917, |
| "learning_rate": 5e-05, |
| "loss": 0.3533, |
| "loss/crossentropy": 1.3768854141235352, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.30078125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05253131687641144, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.000626354698072311, |
| "grad_norm": 4.5, |
| "grad_norm_var": 14.406571451822916, |
| "learning_rate": 5e-05, |
| "loss": 0.5053, |
| "loss/crossentropy": 2.9618265628814697, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.373046875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.13229554891586304, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.000634596207257473, |
| "grad_norm": 3.96875, |
| "grad_norm_var": 6.720442708333334, |
| "learning_rate": 5e-05, |
| "loss": 0.4483, |
| "loss/crossentropy": 1.52455472946167, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.3828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.065483957529068, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.000642837716442635, |
| "grad_norm": 3.40625, |
| "grad_norm_var": 7.193343098958334, |
| "learning_rate": 5e-05, |
| "loss": 0.3286, |
| "loss/crossentropy": 2.478041648864746, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.255859375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0727752074599266, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.000651079225627797, |
| "grad_norm": 3.765625, |
| "grad_norm_var": 7.330077107747396, |
| "learning_rate": 5e-05, |
| "loss": 0.382, |
| "loss/crossentropy": 1.4515001773834229, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.31640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06562215089797974, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.000659320734812959, |
| "grad_norm": 3.265625, |
| "grad_norm_var": 7.754378255208334, |
| "learning_rate": 5e-05, |
| "loss": 0.4323, |
| "loss/crossentropy": 1.748592495918274, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.34375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.08855777978897095, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.000667562243998121, |
| "grad_norm": 11.9375, |
| "grad_norm_var": 9.71513671875, |
| "learning_rate": 5e-05, |
| "loss": 0.465, |
| "loss/crossentropy": 2.6704611778259277, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.369140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.09580960124731064, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.0006758037531832829, |
| "grad_norm": 23.0, |
| "grad_norm_var": 27.69638671875, |
| "learning_rate": 5e-05, |
| "loss": 0.8228, |
| "loss/crossentropy": 0.5576035976409912, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.66796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.1548667550086975, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.0006840452623684449, |
| "grad_norm": 5.59375, |
| "grad_norm_var": 26.92584228515625, |
| "learning_rate": 5e-05, |
| "loss": 0.4884, |
| "loss/crossentropy": 0.8048841953277588, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.43359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05478814244270325, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.0006922867715536069, |
| "grad_norm": 4.96875, |
| "grad_norm_var": 27.102294921875, |
| "learning_rate": 5e-05, |
| "loss": 0.5603, |
| "loss/crossentropy": 2.0498743057250977, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.46875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0915648490190506, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.0007005282807387689, |
| "grad_norm": 7.8125, |
| "grad_norm_var": 26.90455322265625, |
| "learning_rate": 5e-05, |
| "loss": 0.5343, |
| "loss/crossentropy": 1.643184781074524, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.44140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.09292187541723251, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.0007087697899239309, |
| "grad_norm": 5.34375, |
| "grad_norm_var": 26.745003255208335, |
| "learning_rate": 5e-05, |
| "loss": 0.537, |
| "loss/crossentropy": 2.662973165512085, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.439453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.09757896512746811, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.0007170112991090929, |
| "grad_norm": 2.4375, |
| "grad_norm_var": 27.857975260416666, |
| "learning_rate": 5e-05, |
| "loss": 0.2826, |
| "loss/crossentropy": 1.4633480310440063, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.236328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0462251678109169, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.0007252528082942549, |
| "grad_norm": 7.21875, |
| "grad_norm_var": 27.32125244140625, |
| "learning_rate": 5e-05, |
| "loss": 0.7004, |
| "loss/crossentropy": 2.25626277923584, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.55859375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.14179712533950806, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.0007334943174794168, |
| "grad_norm": 10.0625, |
| "grad_norm_var": 27.510107421875, |
| "learning_rate": 5e-05, |
| "loss": 0.341, |
| "loss/crossentropy": 2.072801113128662, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.275390625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0656304880976677, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.0007417358266645788, |
| "grad_norm": 3.46875, |
| "grad_norm_var": 26.40260009765625, |
| "learning_rate": 5e-05, |
| "loss": 0.3528, |
| "loss/crossentropy": 1.6771039962768555, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.29296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.059821829199790955, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.0007499773358497408, |
| "grad_norm": 2.640625, |
| "grad_norm_var": 26.648696899414062, |
| "learning_rate": 5e-05, |
| "loss": 0.2959, |
| "loss/crossentropy": 1.3347995281219482, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.24609375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.049777351319789886, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.0007582188450349028, |
| "grad_norm": 5.21875, |
| "grad_norm_var": 26.492967732747395, |
| "learning_rate": 5e-05, |
| "loss": 0.4277, |
| "loss/crossentropy": 2.2866933345794678, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.34765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07999749481678009, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.0007664603542200648, |
| "grad_norm": 20.375, |
| "grad_norm_var": 37.763719685872395, |
| "learning_rate": 5e-05, |
| "loss": 0.592, |
| "loss/crossentropy": 0.5346123576164246, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.52734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06467436254024506, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.0007747018634052268, |
| "grad_norm": 5.28125, |
| "grad_norm_var": 36.951952107747395, |
| "learning_rate": 5e-05, |
| "loss": 0.5485, |
| "loss/crossentropy": 1.4174734354019165, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.46484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.08362259715795517, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.0007829433725903888, |
| "grad_norm": 3.5, |
| "grad_norm_var": 37.093912760416664, |
| "learning_rate": 5e-05, |
| "loss": 0.3081, |
| "loss/crossentropy": 1.5907094478607178, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.25390625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05422712862491608, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.0007911848817755507, |
| "grad_norm": 3.546875, |
| "grad_norm_var": 36.93508707682292, |
| "learning_rate": 5e-05, |
| "loss": 0.3014, |
| "loss/crossentropy": 1.4840497970581055, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.25, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05143030732870102, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.0007994263909607127, |
| "grad_norm": 6.03125, |
| "grad_norm_var": 35.73922526041667, |
| "learning_rate": 5e-05, |
| "loss": 0.5235, |
| "loss/crossentropy": 1.9094655513763428, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.416015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.10748349130153656, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.0008076679001458747, |
| "grad_norm": 5.53125, |
| "grad_norm_var": 18.19996337890625, |
| "learning_rate": 5e-05, |
| "loss": 0.3909, |
| "loss/crossentropy": 1.4463390111923218, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06276652216911316, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.0008159094093310367, |
| "grad_norm": 7.03125, |
| "grad_norm_var": 18.214937337239583, |
| "learning_rate": 5e-05, |
| "loss": 0.5097, |
| "loss/crossentropy": 2.1369168758392334, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.3984375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.11129927635192871, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.0008241509185161987, |
| "grad_norm": 2.28125, |
| "grad_norm_var": 19.135965983072918, |
| "learning_rate": 5e-05, |
| "loss": 0.2675, |
| "loss/crossentropy": 1.7142083644866943, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.22265625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.044855352491140366, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.0008323924277013607, |
| "grad_norm": 5.25, |
| "grad_norm_var": 18.965132649739584, |
| "learning_rate": 5e-05, |
| "loss": 0.3094, |
| "loss/crossentropy": 0.9302163124084473, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.267578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.041776590049266815, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.0008406339368865227, |
| "grad_norm": 9.9375, |
| "grad_norm_var": 19.911995442708335, |
| "learning_rate": 5e-05, |
| "loss": 0.4483, |
| "loss/crossentropy": 2.0367283821105957, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.36328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.08506779372692108, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.0008488754460716846, |
| "grad_norm": 2.984375, |
| "grad_norm_var": 19.65354715983073, |
| "learning_rate": 5e-05, |
| "loss": 0.3074, |
| "loss/crossentropy": 2.5578417778015137, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.25, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05742755904793739, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.0008571169552568466, |
| "grad_norm": 3.96875, |
| "grad_norm_var": 19.903644816080728, |
| "learning_rate": 5e-05, |
| "loss": 0.3362, |
| "loss/crossentropy": 1.235908031463623, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.28515625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05100230872631073, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.0008653584644420086, |
| "grad_norm": 13.1875, |
| "grad_norm_var": 22.177814737955728, |
| "learning_rate": 5e-05, |
| "loss": 0.5365, |
| "loss/crossentropy": 1.3280326128005981, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.4765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05990615114569664, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.0008735999736271706, |
| "grad_norm": 3.640625, |
| "grad_norm_var": 22.11558837890625, |
| "learning_rate": 5e-05, |
| "loss": 0.3851, |
| "loss/crossentropy": 2.764561414718628, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.08821941912174225, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.0008818414828123326, |
| "grad_norm": 5.875, |
| "grad_norm_var": 21.201919555664062, |
| "learning_rate": 5e-05, |
| "loss": 0.3966, |
| "loss/crossentropy": 2.7057063579559326, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.31640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.08024018257856369, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.0008900829919974946, |
| "grad_norm": 3.375, |
| "grad_norm_var": 21.723835245768228, |
| "learning_rate": 5e-05, |
| "loss": 0.3745, |
| "loss/crossentropy": 1.9366930723190308, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.08542559295892715, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.0008983245011826566, |
| "grad_norm": 3.796875, |
| "grad_norm_var": 7.927079264322916, |
| "learning_rate": 5e-05, |
| "loss": 0.3125, |
| "loss/crossentropy": 2.499528408050537, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.25, |
| "loss/idx": 0.0, |
| "loss/logits": 0.062451932579278946, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.0009065660103678185, |
| "grad_norm": 4.59375, |
| "grad_norm_var": 7.960738118489584, |
| "learning_rate": 5e-05, |
| "loss": 0.3585, |
| "loss/crossentropy": 2.2361199855804443, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.28515625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07333969324827194, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.0009148075195529805, |
| "grad_norm": 2.671875, |
| "grad_norm_var": 8.200495402018229, |
| "learning_rate": 5e-05, |
| "loss": 0.2519, |
| "loss/crossentropy": 1.3813279867172241, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.21484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03708701953291893, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.0009230490287381425, |
| "grad_norm": 6.875, |
| "grad_norm_var": 8.145243326822916, |
| "learning_rate": 5e-05, |
| "loss": 0.6369, |
| "loss/crossentropy": 0.7904279232025146, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.55859375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07834139466285706, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.0009312905379233045, |
| "grad_norm": 26.625, |
| "grad_norm_var": 36.27662760416667, |
| "learning_rate": 5e-05, |
| "loss": 0.811, |
| "loss/crossentropy": 1.7555081844329834, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.6171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.19379198551177979, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.0009395320471084665, |
| "grad_norm": 20.875, |
| "grad_norm_var": 48.545633951822914, |
| "learning_rate": 5e-05, |
| "loss": 0.5722, |
| "loss/crossentropy": 2.7940163612365723, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.455078125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.11716997623443604, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.0009477735562936285, |
| "grad_norm": 4.1875, |
| "grad_norm_var": 49.299153645833336, |
| "learning_rate": 5e-05, |
| "loss": 0.3537, |
| "loss/crossentropy": 2.1036393642425537, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.291015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06271170824766159, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.0009560150654787905, |
| "grad_norm": 2.84375, |
| "grad_norm_var": 48.926936848958334, |
| "learning_rate": 5e-05, |
| "loss": 0.2983, |
| "loss/crossentropy": 1.6935715675354004, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.25390625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04436497390270233, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.0009642565746639524, |
| "grad_norm": 2.1875, |
| "grad_norm_var": 50.4494140625, |
| "learning_rate": 5e-05, |
| "loss": 0.2826, |
| "loss/crossentropy": 0.9073331356048584, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.25, |
| "loss/idx": 0.0, |
| "loss/logits": 0.032610610127449036, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.0009724980838491144, |
| "grad_norm": 2.203125, |
| "grad_norm_var": 51.52145080566406, |
| "learning_rate": 5e-05, |
| "loss": 0.1972, |
| "loss/crossentropy": 1.5172605514526367, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1669921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.030240532010793686, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.0009807395930342764, |
| "grad_norm": 3.609375, |
| "grad_norm_var": 51.222215779622395, |
| "learning_rate": 5e-05, |
| "loss": 0.3043, |
| "loss/crossentropy": 1.3851293325424194, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04644084721803665, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.0009889811022194385, |
| "grad_norm": 11.5, |
| "grad_norm_var": 51.8164784749349, |
| "learning_rate": 5e-05, |
| "loss": 0.3607, |
| "loss/crossentropy": 1.786331057548523, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.30078125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.059897445142269135, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.0009972226114046004, |
| "grad_norm": 3.421875, |
| "grad_norm_var": 50.212398274739584, |
| "learning_rate": 5e-05, |
| "loss": 0.3368, |
| "loss/crossentropy": 1.3703113794326782, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.283203125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05364468693733215, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.0010054641205897623, |
| "grad_norm": 5.6875, |
| "grad_norm_var": 49.62085673014323, |
| "learning_rate": 5e-05, |
| "loss": 0.3796, |
| "loss/crossentropy": 1.5540196895599365, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.31640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06318466365337372, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.0010137056297749244, |
| "grad_norm": 3.15625, |
| "grad_norm_var": 50.45276590983073, |
| "learning_rate": 5e-05, |
| "loss": 0.2844, |
| "loss/crossentropy": 2.5752596855163574, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.224609375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05982211232185364, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.0010219471389600863, |
| "grad_norm": 7.9375, |
| "grad_norm_var": 49.715518188476565, |
| "learning_rate": 5e-05, |
| "loss": 0.6157, |
| "loss/crossentropy": 2.424745798110962, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.47265625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.14306676387786865, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.0010301886481452484, |
| "grad_norm": 13.0625, |
| "grad_norm_var": 51.110791015625, |
| "learning_rate": 5e-05, |
| "loss": 0.3906, |
| "loss/crossentropy": 1.6216858625411987, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.32421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06637328118085861, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.0010384301573304103, |
| "grad_norm": 10.3125, |
| "grad_norm_var": 50.87027587890625, |
| "learning_rate": 5e-05, |
| "loss": 0.4044, |
| "loss/crossentropy": 2.009226083755493, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.32421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.08017371594905853, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.0010466716665155724, |
| "grad_norm": 2.734375, |
| "grad_norm_var": 50.826558430989586, |
| "learning_rate": 5e-05, |
| "loss": 0.2593, |
| "loss/crossentropy": 0.31721383333206177, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.240234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.019071679562330246, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.0010549131757007343, |
| "grad_norm": 9.0625, |
| "grad_norm_var": 50.811747233072914, |
| "learning_rate": 5e-05, |
| "loss": 0.4044, |
| "loss/crossentropy": 1.5959731340408325, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.3359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06845791637897491, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.0010631546848858962, |
| "grad_norm": 6.53125, |
| "grad_norm_var": 26.382666015625, |
| "learning_rate": 5e-05, |
| "loss": 0.404, |
| "loss/crossentropy": 1.000110387802124, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.345703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0582566112279892, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.0010713961940710583, |
| "grad_norm": 13.375, |
| "grad_norm_var": 15.855322265625, |
| "learning_rate": 5e-05, |
| "loss": 0.4836, |
| "loss/crossentropy": 1.4723174571990967, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.41015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07343296706676483, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.0010796377032562202, |
| "grad_norm": 1.8359375, |
| "grad_norm_var": 16.883135732014974, |
| "learning_rate": 5e-05, |
| "loss": 0.2167, |
| "loss/crossentropy": 1.884359359741211, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.17578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04091912880539894, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.0010878792124413823, |
| "grad_norm": 9.0625, |
| "grad_norm_var": 16.503775787353515, |
| "learning_rate": 5e-05, |
| "loss": 0.335, |
| "loss/crossentropy": 2.47468638420105, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.265625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06939947605133057, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.0010961207216265442, |
| "grad_norm": 2.28125, |
| "grad_norm_var": 16.44910659790039, |
| "learning_rate": 5e-05, |
| "loss": 0.2479, |
| "loss/crossentropy": 1.4188505411148071, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.208984375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03889765217900276, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.0011043622308117063, |
| "grad_norm": 3.453125, |
| "grad_norm_var": 15.812143707275391, |
| "learning_rate": 5e-05, |
| "loss": 0.3702, |
| "loss/crossentropy": 2.618537425994873, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.27734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.09288428723812103, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.0011126037399968682, |
| "grad_norm": 14.5, |
| "grad_norm_var": 18.753179677327473, |
| "learning_rate": 5e-05, |
| "loss": 0.6568, |
| "loss/crossentropy": 3.4983789920806885, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.5078125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.14897578954696655, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.0011208452491820301, |
| "grad_norm": 3.078125, |
| "grad_norm_var": 18.548115793863932, |
| "learning_rate": 5e-05, |
| "loss": 0.2164, |
| "loss/crossentropy": 0.7671207189559937, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.028927473351359367, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.0011290867583671922, |
| "grad_norm": 3.015625, |
| "grad_norm_var": 18.743755849202476, |
| "learning_rate": 5e-05, |
| "loss": 0.2917, |
| "loss/crossentropy": 1.4987382888793945, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.240234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.051444459706544876, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.0011373282675523541, |
| "grad_norm": 2.515625, |
| "grad_norm_var": 19.85060806274414, |
| "learning_rate": 5e-05, |
| "loss": 0.2295, |
| "loss/crossentropy": 1.1994467973709106, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1982421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03128223866224289, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.0011455697767375162, |
| "grad_norm": 11.625, |
| "grad_norm_var": 20.42235895792643, |
| "learning_rate": 5e-05, |
| "loss": 0.3444, |
| "loss/crossentropy": 1.7283226251602173, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.287109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05732431262731552, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.0011538112859226781, |
| "grad_norm": 4.09375, |
| "grad_norm_var": 20.941615549723306, |
| "learning_rate": 5e-05, |
| "loss": 0.3394, |
| "loss/crossentropy": 2.3807051181793213, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.26171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07771667838096619, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.0011620527951078403, |
| "grad_norm": 4.4375, |
| "grad_norm_var": 18.514149729410807, |
| "learning_rate": 5e-05, |
| "loss": 0.4546, |
| "loss/crossentropy": 3.182520866394043, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.34765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.10692334175109863, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.0011702943042930021, |
| "grad_norm": 4.34375, |
| "grad_norm_var": 17.60290501912435, |
| "learning_rate": 5e-05, |
| "loss": 0.2908, |
| "loss/crossentropy": 1.5368177890777588, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2431640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04762953519821167, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.001178535813478164, |
| "grad_norm": 4.5, |
| "grad_norm_var": 17.029766591389976, |
| "learning_rate": 5e-05, |
| "loss": 0.407, |
| "loss/crossentropy": 2.778043270111084, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.302734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.1042385995388031, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.0011867773226633262, |
| "grad_norm": 4.15625, |
| "grad_norm_var": 16.600789133707682, |
| "learning_rate": 5e-05, |
| "loss": 0.3435, |
| "loss/crossentropy": 2.792048692703247, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.267578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07588605582714081, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.001195018831848488, |
| "grad_norm": 3.8125, |
| "grad_norm_var": 16.797792307535808, |
| "learning_rate": 5e-05, |
| "loss": 0.2385, |
| "loss/crossentropy": 1.401113510131836, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.205078125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03345421701669693, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.0012032603410336502, |
| "grad_norm": 6.875, |
| "grad_norm_var": 12.726405588785807, |
| "learning_rate": 5e-05, |
| "loss": 0.3725, |
| "loss/crossentropy": 2.2165474891662598, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07561925053596497, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.001211501850218812, |
| "grad_norm": 3.171875, |
| "grad_norm_var": 12.234430948893229, |
| "learning_rate": 5e-05, |
| "loss": 0.2506, |
| "loss/crossentropy": 2.589618444442749, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.19921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05137525126338005, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.0012197433594039742, |
| "grad_norm": 22.25, |
| "grad_norm_var": 29.706151326497395, |
| "learning_rate": 5e-05, |
| "loss": 0.542, |
| "loss/crossentropy": 1.4461145401000977, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.47265625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06938936561346054, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.001227984868589136, |
| "grad_norm": 4.75, |
| "grad_norm_var": 28.819587198893228, |
| "learning_rate": 5e-05, |
| "loss": 0.3594, |
| "loss/crossentropy": 1.5630475282669067, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.28125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07812213897705078, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.001236226377774298, |
| "grad_norm": 3.09375, |
| "grad_norm_var": 28.963407389322917, |
| "learning_rate": 5e-05, |
| "loss": 0.2805, |
| "loss/crossentropy": 1.3344874382019043, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04612912982702255, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.00124446788695946, |
| "grad_norm": 6.34375, |
| "grad_norm_var": 24.164176432291665, |
| "learning_rate": 5e-05, |
| "loss": 0.3487, |
| "loss/crossentropy": 2.1057682037353516, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.28125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06741908937692642, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.001252709396144622, |
| "grad_norm": 3.078125, |
| "grad_norm_var": 24.164176432291665, |
| "learning_rate": 5e-05, |
| "loss": 0.3282, |
| "loss/crossentropy": 2.7360680103302, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.248046875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0801510438323021, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.001260950905329784, |
| "grad_norm": 3.265625, |
| "grad_norm_var": 24.076806640625, |
| "learning_rate": 5e-05, |
| "loss": 0.2896, |
| "loss/crossentropy": 2.9115164279937744, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2265625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06300797313451767, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.001269192414514946, |
| "grad_norm": 3.109375, |
| "grad_norm_var": 23.841239420572915, |
| "learning_rate": 5e-05, |
| "loss": 0.2801, |
| "loss/crossentropy": 2.20858097076416, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.22265625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05741541087627411, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.001277433923700108, |
| "grad_norm": 3.4375, |
| "grad_norm_var": 21.679227701822917, |
| "learning_rate": 5e-05, |
| "loss": 0.2616, |
| "loss/crossentropy": 2.4968795776367188, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.205078125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05649275332689285, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.00128567543288527, |
| "grad_norm": 1.90625, |
| "grad_norm_var": 22.328641764322917, |
| "learning_rate": 5e-05, |
| "loss": 0.2142, |
| "loss/crossentropy": 1.6698881387710571, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.177734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03645133972167969, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.0012939169420704319, |
| "grad_norm": 2.84375, |
| "grad_norm_var": 22.640543619791668, |
| "learning_rate": 5e-05, |
| "loss": 0.2762, |
| "loss/crossentropy": 2.5742998123168945, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.208984375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06718096137046814, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.001302158451255594, |
| "grad_norm": 13.0, |
| "grad_norm_var": 26.498661295572916, |
| "learning_rate": 5e-05, |
| "loss": 0.3361, |
| "loss/crossentropy": 2.4912259578704834, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.26953125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06653441488742828, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.0013103999604407559, |
| "grad_norm": 2.28125, |
| "grad_norm_var": 27.131640625, |
| "learning_rate": 5e-05, |
| "loss": 0.244, |
| "loss/crossentropy": 1.5568723678588867, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1923828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05163004621863365, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.001318641469625918, |
| "grad_norm": 2.90625, |
| "grad_norm_var": 27.446744791666667, |
| "learning_rate": 5e-05, |
| "loss": 0.2633, |
| "loss/crossentropy": 2.150268793106079, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.212890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.050429798662662506, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.0013268829788110799, |
| "grad_norm": 3.6875, |
| "grad_norm_var": 27.473893229166666, |
| "learning_rate": 5e-05, |
| "loss": 0.2778, |
| "loss/crossentropy": 1.7107495069503784, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.21875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.059024274349212646, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.001335124487996242, |
| "grad_norm": 5.46875, |
| "grad_norm_var": 27.316239420572916, |
| "learning_rate": 5e-05, |
| "loss": 0.3054, |
| "loss/crossentropy": 2.609410285949707, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.236328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06908264756202698, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.0013433659971814039, |
| "grad_norm": 2.1875, |
| "grad_norm_var": 27.654426066080728, |
| "learning_rate": 5e-05, |
| "loss": 0.1832, |
| "loss/crossentropy": 1.302159309387207, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1552734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.027962597087025642, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.0013516075063665658, |
| "grad_norm": 4.96875, |
| "grad_norm_var": 7.092438761393229, |
| "learning_rate": 5e-05, |
| "loss": 0.275, |
| "loss/crossentropy": 1.3545722961425781, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.23828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03673800453543663, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.0013598490155517279, |
| "grad_norm": 5.625, |
| "grad_norm_var": 7.210814412434896, |
| "learning_rate": 5e-05, |
| "loss": 0.3255, |
| "loss/crossentropy": 2.3857431411743164, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.267578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05789117142558098, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.0013680905247368898, |
| "grad_norm": 2.3125, |
| "grad_norm_var": 7.364216105143229, |
| "learning_rate": 5e-05, |
| "loss": 0.224, |
| "loss/crossentropy": 1.6262630224227905, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1865234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03748723864555359, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.0013763320339220519, |
| "grad_norm": 3.515625, |
| "grad_norm_var": 7.037398274739584, |
| "learning_rate": 5e-05, |
| "loss": 0.3005, |
| "loss/crossentropy": 2.802839756011963, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05828278884291649, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.0013845735431072138, |
| "grad_norm": 5.9375, |
| "grad_norm_var": 7.206615193684896, |
| "learning_rate": 5e-05, |
| "loss": 0.3168, |
| "loss/crossentropy": 2.6197855472564697, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.24609375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07069416344165802, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.0013928150522923759, |
| "grad_norm": 2.609375, |
| "grad_norm_var": 7.311205037434896, |
| "learning_rate": 5e-05, |
| "loss": 0.2188, |
| "loss/crossentropy": 2.0093469619750977, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.177734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04102495685219765, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.0014010565614775378, |
| "grad_norm": 9.0625, |
| "grad_norm_var": 8.730110677083333, |
| "learning_rate": 5e-05, |
| "loss": 0.4723, |
| "loss/crossentropy": 0.45764070749282837, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05042431876063347, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.0014092980706626997, |
| "grad_norm": 10.6875, |
| "grad_norm_var": 11.003287760416667, |
| "learning_rate": 5e-05, |
| "loss": 0.2967, |
| "loss/crossentropy": 1.4506618976593018, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.25, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04666414484381676, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.0014175395798478618, |
| "grad_norm": 2.734375, |
| "grad_norm_var": 10.711449178059896, |
| "learning_rate": 5e-05, |
| "loss": 0.1833, |
| "loss/crossentropy": 1.2239363193511963, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1494140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03392494469881058, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.0014257810890330237, |
| "grad_norm": 9.25, |
| "grad_norm_var": 11.44383036295573, |
| "learning_rate": 5e-05, |
| "loss": 0.4527, |
| "loss/crossentropy": 2.3572747707366943, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.365234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.08745455741882324, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.0014340225982181858, |
| "grad_norm": 3.328125, |
| "grad_norm_var": 7.476220703125, |
| "learning_rate": 5e-05, |
| "loss": 0.3071, |
| "loss/crossentropy": 2.6207613945007324, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.23828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06882129609584808, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.0014422641074033477, |
| "grad_norm": 12.8125, |
| "grad_norm_var": 10.892020670572917, |
| "learning_rate": 5e-05, |
| "loss": 0.3909, |
| "loss/crossentropy": 2.3904902935028076, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.3046875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.08622868359088898, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.0014505056165885098, |
| "grad_norm": 6.78125, |
| "grad_norm_var": 10.519657389322917, |
| "learning_rate": 5e-05, |
| "loss": 0.4847, |
| "loss/crossentropy": 2.4582583904266357, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.40625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07843705266714096, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.0014587471257736717, |
| "grad_norm": 6.125, |
| "grad_norm_var": 10.241630045572917, |
| "learning_rate": 5e-05, |
| "loss": 0.3267, |
| "loss/crossentropy": 2.8331282138824463, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.248046875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07867051661014557, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.0014669886349588336, |
| "grad_norm": 4.21875, |
| "grad_norm_var": 10.400809733072917, |
| "learning_rate": 5e-05, |
| "loss": 0.4616, |
| "loss/crossentropy": 2.921239137649536, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.35546875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.10613523423671722, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.0014752301441439957, |
| "grad_norm": 2.96875, |
| "grad_norm_var": 10.066845703125, |
| "learning_rate": 5e-05, |
| "loss": 0.2874, |
| "loss/crossentropy": 2.7095119953155518, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2333984375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05400337651371956, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.0014834716533291576, |
| "grad_norm": 2.71875, |
| "grad_norm_var": 10.635205078125, |
| "learning_rate": 5e-05, |
| "loss": 0.3012, |
| "loss/crossentropy": 2.5020012855529785, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.23046875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07070466130971909, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.0014917131625143197, |
| "grad_norm": 2.375, |
| "grad_norm_var": 11.313981119791666, |
| "learning_rate": 5e-05, |
| "loss": 0.2055, |
| "loss/crossentropy": 1.2848535776138306, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1767578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02870912477374077, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.0014999546716994816, |
| "grad_norm": 4.4375, |
| "grad_norm_var": 10.703043619791666, |
| "learning_rate": 5e-05, |
| "loss": 0.2174, |
| "loss/crossentropy": 0.40366628766059875, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.201171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.016210440546274185, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.0015081961808846437, |
| "grad_norm": 6.28125, |
| "grad_norm_var": 10.41333719889323, |
| "learning_rate": 5e-05, |
| "loss": 0.3403, |
| "loss/crossentropy": 2.8000519275665283, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.263671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07665810734033585, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.0015164376900698056, |
| "grad_norm": 2.15625, |
| "grad_norm_var": 11.22276102701823, |
| "learning_rate": 5e-05, |
| "loss": 0.2103, |
| "loss/crossentropy": 1.5652306079864502, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1708984375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.039365194737911224, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.0015246791992549675, |
| "grad_norm": 50.5, |
| "grad_norm_var": 135.891162109375, |
| "learning_rate": 5e-05, |
| "loss": 0.5337, |
| "loss/crossentropy": 1.4967753887176514, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.4453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.08833958208560944, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.0015329207084401296, |
| "grad_norm": 6.53125, |
| "grad_norm_var": 136.11099853515626, |
| "learning_rate": 5e-05, |
| "loss": 0.3785, |
| "loss/crossentropy": 2.4065871238708496, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.08948490023612976, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.0015411622176252915, |
| "grad_norm": 6.125, |
| "grad_norm_var": 136.0016886393229, |
| "learning_rate": 5e-05, |
| "loss": 0.315, |
| "loss/crossentropy": 2.2277615070343018, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.244140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07082026451826096, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.0015494037268104536, |
| "grad_norm": 2.578125, |
| "grad_norm_var": 136.11466471354166, |
| "learning_rate": 5e-05, |
| "loss": 0.2275, |
| "loss/crossentropy": 2.0500736236572266, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1806640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04683098569512367, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.0015576452359956155, |
| "grad_norm": 2.203125, |
| "grad_norm_var": 138.1135732014974, |
| "learning_rate": 5e-05, |
| "loss": 0.1763, |
| "loss/crossentropy": 0.851466953754425, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1552734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.021058566868305206, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.0015658867451807776, |
| "grad_norm": 10.9375, |
| "grad_norm_var": 137.36402994791666, |
| "learning_rate": 5e-05, |
| "loss": 0.4211, |
| "loss/crossentropy": 1.5812166929244995, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.33984375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.08130454272031784, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.0015741282543659395, |
| "grad_norm": 5.75, |
| "grad_norm_var": 136.052685546875, |
| "learning_rate": 5e-05, |
| "loss": 0.2656, |
| "loss/crossentropy": 1.3986968994140625, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2216796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04394121095538139, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.0015823697635511014, |
| "grad_norm": 2.203125, |
| "grad_norm_var": 137.9039052327474, |
| "learning_rate": 5e-05, |
| "loss": 0.2049, |
| "loss/crossentropy": 2.949207067489624, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1591796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04576685652136803, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.0015906112727362635, |
| "grad_norm": 206.0, |
| "grad_norm_var": 2601.2852040608723, |
| "learning_rate": 5e-05, |
| "loss": 1.1479, |
| "loss/crossentropy": 1.637980580329895, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 1.015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.13231301307678223, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.0015988527819214254, |
| "grad_norm": 4.28125, |
| "grad_norm_var": 2601.1549875895184, |
| "learning_rate": 5e-05, |
| "loss": 0.294, |
| "loss/crossentropy": 2.0168232917785645, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.23828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.055670544505119324, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.0016070942911065875, |
| "grad_norm": 5.4375, |
| "grad_norm_var": 2595.9699696858725, |
| "learning_rate": 5e-05, |
| "loss": 0.3063, |
| "loss/crossentropy": 2.8578052520751953, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2275390625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07876630872488022, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.0016153358002917494, |
| "grad_norm": 2.5, |
| "grad_norm_var": 2596.477936808268, |
| "learning_rate": 5e-05, |
| "loss": 0.1944, |
| "loss/crossentropy": 1.4366533756256104, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.166015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.028374146670103073, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.0016235773094769115, |
| "grad_norm": 2.09375, |
| "grad_norm_var": 2597.144513956706, |
| "learning_rate": 5e-05, |
| "loss": 0.2272, |
| "loss/crossentropy": 1.4764188528060913, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1845703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04259010776877403, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.0016318188186620734, |
| "grad_norm": 8.25, |
| "grad_norm_var": 2590.14152730306, |
| "learning_rate": 5e-05, |
| "loss": 0.2137, |
| "loss/crossentropy": 0.44381940364837646, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1884765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.025243356823921204, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.0016400603278472353, |
| "grad_norm": 3.703125, |
| "grad_norm_var": 2595.355013020833, |
| "learning_rate": 5e-05, |
| "loss": 0.2982, |
| "loss/crossentropy": 2.862804889678955, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.23046875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06769528239965439, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.0016483018370323974, |
| "grad_norm": 30.625, |
| "grad_norm_var": 2577.980920410156, |
| "learning_rate": 5e-05, |
| "loss": 0.5157, |
| "loss/crossentropy": 2.726966142654419, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.4296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.08603046834468842, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.0016565433462175593, |
| "grad_norm": 8.4375, |
| "grad_norm_var": 2527.9221638997396, |
| "learning_rate": 5e-05, |
| "loss": 0.4416, |
| "loss/crossentropy": 1.4357587099075317, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.37109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07054366171360016, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.0016647848554027214, |
| "grad_norm": 3.515625, |
| "grad_norm_var": 2533.595897420247, |
| "learning_rate": 5e-05, |
| "loss": 0.2845, |
| "loss/crossentropy": 2.2265231609344482, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.21875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06571735441684723, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.0016730263645878833, |
| "grad_norm": 8.1875, |
| "grad_norm_var": 2530.3101308186847, |
| "learning_rate": 5e-05, |
| "loss": 0.3526, |
| "loss/crossentropy": 2.462019681930542, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.28125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0713062509894371, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.0016812678737730454, |
| "grad_norm": 2.046875, |
| "grad_norm_var": 2531.50295308431, |
| "learning_rate": 5e-05, |
| "loss": 0.2505, |
| "loss/crossentropy": 2.9555587768554688, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1923828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05816446244716644, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.0016895093829582073, |
| "grad_norm": 4.3125, |
| "grad_norm_var": 2527.0187459309896, |
| "learning_rate": 5e-05, |
| "loss": 0.3139, |
| "loss/crossentropy": 1.3578487634658813, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.267578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04633466899394989, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.0016977508921433692, |
| "grad_norm": 5.03125, |
| "grad_norm_var": 2535.7589192708333, |
| "learning_rate": 5e-05, |
| "loss": 0.2753, |
| "loss/crossentropy": 2.804027557373047, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06441053748130798, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.0017059924013285313, |
| "grad_norm": 2.859375, |
| "grad_norm_var": 2541.3487782796224, |
| "learning_rate": 5e-05, |
| "loss": 0.2116, |
| "loss/crossentropy": 1.8921546936035156, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1767578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.034831516444683075, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.0017142339105136932, |
| "grad_norm": 1.765625, |
| "grad_norm_var": 2542.324095662435, |
| "learning_rate": 5e-05, |
| "loss": 0.2004, |
| "loss/crossentropy": 2.6469180583953857, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1572265625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04312657564878464, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.0017224754196988553, |
| "grad_norm": 5.65625, |
| "grad_norm_var": 47.41833394368489, |
| "learning_rate": 5e-05, |
| "loss": 0.3427, |
| "loss/crossentropy": 1.994149088859558, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.27734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06540031731128693, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.0017307169288840172, |
| "grad_norm": 3.671875, |
| "grad_norm_var": 47.59491780598958, |
| "learning_rate": 5e-05, |
| "loss": 0.2391, |
| "loss/crossentropy": 1.5324124097824097, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1953125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04377663880586624, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.0017389584380691793, |
| "grad_norm": 7.125, |
| "grad_norm_var": 47.61689046223958, |
| "learning_rate": 5e-05, |
| "loss": 0.5363, |
| "loss/crossentropy": 2.5077903270721436, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.4453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.09098894894123077, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.0017471999472543412, |
| "grad_norm": 2.5, |
| "grad_norm_var": 47.61689046223958, |
| "learning_rate": 5e-05, |
| "loss": 0.2245, |
| "loss/crossentropy": 1.6434502601623535, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.181640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.042850345373153687, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.0017554414564395031, |
| "grad_norm": 3.0625, |
| "grad_norm_var": 47.140462239583336, |
| "learning_rate": 5e-05, |
| "loss": 0.2987, |
| "loss/crossentropy": 2.1260766983032227, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.228515625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07022828608751297, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.0017636829656246652, |
| "grad_norm": 2.703125, |
| "grad_norm_var": 47.61895243326823, |
| "learning_rate": 5e-05, |
| "loss": 0.3036, |
| "loss/crossentropy": 2.342567205429077, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.24609375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.057552557438611984, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.0017719244748098271, |
| "grad_norm": 3.5625, |
| "grad_norm_var": 47.66232096354167, |
| "learning_rate": 5e-05, |
| "loss": 0.2831, |
| "loss/crossentropy": 2.4342286586761475, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.21875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06433624029159546, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.0017801659839949892, |
| "grad_norm": 4.59375, |
| "grad_norm_var": 4.341304524739583, |
| "learning_rate": 5e-05, |
| "loss": 0.2732, |
| "loss/crossentropy": 1.6944836378097534, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.220703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05249807611107826, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.0017884074931801511, |
| "grad_norm": 3.015625, |
| "grad_norm_var": 3.197980753580729, |
| "learning_rate": 5e-05, |
| "loss": 0.2028, |
| "loss/crossentropy": 1.4322035312652588, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.173828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.028966199606657028, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.0017966490023653132, |
| "grad_norm": 8.1875, |
| "grad_norm_var": 4.275614420572917, |
| "learning_rate": 5e-05, |
| "loss": 0.4244, |
| "loss/crossentropy": 2.7989346981048584, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.3359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.08842961490154266, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.0018048905115504751, |
| "grad_norm": 2.359375, |
| "grad_norm_var": 3.3524485270182294, |
| "learning_rate": 5e-05, |
| "loss": 0.1763, |
| "loss/crossentropy": 0.49160024523735046, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.15625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.020072361454367638, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.001813132020735637, |
| "grad_norm": 2.03125, |
| "grad_norm_var": 3.356331380208333, |
| "learning_rate": 5e-05, |
| "loss": 0.1975, |
| "loss/crossentropy": 0.9580312967300415, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.169921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.027562592178583145, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.0018213735299207991, |
| "grad_norm": 4.46875, |
| "grad_norm_var": 3.366402180989583, |
| "learning_rate": 5e-05, |
| "loss": 0.2313, |
| "loss/crossentropy": 2.2378625869750977, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.181640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04968283697962761, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.001829615039105961, |
| "grad_norm": 2.609375, |
| "grad_norm_var": 3.3716054280598957, |
| "learning_rate": 5e-05, |
| "loss": 0.2638, |
| "loss/crossentropy": 1.2911431789398193, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.21484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04894676432013512, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.0018378565482911231, |
| "grad_norm": 4.125, |
| "grad_norm_var": 3.3196126302083333, |
| "learning_rate": 5e-05, |
| "loss": 0.2351, |
| "loss/crossentropy": 2.6005423069000244, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.185546875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04953521490097046, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.001846098057476285, |
| "grad_norm": 2.109375, |
| "grad_norm_var": 3.2319295247395834, |
| "learning_rate": 5e-05, |
| "loss": 0.1785, |
| "loss/crossentropy": 1.635225772857666, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1455078125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03304152935743332, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.0018543395666614471, |
| "grad_norm": 6.4375, |
| "grad_norm_var": 3.457047526041667, |
| "learning_rate": 5e-05, |
| "loss": 0.5431, |
| "loss/crossentropy": 2.507209062576294, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.404296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.13885299861431122, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.001862581075846609, |
| "grad_norm": 2.984375, |
| "grad_norm_var": 3.508430989583333, |
| "learning_rate": 5e-05, |
| "loss": 0.181, |
| "loss/crossentropy": 0.42544418573379517, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1611328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.019888322800397873, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.001870822585031771, |
| "grad_norm": 3.140625, |
| "grad_norm_var": 2.7699208577473957, |
| "learning_rate": 5e-05, |
| "loss": 0.2789, |
| "loss/crossentropy": 2.700981378555298, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2138671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06499424576759338, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.001879064094216933, |
| "grad_norm": 5.3125, |
| "grad_norm_var": 2.8449940999348957, |
| "learning_rate": 5e-05, |
| "loss": 0.2954, |
| "loss/crossentropy": 1.6264232397079468, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2451171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.050260186195373535, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.001887305603402095, |
| "grad_norm": 5.96875, |
| "grad_norm_var": 3.089452107747396, |
| "learning_rate": 5e-05, |
| "loss": 0.1884, |
| "loss/crossentropy": 1.3441599607467651, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1669921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.021441757678985596, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.001895547112587257, |
| "grad_norm": 2.171875, |
| "grad_norm_var": 3.197223917643229, |
| "learning_rate": 5e-05, |
| "loss": 0.1824, |
| "loss/crossentropy": 0.4492271840572357, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.162109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02026466839015484, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.001903788621772419, |
| "grad_norm": 3.21875, |
| "grad_norm_var": 3.222020467122396, |
| "learning_rate": 5e-05, |
| "loss": 0.2551, |
| "loss/crossentropy": 2.23905873298645, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.20703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04803081601858139, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.001912030130957581, |
| "grad_norm": 2.1875, |
| "grad_norm_var": 3.368024698893229, |
| "learning_rate": 5e-05, |
| "loss": 0.216, |
| "loss/crossentropy": 1.9740031957626343, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.169921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04604914411902428, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.001920271640142743, |
| "grad_norm": 2.125, |
| "grad_norm_var": 3.5072428385416665, |
| "learning_rate": 5e-05, |
| "loss": 0.2327, |
| "loss/crossentropy": 2.738755226135254, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.18359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04907117411494255, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.0019285131493279048, |
| "grad_norm": 4.21875, |
| "grad_norm_var": 2.1248982747395835, |
| "learning_rate": 5e-05, |
| "loss": 0.4989, |
| "loss/crossentropy": 2.8038580417633057, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.390625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.10827778279781342, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.001936754658513067, |
| "grad_norm": 2.84375, |
| "grad_norm_var": 2.068040974934896, |
| "learning_rate": 5e-05, |
| "loss": 0.3216, |
| "loss/crossentropy": 2.015542984008789, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04818400368094444, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.0019449961676982288, |
| "grad_norm": 3.203125, |
| "grad_norm_var": 1.9248372395833333, |
| "learning_rate": 5e-05, |
| "loss": 0.2473, |
| "loss/crossentropy": 1.5457327365875244, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.205078125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04223756492137909, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.001953237676883391, |
| "grad_norm": 3.15625, |
| "grad_norm_var": 1.8752766927083333, |
| "learning_rate": 5e-05, |
| "loss": 0.3016, |
| "loss/crossentropy": 2.3469016551971436, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.24609375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05547412484884262, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.001961479186068553, |
| "grad_norm": 3.578125, |
| "grad_norm_var": 1.8204060872395833, |
| "learning_rate": 5e-05, |
| "loss": 0.2486, |
| "loss/crossentropy": 2.7459280490875244, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.197265625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.051300592720508575, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.0019697206952537147, |
| "grad_norm": 1.9765625, |
| "grad_norm_var": 1.9438433329264322, |
| "learning_rate": 5e-05, |
| "loss": 0.2187, |
| "loss/crossentropy": 2.024442434310913, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04683014005422592, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.001977962204438877, |
| "grad_norm": 3.140625, |
| "grad_norm_var": 1.8308489481608072, |
| "learning_rate": 5e-05, |
| "loss": 0.2406, |
| "loss/crossentropy": 1.520363211631775, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.201171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.039388738572597504, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.001986203713624039, |
| "grad_norm": 2.609375, |
| "grad_norm_var": 1.2366920471191407, |
| "learning_rate": 5e-05, |
| "loss": 0.1882, |
| "loss/crossentropy": 2.6319429874420166, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.15234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.035828400403261185, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.001994445222809201, |
| "grad_norm": 9.3125, |
| "grad_norm_var": 3.5240455627441407, |
| "learning_rate": 5e-05, |
| "loss": 0.2554, |
| "loss/crossentropy": 1.4544413089752197, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.21875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.036612022668123245, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.0020026867319943627, |
| "grad_norm": 2.96875, |
| "grad_norm_var": 3.5372271219889324, |
| "learning_rate": 5e-05, |
| "loss": 0.2419, |
| "loss/crossentropy": 2.548147201538086, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1884765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.053470924496650696, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.0020109282411795246, |
| "grad_norm": 11.8125, |
| "grad_norm_var": 7.640775299072265, |
| "learning_rate": 5e-05, |
| "loss": 0.3306, |
| "loss/crossentropy": 0.6415009498596191, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.294921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.035646334290504456, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.002019169750364687, |
| "grad_norm": 3.203125, |
| "grad_norm_var": 7.404184722900391, |
| "learning_rate": 5e-05, |
| "loss": 0.2558, |
| "loss/crossentropy": 2.516376495361328, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.203125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.052696891129016876, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.002027411259549849, |
| "grad_norm": 2.53125, |
| "grad_norm_var": 7.3314674377441404, |
| "learning_rate": 5e-05, |
| "loss": 0.1816, |
| "loss/crossentropy": 0.31695130467414856, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.166015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.015581747516989708, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.0020356527687350108, |
| "grad_norm": 2.515625, |
| "grad_norm_var": 7.4243934631347654, |
| "learning_rate": 5e-05, |
| "loss": 0.2397, |
| "loss/crossentropy": 1.937793493270874, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05216747149825096, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.0020438942779201726, |
| "grad_norm": 2.46875, |
| "grad_norm_var": 7.367502593994141, |
| "learning_rate": 5e-05, |
| "loss": 0.2293, |
| "loss/crossentropy": 2.4479126930236816, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.181640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04766194522380829, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.0020521357871053345, |
| "grad_norm": 2.25, |
| "grad_norm_var": 7.339662424723307, |
| "learning_rate": 5e-05, |
| "loss": 0.1789, |
| "loss/crossentropy": 1.441886067390442, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.030450304970145226, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.002060377296290497, |
| "grad_norm": 6.25, |
| "grad_norm_var": 7.694205474853516, |
| "learning_rate": 5e-05, |
| "loss": 0.2986, |
| "loss/crossentropy": 2.495968818664551, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06425687670707703, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.0020686188054756588, |
| "grad_norm": 1.765625, |
| "grad_norm_var": 7.931449127197266, |
| "learning_rate": 5e-05, |
| "loss": 0.1676, |
| "loss/crossentropy": 1.5723477602005005, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1416015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.025984089821577072, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.0020768603146608207, |
| "grad_norm": 5.3125, |
| "grad_norm_var": 8.00752944946289, |
| "learning_rate": 5e-05, |
| "loss": 0.4023, |
| "loss/crossentropy": 1.5475258827209473, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.34375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05857189744710922, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.0020851018238459825, |
| "grad_norm": 4.96875, |
| "grad_norm_var": 7.996083323160807, |
| "learning_rate": 5e-05, |
| "loss": 0.3194, |
| "loss/crossentropy": 2.286716938018799, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.244140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07523184269666672, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.002093343333031145, |
| "grad_norm": 5.5, |
| "grad_norm_var": 8.076161448160807, |
| "learning_rate": 5e-05, |
| "loss": 0.2753, |
| "loss/crossentropy": 1.5914890766143799, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.23046875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.044802576303482056, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.0021015848422163068, |
| "grad_norm": 3.34375, |
| "grad_norm_var": 7.771882120768229, |
| "learning_rate": 5e-05, |
| "loss": 0.2477, |
| "loss/crossentropy": 2.1449875831604004, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1962890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0514422208070755, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.0021098263514014687, |
| "grad_norm": 9.3125, |
| "grad_norm_var": 9.1392578125, |
| "learning_rate": 5e-05, |
| "loss": 0.5091, |
| "loss/crossentropy": 2.605140447616577, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.408203125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.10090796649456024, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.0021180678605866306, |
| "grad_norm": 13.75, |
| "grad_norm_var": 13.705028279622395, |
| "learning_rate": 5e-05, |
| "loss": 0.3277, |
| "loss/crossentropy": 2.162487745285034, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.26953125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.058200109750032425, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.0021263093697717925, |
| "grad_norm": 3.140625, |
| "grad_norm_var": 12.910640462239583, |
| "learning_rate": 5e-05, |
| "loss": 0.2509, |
| "loss/crossentropy": 2.1336512565612793, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.197265625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.053586918860673904, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.0021345508789569548, |
| "grad_norm": 5.71875, |
| "grad_norm_var": 12.61343994140625, |
| "learning_rate": 5e-05, |
| "loss": 0.2007, |
| "loss/crossentropy": 0.37939441204071045, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.181640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.019016824662685394, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.0021427923881421167, |
| "grad_norm": 2.59375, |
| "grad_norm_var": 9.846614583333333, |
| "learning_rate": 5e-05, |
| "loss": 0.2159, |
| "loss/crossentropy": 1.1070560216903687, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.185546875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03035794384777546, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.0021510338973272786, |
| "grad_norm": 4.28125, |
| "grad_norm_var": 9.709251912434896, |
| "learning_rate": 5e-05, |
| "loss": 0.2899, |
| "loss/crossentropy": 1.604844331741333, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2294921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06044600158929825, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.0021592754065124405, |
| "grad_norm": 2.84375, |
| "grad_norm_var": 9.623680623372396, |
| "learning_rate": 5e-05, |
| "loss": 0.2858, |
| "loss/crossentropy": 2.6856131553649902, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.220703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06507028639316559, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.0021675169156976024, |
| "grad_norm": 3.734375, |
| "grad_norm_var": 9.353270467122396, |
| "learning_rate": 5e-05, |
| "loss": 0.3012, |
| "loss/crossentropy": 2.49045991897583, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.24609375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.055099453777074814, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.0021757584248827647, |
| "grad_norm": 4.65625, |
| "grad_norm_var": 8.964476521809896, |
| "learning_rate": 5e-05, |
| "loss": 0.3876, |
| "loss/crossentropy": 1.3125131130218506, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.330078125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.057526711374521255, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.0021839999340679266, |
| "grad_norm": 11.625, |
| "grad_norm_var": 11.065306599934896, |
| "learning_rate": 5e-05, |
| "loss": 0.5553, |
| "loss/crossentropy": 2.620126485824585, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.43359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.12169644981622696, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.0021922414432530885, |
| "grad_norm": 2.09375, |
| "grad_norm_var": 11.756932576497396, |
| "learning_rate": 5e-05, |
| "loss": 0.2092, |
| "loss/crossentropy": 1.3833715915679932, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.173828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.035347893834114075, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.0022004829524382504, |
| "grad_norm": 2.5, |
| "grad_norm_var": 11.445540364583334, |
| "learning_rate": 5e-05, |
| "loss": 0.1932, |
| "loss/crossentropy": 0.3456151485443115, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.177734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.015461962670087814, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.0022087244616234127, |
| "grad_norm": 2.546875, |
| "grad_norm_var": 11.932225545247396, |
| "learning_rate": 5e-05, |
| "loss": 0.1921, |
| "loss/crossentropy": 2.6017909049987793, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.15234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03972596302628517, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.0022169659708085746, |
| "grad_norm": 14.4375, |
| "grad_norm_var": 17.290453084309895, |
| "learning_rate": 5e-05, |
| "loss": 0.368, |
| "loss/crossentropy": 2.35398006439209, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0789838507771492, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.0022252074799937365, |
| "grad_norm": 3.53125, |
| "grad_norm_var": 17.599608357747396, |
| "learning_rate": 5e-05, |
| "loss": 0.2167, |
| "loss/crossentropy": 2.4693641662597656, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.166015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05065637826919556, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.0022334489891788984, |
| "grad_norm": 2.515625, |
| "grad_norm_var": 17.895113118489583, |
| "learning_rate": 5e-05, |
| "loss": 0.1815, |
| "loss/crossentropy": 1.3819841146469116, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.154296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.027211952954530716, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.0022416904983640603, |
| "grad_norm": 7.0, |
| "grad_norm_var": 17.078511555989582, |
| "learning_rate": 5e-05, |
| "loss": 0.1861, |
| "loss/crossentropy": 1.4722107648849487, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1572265625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.028832225129008293, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.0022499320075492226, |
| "grad_norm": 4.21875, |
| "grad_norm_var": 12.190022786458334, |
| "learning_rate": 5e-05, |
| "loss": 0.3099, |
| "loss/crossentropy": 1.6392327547073364, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.251953125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05795075744390488, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.0022581735167343845, |
| "grad_norm": 5.21875, |
| "grad_norm_var": 11.989110310872396, |
| "learning_rate": 5e-05, |
| "loss": 0.4204, |
| "loss/crossentropy": 2.4640941619873047, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.34765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0727241188287735, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.0022664150259195464, |
| "grad_norm": 3.21875, |
| "grad_norm_var": 12.130060831705729, |
| "learning_rate": 5e-05, |
| "loss": 0.2795, |
| "loss/crossentropy": 1.452579140663147, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2314453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.048067688941955566, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.0022746565351047083, |
| "grad_norm": 6.9375, |
| "grad_norm_var": 12.023729451497395, |
| "learning_rate": 5e-05, |
| "loss": 0.2885, |
| "loss/crossentropy": 1.5026805400848389, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.23828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05024395138025284, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.00228289804428987, |
| "grad_norm": 2.875, |
| "grad_norm_var": 12.298021443684895, |
| "learning_rate": 5e-05, |
| "loss": 0.1911, |
| "loss/crossentropy": 1.6457816362380981, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1552734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03582204133272171, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.0022911395534750325, |
| "grad_norm": 2.453125, |
| "grad_norm_var": 12.419710286458333, |
| "learning_rate": 5e-05, |
| "loss": 0.226, |
| "loss/crossentropy": 2.44157338142395, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1767578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0492391511797905, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.0022993810626601944, |
| "grad_norm": 3.09375, |
| "grad_norm_var": 12.55113016764323, |
| "learning_rate": 5e-05, |
| "loss": 0.285, |
| "loss/crossentropy": 2.398951292037964, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.21484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07012955844402313, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.0023076225718453563, |
| "grad_norm": 4.78125, |
| "grad_norm_var": 12.547500610351562, |
| "learning_rate": 5e-05, |
| "loss": 0.3511, |
| "loss/crossentropy": 2.1601598262786865, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.267578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.08354485034942627, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.002315864081030518, |
| "grad_norm": 1.8046875, |
| "grad_norm_var": 9.82229995727539, |
| "learning_rate": 5e-05, |
| "loss": 0.2235, |
| "loss/crossentropy": 1.5090404748916626, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.18359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03991977125406265, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.0023241055902156805, |
| "grad_norm": 3.375, |
| "grad_norm_var": 9.5434445699056, |
| "learning_rate": 5e-05, |
| "loss": 0.3108, |
| "loss/crossentropy": 2.371715545654297, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.251953125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05880487337708473, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.0023323470994008424, |
| "grad_norm": 4.78125, |
| "grad_norm_var": 9.288734690348308, |
| "learning_rate": 5e-05, |
| "loss": 0.3743, |
| "loss/crossentropy": 1.7449641227722168, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07747267186641693, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.0023405886085860043, |
| "grad_norm": 2.34375, |
| "grad_norm_var": 9.345546213785807, |
| "learning_rate": 5e-05, |
| "loss": 0.2385, |
| "loss/crossentropy": 2.332099199295044, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.181640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05684517323970795, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.002348830117771166, |
| "grad_norm": 8.875, |
| "grad_norm_var": 3.936232248942057, |
| "learning_rate": 5e-05, |
| "loss": 0.2679, |
| "loss/crossentropy": 2.6605751514434814, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2197265625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04814404994249344, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.002357071626956328, |
| "grad_norm": 2.515625, |
| "grad_norm_var": 4.089766184488933, |
| "learning_rate": 5e-05, |
| "loss": 0.2346, |
| "loss/crossentropy": 2.4595489501953125, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.189453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0451187826693058, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.0023653131361414904, |
| "grad_norm": 6.3125, |
| "grad_norm_var": 4.175789133707682, |
| "learning_rate": 5e-05, |
| "loss": 0.4335, |
| "loss/crossentropy": 3.0684797763824463, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.3125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.12099675089120865, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.0023735546453266523, |
| "grad_norm": 2.046875, |
| "grad_norm_var": 3.967474110921224, |
| "learning_rate": 5e-05, |
| "loss": 0.1653, |
| "loss/crossentropy": 2.7369492053985596, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.130859375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03439934179186821, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.002381796154511814, |
| "grad_norm": 37.0, |
| "grad_norm_var": 71.8541135152181, |
| "learning_rate": 5e-05, |
| "loss": 0.2523, |
| "loss/crossentropy": 1.4466071128845215, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.21484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.037433233112096786, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.002390037663696976, |
| "grad_norm": 2.71875, |
| "grad_norm_var": 72.5391721089681, |
| "learning_rate": 5e-05, |
| "loss": 0.183, |
| "loss/crossentropy": 0.8366924524307251, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.154296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.028669871389865875, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.002398279172882138, |
| "grad_norm": 3.296875, |
| "grad_norm_var": 72.5111467997233, |
| "learning_rate": 5e-05, |
| "loss": 0.2282, |
| "loss/crossentropy": 2.2352423667907715, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1865234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04168039560317993, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.0024065206820673003, |
| "grad_norm": 7.25, |
| "grad_norm_var": 72.55836766560873, |
| "learning_rate": 5e-05, |
| "loss": 0.5589, |
| "loss/crossentropy": 3.05739426612854, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.45703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.10190241038799286, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.002414762191252462, |
| "grad_norm": 8.4375, |
| "grad_norm_var": 72.19658788045247, |
| "learning_rate": 5e-05, |
| "loss": 0.337, |
| "loss/crossentropy": 1.8930912017822266, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.29296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04408019781112671, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.002423003700437624, |
| "grad_norm": 10.9375, |
| "grad_norm_var": 72.32363255818684, |
| "learning_rate": 5e-05, |
| "loss": 0.2491, |
| "loss/crossentropy": 1.5801359415054321, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.208984375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04009478539228439, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.002431245209622786, |
| "grad_norm": 5.09375, |
| "grad_norm_var": 71.57246068318685, |
| "learning_rate": 5e-05, |
| "loss": 0.1903, |
| "loss/crossentropy": 0.9831718802452087, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.158203125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.032129500061273575, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.0024394867188079483, |
| "grad_norm": 2.765625, |
| "grad_norm_var": 72.41545384724935, |
| "learning_rate": 5e-05, |
| "loss": 0.3458, |
| "loss/crossentropy": 2.587148666381836, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2470703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.09877443313598633, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.00244772822799311, |
| "grad_norm": 2.8125, |
| "grad_norm_var": 71.80135091145833, |
| "learning_rate": 5e-05, |
| "loss": 0.2355, |
| "loss/crossentropy": 2.281587839126587, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1865234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04900825023651123, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.002455969737178272, |
| "grad_norm": 2.171875, |
| "grad_norm_var": 72.45891825358073, |
| "learning_rate": 5e-05, |
| "loss": 0.1823, |
| "loss/crossentropy": 1.4672614336013794, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1552734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02706265263259411, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.002464211246363434, |
| "grad_norm": 2.953125, |
| "grad_norm_var": 73.16838785807292, |
| "learning_rate": 5e-05, |
| "loss": 0.2346, |
| "loss/crossentropy": 2.4047231674194336, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1865234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.048083603382110596, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.002472452755548596, |
| "grad_norm": 11.1875, |
| "grad_norm_var": 72.89547526041666, |
| "learning_rate": 5e-05, |
| "loss": 0.3081, |
| "loss/crossentropy": 0.815432071685791, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.279296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.028755802661180496, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.0024806942647337582, |
| "grad_norm": 5.34375, |
| "grad_norm_var": 72.92076416015625, |
| "learning_rate": 5e-05, |
| "loss": 0.3042, |
| "loss/crossentropy": 1.8723258972167969, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.248046875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05616258084774017, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.00248893577391892, |
| "grad_norm": 18.0, |
| "grad_norm_var": 78.5388905843099, |
| "learning_rate": 5e-05, |
| "loss": 0.3228, |
| "loss/crossentropy": 1.394120693206787, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03374548256397247, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.002497177283104082, |
| "grad_norm": 3.984375, |
| "grad_norm_var": 79.40784505208333, |
| "learning_rate": 5e-05, |
| "loss": 0.1944, |
| "loss/crossentropy": 1.3347328901290894, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.16796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02643435075879097, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.002505418792289244, |
| "grad_norm": 3.3125, |
| "grad_norm_var": 78.5244618733724, |
| "learning_rate": 5e-05, |
| "loss": 0.2445, |
| "loss/crossentropy": 1.3129806518554688, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.201171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04336348548531532, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.002513660301474406, |
| "grad_norm": 2.5, |
| "grad_norm_var": 19.303954060872396, |
| "learning_rate": 5e-05, |
| "loss": 0.2061, |
| "loss/crossentropy": 1.494554042816162, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.181640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02449900656938553, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.002521901810659568, |
| "grad_norm": 2.0625, |
| "grad_norm_var": 19.600291951497397, |
| "learning_rate": 5e-05, |
| "loss": 0.2326, |
| "loss/crossentropy": 2.698347806930542, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.19140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04119253158569336, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.00253014331984473, |
| "grad_norm": 3.890625, |
| "grad_norm_var": 19.427578735351563, |
| "learning_rate": 5e-05, |
| "loss": 0.4184, |
| "loss/crossentropy": 1.5166016817092896, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.3515625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06688607484102249, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.002538384829029892, |
| "grad_norm": 7.28125, |
| "grad_norm_var": 19.43370666503906, |
| "learning_rate": 5e-05, |
| "loss": 0.2909, |
| "loss/crossentropy": 2.696192979812622, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.22265625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.068264901638031, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.002546626338215054, |
| "grad_norm": 4.875, |
| "grad_norm_var": 18.97215881347656, |
| "learning_rate": 5e-05, |
| "loss": 0.2964, |
| "loss/crossentropy": 2.594907522201538, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.23828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05810549482703209, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.002554867847400216, |
| "grad_norm": 2.59375, |
| "grad_norm_var": 17.355557250976563, |
| "learning_rate": 5e-05, |
| "loss": 0.2423, |
| "loss/crossentropy": 2.760004758834839, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1884765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05380372703075409, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.002563109356585378, |
| "grad_norm": 2.84375, |
| "grad_norm_var": 17.659365844726562, |
| "learning_rate": 5e-05, |
| "loss": 0.2252, |
| "loss/crossentropy": 1.6147583723068237, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.189453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.035766348242759705, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.00257135086577054, |
| "grad_norm": 1.8671875, |
| "grad_norm_var": 17.966829172770183, |
| "learning_rate": 5e-05, |
| "loss": 0.1994, |
| "loss/crossentropy": 2.396746873855591, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1591796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04022689908742905, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.002579592374955702, |
| "grad_norm": 2.71875, |
| "grad_norm_var": 17.992909495035807, |
| "learning_rate": 5e-05, |
| "loss": 0.2099, |
| "loss/crossentropy": 1.5479542016983032, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.177734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03212570399045944, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.0025878338841408637, |
| "grad_norm": 1.34375, |
| "grad_norm_var": 18.331384023030598, |
| "learning_rate": 5e-05, |
| "loss": 0.1331, |
| "loss/crossentropy": 0.4866638779640198, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.01593683287501335, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.002596075393326026, |
| "grad_norm": 2.3125, |
| "grad_norm_var": 18.5145627339681, |
| "learning_rate": 5e-05, |
| "loss": 0.2208, |
| "loss/crossentropy": 2.5530097484588623, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.18359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.037243057042360306, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.002604316902511188, |
| "grad_norm": 9.3125, |
| "grad_norm_var": 17.1267453511556, |
| "learning_rate": 5e-05, |
| "loss": 0.2906, |
| "loss/crossentropy": 1.721799373626709, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.251953125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03867912292480469, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.00261255841169635, |
| "grad_norm": 4.21875, |
| "grad_norm_var": 17.100304921468098, |
| "learning_rate": 5e-05, |
| "loss": 0.2352, |
| "loss/crossentropy": 2.6998496055603027, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.189453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04570027440786362, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.0026207999208815117, |
| "grad_norm": 2.9375, |
| "grad_norm_var": 4.307966868082683, |
| "learning_rate": 5e-05, |
| "loss": 0.268, |
| "loss/crossentropy": 2.9610462188720703, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.203125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06485921144485474, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.0026290414300666736, |
| "grad_norm": 14.9375, |
| "grad_norm_var": 12.325996653238933, |
| "learning_rate": 5e-05, |
| "loss": 0.4275, |
| "loss/crossentropy": 0.4579806327819824, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.369140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.058338165283203125, |
| "step": 319 |
| }, |
| { |
| "epoch": 0.002637282939251836, |
| "grad_norm": 2.6875, |
| "grad_norm_var": 12.433784739176433, |
| "learning_rate": 5e-05, |
| "loss": 0.2148, |
| "loss/crossentropy": 0.9076347947120667, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.181640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.033187899738550186, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.002645524448436998, |
| "grad_norm": 7.5, |
| "grad_norm_var": 12.813667551676433, |
| "learning_rate": 5e-05, |
| "loss": 0.2431, |
| "loss/crossentropy": 1.4940351247787476, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.197265625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04587508738040924, |
| "step": 321 |
| }, |
| { |
| "epoch": 0.0026537659576221597, |
| "grad_norm": 3.09375, |
| "grad_norm_var": 12.533095041910807, |
| "learning_rate": 5e-05, |
| "loss": 0.2585, |
| "loss/crossentropy": 2.5319809913635254, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.193359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06514355540275574, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.0026620074668073216, |
| "grad_norm": 2.34375, |
| "grad_norm_var": 12.839448801676433, |
| "learning_rate": 5e-05, |
| "loss": 0.1839, |
| "loss/crossentropy": 1.5948070287704468, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.150390625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03354639932513237, |
| "step": 323 |
| }, |
| { |
| "epoch": 0.002670248975992484, |
| "grad_norm": 2.109375, |
| "grad_norm_var": 12.630688222249349, |
| "learning_rate": 5e-05, |
| "loss": 0.2014, |
| "loss/crossentropy": 0.86527419090271, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.173828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.027525369077920914, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.002678490485177646, |
| "grad_norm": 5.4375, |
| "grad_norm_var": 12.698766835530598, |
| "learning_rate": 5e-05, |
| "loss": 0.3587, |
| "loss/crossentropy": 1.7473679780960083, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.27734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.081350177526474, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.0026867319943628077, |
| "grad_norm": 3.8125, |
| "grad_norm_var": 12.519842274983723, |
| "learning_rate": 5e-05, |
| "loss": 0.2829, |
| "loss/crossentropy": 2.330885410308838, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.232421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05051898583769798, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.0026949735035479696, |
| "grad_norm": 3.078125, |
| "grad_norm_var": 12.476446278889973, |
| "learning_rate": 5e-05, |
| "loss": 0.2309, |
| "loss/crossentropy": 1.6335246562957764, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.19140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0394761748611927, |
| "step": 327 |
| }, |
| { |
| "epoch": 0.0027032150127331315, |
| "grad_norm": 3.53125, |
| "grad_norm_var": 12.097102864583333, |
| "learning_rate": 5e-05, |
| "loss": 0.1913, |
| "loss/crossentropy": 1.0116859674453735, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.169921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.021392133086919785, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.002711456521918294, |
| "grad_norm": 108.0, |
| "grad_norm_var": 680.3999959309896, |
| "learning_rate": 5e-05, |
| "loss": 0.7489, |
| "loss/crossentropy": 1.9841949939727783, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.609375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.13949471712112427, |
| "step": 329 |
| }, |
| { |
| "epoch": 0.0027196980311034557, |
| "grad_norm": 1.8515625, |
| "grad_norm_var": 679.7595273335775, |
| "learning_rate": 5e-05, |
| "loss": 0.1969, |
| "loss/crossentropy": 2.633579730987549, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.154296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.042630117386579514, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.0027279395402886176, |
| "grad_norm": 3.640625, |
| "grad_norm_var": 678.318477121989, |
| "learning_rate": 5e-05, |
| "loss": 0.3044, |
| "loss/crossentropy": 1.2868930101394653, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.25, |
| "loss/idx": 0.0, |
| "loss/logits": 0.054360825568437576, |
| "step": 331 |
| }, |
| { |
| "epoch": 0.0027361810494737795, |
| "grad_norm": 2.0, |
| "grad_norm_var": 683.4576983133952, |
| "learning_rate": 5e-05, |
| "loss": 0.1712, |
| "loss/crossentropy": 1.3776507377624512, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.14453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02663344331085682, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.0027444225586589414, |
| "grad_norm": 1.875, |
| "grad_norm_var": 685.8260149637858, |
| "learning_rate": 5e-05, |
| "loss": 0.2246, |
| "loss/crossentropy": 2.4259564876556396, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.177734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04682992398738861, |
| "step": 333 |
| }, |
| { |
| "epoch": 0.0027526640678441038, |
| "grad_norm": 2.484375, |
| "grad_norm_var": 686.2989051818847, |
| "learning_rate": 5e-05, |
| "loss": 0.1464, |
| "loss/crossentropy": 0.36722007393836975, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.134765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.011639876291155815, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.0027609055770292656, |
| "grad_norm": 2.484375, |
| "grad_norm_var": 688.6630531311035, |
| "learning_rate": 5e-05, |
| "loss": 0.182, |
| "loss/crossentropy": 1.3434722423553467, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.150390625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03163960948586464, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.0027691470862144275, |
| "grad_norm": 9.4375, |
| "grad_norm_var": 685.1584144592285, |
| "learning_rate": 5e-05, |
| "loss": 0.5014, |
| "loss/crossentropy": 1.7821474075317383, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.380859375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.12050823867321014, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.0027773885953995894, |
| "grad_norm": 2.765625, |
| "grad_norm_var": 688.2431556701661, |
| "learning_rate": 5e-05, |
| "loss": 0.1513, |
| "loss/crossentropy": 2.026543378829956, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1259765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.025330830365419388, |
| "step": 337 |
| }, |
| { |
| "epoch": 0.0027856301045847518, |
| "grad_norm": 2.25, |
| "grad_norm_var": 689.0501564025878, |
| "learning_rate": 5e-05, |
| "loss": 0.1708, |
| "loss/crossentropy": 1.9071934223175049, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.13671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03409082442522049, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.0027938716137699137, |
| "grad_norm": 1.75, |
| "grad_norm_var": 689.6639686584473, |
| "learning_rate": 5e-05, |
| "loss": 0.1443, |
| "loss/crossentropy": 0.5046422481536865, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1298828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.014421624131500721, |
| "step": 339 |
| }, |
| { |
| "epoch": 0.0028021131229550755, |
| "grad_norm": 2.59375, |
| "grad_norm_var": 689.183125559489, |
| "learning_rate": 5e-05, |
| "loss": 0.2108, |
| "loss/crossentropy": 1.3842849731445312, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.177734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03306809440255165, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.0028103546321402374, |
| "grad_norm": 3.984375, |
| "grad_norm_var": 690.1626604715983, |
| "learning_rate": 5e-05, |
| "loss": 0.3321, |
| "loss/crossentropy": 2.6099562644958496, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.248046875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.08409686386585236, |
| "step": 341 |
| }, |
| { |
| "epoch": 0.0028185961413253993, |
| "grad_norm": 1.9296875, |
| "grad_norm_var": 691.8675496419271, |
| "learning_rate": 5e-05, |
| "loss": 0.1953, |
| "loss/crossentropy": 1.5371732711791992, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1572265625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03804505988955498, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.0028268376505105617, |
| "grad_norm": 2.828125, |
| "grad_norm_var": 692.0889689127604, |
| "learning_rate": 5e-05, |
| "loss": 0.2334, |
| "loss/crossentropy": 2.7439146041870117, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.177734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.055648088455200195, |
| "step": 343 |
| }, |
| { |
| "epoch": 0.0028350791596957236, |
| "grad_norm": 2.171875, |
| "grad_norm_var": 693.3022288004557, |
| "learning_rate": 5e-05, |
| "loss": 0.1965, |
| "loss/crossentropy": 1.5956981182098389, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.16015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.036296091973781586, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.0028433206688808855, |
| "grad_norm": 2.59375, |
| "grad_norm_var": 3.4128326416015624, |
| "learning_rate": 5e-05, |
| "loss": 0.1868, |
| "loss/crossentropy": 1.582602858543396, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1533203125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.033465512096881866, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.0028515621780660473, |
| "grad_norm": 2.234375, |
| "grad_norm_var": 3.3677101135253906, |
| "learning_rate": 5e-05, |
| "loss": 0.1981, |
| "loss/crossentropy": 1.295432209968567, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.16796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.030169658362865448, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.0028598036872512092, |
| "grad_norm": 2.4375, |
| "grad_norm_var": 3.3456214904785155, |
| "learning_rate": 5e-05, |
| "loss": 0.2163, |
| "loss/crossentropy": 1.2951956987380981, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.181640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.034686122089624405, |
| "step": 347 |
| }, |
| { |
| "epoch": 0.0028680451964363716, |
| "grad_norm": 3.015625, |
| "grad_norm_var": 3.293121083577474, |
| "learning_rate": 5e-05, |
| "loss": 0.2123, |
| "loss/crossentropy": 0.4816242456436157, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.193359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.018970437347888947, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.0028762867056215335, |
| "grad_norm": 3.15625, |
| "grad_norm_var": 3.2159624735514325, |
| "learning_rate": 5e-05, |
| "loss": 0.2762, |
| "loss/crossentropy": 2.688483715057373, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2060546875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07011875510215759, |
| "step": 349 |
| }, |
| { |
| "epoch": 0.0028845282148066954, |
| "grad_norm": 3.5625, |
| "grad_norm_var": 3.2134356180826824, |
| "learning_rate": 5e-05, |
| "loss": 0.2319, |
| "loss/crossentropy": 2.7942285537719727, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.181640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.050256311893463135, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.0028927697239918572, |
| "grad_norm": 2.890625, |
| "grad_norm_var": 3.1917742411295573, |
| "learning_rate": 5e-05, |
| "loss": 0.2219, |
| "loss/crossentropy": 0.3606606721878052, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.201171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.020754382014274597, |
| "step": 351 |
| }, |
| { |
| "epoch": 0.0029010112331770196, |
| "grad_norm": 2.640625, |
| "grad_norm_var": 0.33584772745768227, |
| "learning_rate": 5e-05, |
| "loss": 0.2024, |
| "loss/crossentropy": 1.9477823972702026, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.158203125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04415898397564888, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.0029092527423621815, |
| "grad_norm": 2.90625, |
| "grad_norm_var": 0.33877741495768227, |
| "learning_rate": 5e-05, |
| "loss": 0.2767, |
| "loss/crossentropy": 1.250917673110962, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2314453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04520602151751518, |
| "step": 353 |
| }, |
| { |
| "epoch": 0.0029174942515473434, |
| "grad_norm": 6.28125, |
| "grad_norm_var": 1.1211443583170573, |
| "learning_rate": 5e-05, |
| "loss": 0.3595, |
| "loss/crossentropy": 2.2711970806121826, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.28125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07829815149307251, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.0029257357607325053, |
| "grad_norm": 3.1875, |
| "grad_norm_var": 1.0229713439941406, |
| "learning_rate": 5e-05, |
| "loss": 0.319, |
| "loss/crossentropy": 2.352555990219116, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.251953125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06703340262174606, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.002933977269917667, |
| "grad_norm": 7.3125, |
| "grad_norm_var": 2.142752838134766, |
| "learning_rate": 5e-05, |
| "loss": 0.3474, |
| "loss/crossentropy": 2.538165807723999, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05830331891775131, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.0029422187791028295, |
| "grad_norm": 6.53125, |
| "grad_norm_var": 2.7735023498535156, |
| "learning_rate": 5e-05, |
| "loss": 0.222, |
| "loss/crossentropy": 1.2392635345458984, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.19140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.030571604147553444, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.0029504602882879914, |
| "grad_norm": 2.140625, |
| "grad_norm_var": 2.7326812744140625, |
| "learning_rate": 5e-05, |
| "loss": 0.1149, |
| "loss/crossentropy": 0.3575584590435028, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.107421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.007437488064169884, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.0029587017974731533, |
| "grad_norm": 1.953125, |
| "grad_norm_var": 2.8581207275390623, |
| "learning_rate": 5e-05, |
| "loss": 0.1523, |
| "loss/crossentropy": 1.4491117000579834, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.126953125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.025374623015522957, |
| "step": 359 |
| }, |
| { |
| "epoch": 0.002966943306658315, |
| "grad_norm": 6.875, |
| "grad_norm_var": 3.4463175455729167, |
| "learning_rate": 5e-05, |
| "loss": 0.3615, |
| "loss/crossentropy": 2.15775203704834, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.294921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06661273539066315, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.002975184815843477, |
| "grad_norm": 9.4375, |
| "grad_norm_var": 5.334586588541667, |
| "learning_rate": 5e-05, |
| "loss": 0.2964, |
| "loss/crossentropy": 1.3742177486419678, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.259765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.036625977605581284, |
| "step": 361 |
| }, |
| { |
| "epoch": 0.0029834263250286394, |
| "grad_norm": 1.75, |
| "grad_norm_var": 5.473623657226563, |
| "learning_rate": 5e-05, |
| "loss": 0.1747, |
| "loss/crossentropy": 1.4426945447921753, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1455078125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02921513468027115, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.0029916678342138013, |
| "grad_norm": 1.6640625, |
| "grad_norm_var": 5.685538482666016, |
| "learning_rate": 5e-05, |
| "loss": 0.1638, |
| "loss/crossentropy": 1.559606909751892, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1357421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.028026653453707695, |
| "step": 363 |
| }, |
| { |
| "epoch": 0.002999909343398963, |
| "grad_norm": 2.109375, |
| "grad_norm_var": 5.865667470296224, |
| "learning_rate": 5e-05, |
| "loss": 0.1489, |
| "loss/crossentropy": 0.7043201923370361, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.130859375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.018008584156632423, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.003008150852584125, |
| "grad_norm": 4.1875, |
| "grad_norm_var": 5.812695058186849, |
| "learning_rate": 5e-05, |
| "loss": 0.2217, |
| "loss/crossentropy": 1.9148753881454468, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.177734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0439751073718071, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.0030163923617692874, |
| "grad_norm": 4.125, |
| "grad_norm_var": 5.792956288655599, |
| "learning_rate": 5e-05, |
| "loss": 0.3083, |
| "loss/crossentropy": 2.424750328063965, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.23046875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07787832617759705, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.0030246338709544493, |
| "grad_norm": 2.09375, |
| "grad_norm_var": 5.963744862874349, |
| "learning_rate": 5e-05, |
| "loss": 0.1791, |
| "loss/crossentropy": 1.68095862865448, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.146484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03266463428735733, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.003032875380139611, |
| "grad_norm": 11.875, |
| "grad_norm_var": 9.52763646443685, |
| "learning_rate": 5e-05, |
| "loss": 0.5786, |
| "loss/crossentropy": 1.7796623706817627, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.4765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.10202518105506897, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.003041116889324773, |
| "grad_norm": 2.953125, |
| "grad_norm_var": 9.516863759358724, |
| "learning_rate": 5e-05, |
| "loss": 0.2031, |
| "loss/crossentropy": 1.285621166229248, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.177734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.025359109044075012, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.003049358398509935, |
| "grad_norm": 1.953125, |
| "grad_norm_var": 9.749049631754557, |
| "learning_rate": 5e-05, |
| "loss": 0.176, |
| "loss/crossentropy": 1.5697388648986816, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.14453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03151218220591545, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.0030575999076950973, |
| "grad_norm": 3.0, |
| "grad_norm_var": 9.781166330973308, |
| "learning_rate": 5e-05, |
| "loss": 0.2175, |
| "loss/crossentropy": 1.4509150981903076, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1845703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03292187303304672, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.003065841416880259, |
| "grad_norm": 1.75, |
| "grad_norm_var": 9.534547678629558, |
| "learning_rate": 5e-05, |
| "loss": 0.1677, |
| "loss/crossentropy": 2.5047202110290527, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1318359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03586728125810623, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.003074082926065421, |
| "grad_norm": 5.5, |
| "grad_norm_var": 9.256392161051432, |
| "learning_rate": 5e-05, |
| "loss": 0.2845, |
| "loss/crossentropy": 1.728163719177246, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.23828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04620472714304924, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.003082324435250583, |
| "grad_norm": 3.109375, |
| "grad_norm_var": 9.079986317952473, |
| "learning_rate": 5e-05, |
| "loss": 0.2467, |
| "loss/crossentropy": 2.323497772216797, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.201171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04549071192741394, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.003090565944435745, |
| "grad_norm": 2.734375, |
| "grad_norm_var": 8.902730051676432, |
| "learning_rate": 5e-05, |
| "loss": 0.1624, |
| "loss/crossentropy": 1.6974822282791138, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1357421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.026702899485826492, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.003098807453620907, |
| "grad_norm": 2.046875, |
| "grad_norm_var": 8.553822580973307, |
| "learning_rate": 5e-05, |
| "loss": 0.1575, |
| "loss/crossentropy": 1.636716604232788, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.12890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02862347848713398, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.003107048962806069, |
| "grad_norm": 3.953125, |
| "grad_norm_var": 6.287947336832683, |
| "learning_rate": 5e-05, |
| "loss": 0.366, |
| "loss/crossentropy": 2.5682785511016846, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.28125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.08478732407093048, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.003115290471991231, |
| "grad_norm": 4.4375, |
| "grad_norm_var": 6.139050038655599, |
| "learning_rate": 5e-05, |
| "loss": 0.236, |
| "loss/crossentropy": 1.9300963878631592, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1884765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04753483831882477, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.003123531981176393, |
| "grad_norm": 8.0, |
| "grad_norm_var": 7.0182851155598955, |
| "learning_rate": 5e-05, |
| "loss": 0.3625, |
| "loss/crossentropy": 1.655861496925354, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0734243243932724, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.003131773490361555, |
| "grad_norm": 4.71875, |
| "grad_norm_var": 6.789794921875, |
| "learning_rate": 5e-05, |
| "loss": 0.3235, |
| "loss/crossentropy": 2.3060801029205322, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06571874022483826, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.003140014999546717, |
| "grad_norm": 3.640625, |
| "grad_norm_var": 6.8059234619140625, |
| "learning_rate": 5e-05, |
| "loss": 0.2763, |
| "loss/crossentropy": 2.67515230178833, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2236328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05268421396613121, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.003148256508731879, |
| "grad_norm": 8.6875, |
| "grad_norm_var": 8.111107381184896, |
| "learning_rate": 5e-05, |
| "loss": 0.2852, |
| "loss/crossentropy": 2.6763105392456055, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.220703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06450507789850235, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.003156498017917041, |
| "grad_norm": 3.78125, |
| "grad_norm_var": 7.769432576497396, |
| "learning_rate": 5e-05, |
| "loss": 0.2415, |
| "loss/crossentropy": 2.8496878147125244, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.189453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.052030615508556366, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.0031647395271022028, |
| "grad_norm": 4.21875, |
| "grad_norm_var": 3.9133941650390627, |
| "learning_rate": 5e-05, |
| "loss": 0.1925, |
| "loss/crossentropy": 1.3341301679611206, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1630859375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02942117676138878, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.003172981036287365, |
| "grad_norm": 1.984375, |
| "grad_norm_var": 4.1111806233723955, |
| "learning_rate": 5e-05, |
| "loss": 0.1985, |
| "loss/crossentropy": 2.383344888687134, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1533203125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04519602656364441, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.003181222545472527, |
| "grad_norm": 2.046875, |
| "grad_norm_var": 4.0865224202473955, |
| "learning_rate": 5e-05, |
| "loss": 0.235, |
| "loss/crossentropy": 2.4512088298797607, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1806640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.054316744208335876, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.003189464054657689, |
| "grad_norm": 2.03125, |
| "grad_norm_var": 4.271190388997396, |
| "learning_rate": 5e-05, |
| "loss": 0.187, |
| "loss/crossentropy": 1.7021174430847168, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.15234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.034697070717811584, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.003197705563842851, |
| "grad_norm": 6.78125, |
| "grad_norm_var": 4.400902303059896, |
| "learning_rate": 5e-05, |
| "loss": 0.3263, |
| "loss/crossentropy": 2.0133919715881348, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.25, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07632862031459808, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.0032059470730280127, |
| "grad_norm": 2.484375, |
| "grad_norm_var": 4.458426920572917, |
| "learning_rate": 5e-05, |
| "loss": 0.2092, |
| "loss/crossentropy": 2.6211116313934326, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.162109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04707195237278938, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.003214188582213175, |
| "grad_norm": 1.671875, |
| "grad_norm_var": 4.766141764322916, |
| "learning_rate": 5e-05, |
| "loss": 0.1619, |
| "loss/crossentropy": 1.861954689025879, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1298828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.031997717916965485, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.003222430091398337, |
| "grad_norm": 3.953125, |
| "grad_norm_var": 4.661246744791667, |
| "learning_rate": 5e-05, |
| "loss": 0.2297, |
| "loss/crossentropy": 1.4026638269424438, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.19140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03833915665745735, |
| "step": 391 |
| }, |
| { |
| "epoch": 0.003230671600583499, |
| "grad_norm": 4.71875, |
| "grad_norm_var": 4.401887003580729, |
| "learning_rate": 5e-05, |
| "loss": 0.3195, |
| "loss/crossentropy": 2.687175750732422, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.251953125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06756812334060669, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.0032389131097686607, |
| "grad_norm": 1.8828125, |
| "grad_norm_var": 4.736358388264974, |
| "learning_rate": 5e-05, |
| "loss": 0.1761, |
| "loss/crossentropy": 1.6555976867675781, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1435546875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.032518401741981506, |
| "step": 393 |
| }, |
| { |
| "epoch": 0.003247154618953823, |
| "grad_norm": 3.640625, |
| "grad_norm_var": 4.736462148030599, |
| "learning_rate": 5e-05, |
| "loss": 0.3446, |
| "loss/crossentropy": 2.2599704265594482, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.279296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06530951708555222, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.003255396128138985, |
| "grad_norm": 1.8828125, |
| "grad_norm_var": 3.825056966145833, |
| "learning_rate": 5e-05, |
| "loss": 0.204, |
| "loss/crossentropy": 2.564816951751709, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.15625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04773015156388283, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.003263637637324147, |
| "grad_norm": 2.390625, |
| "grad_norm_var": 3.8267242431640627, |
| "learning_rate": 5e-05, |
| "loss": 0.1961, |
| "loss/crossentropy": 2.249958038330078, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.15625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.039828941226005554, |
| "step": 396 |
| }, |
| { |
| "epoch": 0.0032718791465093087, |
| "grad_norm": 1.3671875, |
| "grad_norm_var": 4.103281402587891, |
| "learning_rate": 5e-05, |
| "loss": 0.1342, |
| "loss/crossentropy": 1.058944821357727, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.11474609375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.019409142434597015, |
| "step": 397 |
| }, |
| { |
| "epoch": 0.0032801206556944706, |
| "grad_norm": 13.375, |
| "grad_norm_var": 8.815500640869141, |
| "learning_rate": 5e-05, |
| "loss": 0.3495, |
| "loss/crossentropy": 2.6670608520507812, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.28125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06825672090053558, |
| "step": 398 |
| }, |
| { |
| "epoch": 0.003288362164879633, |
| "grad_norm": 2.140625, |
| "grad_norm_var": 8.952433013916016, |
| "learning_rate": 5e-05, |
| "loss": 0.2057, |
| "loss/crossentropy": 2.589582920074463, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1669921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.038732096552848816, |
| "step": 399 |
| }, |
| { |
| "epoch": 0.003296603674064795, |
| "grad_norm": 2.265625, |
| "grad_norm_var": 9.012959543863932, |
| "learning_rate": 5e-05, |
| "loss": 0.134, |
| "loss/crossentropy": 0.9774411916732788, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.01683815009891987, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.0033048451832499567, |
| "grad_norm": 5.3125, |
| "grad_norm_var": 9.071028391520182, |
| "learning_rate": 5e-05, |
| "loss": 0.3151, |
| "loss/crossentropy": 2.308528184890747, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.057268112897872925, |
| "step": 401 |
| }, |
| { |
| "epoch": 0.0033130866924351186, |
| "grad_norm": 1.75, |
| "grad_norm_var": 9.138868967692057, |
| "learning_rate": 5e-05, |
| "loss": 0.1162, |
| "loss/crossentropy": 0.1983821541070938, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1083984375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.007823487743735313, |
| "step": 402 |
| }, |
| { |
| "epoch": 0.0033213282016202805, |
| "grad_norm": 3.578125, |
| "grad_norm_var": 8.964241282145183, |
| "learning_rate": 5e-05, |
| "loss": 0.1463, |
| "loss/crossentropy": 0.282149076461792, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1337890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.012518523260951042, |
| "step": 403 |
| }, |
| { |
| "epoch": 0.003329569710805443, |
| "grad_norm": 5.4375, |
| "grad_norm_var": 8.52498550415039, |
| "learning_rate": 5e-05, |
| "loss": 0.3084, |
| "loss/crossentropy": 2.364654302597046, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2333984375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07504182308912277, |
| "step": 404 |
| }, |
| { |
| "epoch": 0.0033378112199906047, |
| "grad_norm": 6.40625, |
| "grad_norm_var": 8.894703928629557, |
| "learning_rate": 5e-05, |
| "loss": 0.2316, |
| "loss/crossentropy": 1.4973769187927246, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1953125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.036246173083782196, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.0033460527291757666, |
| "grad_norm": 14.9375, |
| "grad_norm_var": 16.021522776285806, |
| "learning_rate": 5e-05, |
| "loss": 0.3451, |
| "loss/crossentropy": 2.6580722332000732, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.28515625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05991474539041519, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.0033542942383609285, |
| "grad_norm": 1.7109375, |
| "grad_norm_var": 16.55601298014323, |
| "learning_rate": 5e-05, |
| "loss": 0.1524, |
| "loss/crossentropy": 2.4848172664642334, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1201171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0323210209608078, |
| "step": 407 |
| }, |
| { |
| "epoch": 0.003362535747546091, |
| "grad_norm": 25.625, |
| "grad_norm_var": 44.34390360514323, |
| "learning_rate": 5e-05, |
| "loss": 0.3535, |
| "loss/crossentropy": 2.135502338409424, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.28515625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06830734014511108, |
| "step": 408 |
| }, |
| { |
| "epoch": 0.0033707772567312527, |
| "grad_norm": 2.15625, |
| "grad_norm_var": 44.203704579671225, |
| "learning_rate": 5e-05, |
| "loss": 0.1807, |
| "loss/crossentropy": 1.5167546272277832, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.032285355031490326, |
| "step": 409 |
| }, |
| { |
| "epoch": 0.0033790187659164146, |
| "grad_norm": 2.25, |
| "grad_norm_var": 44.73858820597331, |
| "learning_rate": 5e-05, |
| "loss": 0.1845, |
| "loss/crossentropy": 2.5554723739624023, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.146484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03800758719444275, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.0033872602751015765, |
| "grad_norm": 5.125, |
| "grad_norm_var": 43.70799051920573, |
| "learning_rate": 5e-05, |
| "loss": 0.1819, |
| "loss/crossentropy": 1.3965002298355103, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.162109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.019777944311499596, |
| "step": 411 |
| }, |
| { |
| "epoch": 0.0033955017842867384, |
| "grad_norm": 2.640625, |
| "grad_norm_var": 43.591942342122394, |
| "learning_rate": 5e-05, |
| "loss": 0.2145, |
| "loss/crossentropy": 1.4451302289962769, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.169921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04454413428902626, |
| "step": 412 |
| }, |
| { |
| "epoch": 0.0034037432934719007, |
| "grad_norm": 2.125, |
| "grad_norm_var": 43.159234364827476, |
| "learning_rate": 5e-05, |
| "loss": 0.1472, |
| "loss/crossentropy": 0.820690929889679, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1259765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02119414508342743, |
| "step": 413 |
| }, |
| { |
| "epoch": 0.0034119848026570626, |
| "grad_norm": 2.546875, |
| "grad_norm_var": 39.915026601155596, |
| "learning_rate": 5e-05, |
| "loss": 0.189, |
| "loss/crossentropy": 2.5425262451171875, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0405312180519104, |
| "step": 414 |
| }, |
| { |
| "epoch": 0.0034202263118422245, |
| "grad_norm": 1.96875, |
| "grad_norm_var": 39.991005198160806, |
| "learning_rate": 5e-05, |
| "loss": 0.161, |
| "loss/crossentropy": 1.5188648700714111, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1337890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.027209658175706863, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.0034284678210273864, |
| "grad_norm": 3.40625, |
| "grad_norm_var": 39.60099461873372, |
| "learning_rate": 5e-05, |
| "loss": 0.2137, |
| "loss/crossentropy": 1.8609509468078613, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.173828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03991951048374176, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.0034367093302125483, |
| "grad_norm": 4.03125, |
| "grad_norm_var": 39.72469863891602, |
| "learning_rate": 5e-05, |
| "loss": 0.3144, |
| "loss/crossentropy": 2.865185260772705, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.244140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07025311887264252, |
| "step": 417 |
| }, |
| { |
| "epoch": 0.0034449508393977106, |
| "grad_norm": 5.0, |
| "grad_norm_var": 38.82227350870768, |
| "learning_rate": 5e-05, |
| "loss": 0.2031, |
| "loss/crossentropy": 1.3800697326660156, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.17578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.027345672249794006, |
| "step": 418 |
| }, |
| { |
| "epoch": 0.0034531923485828725, |
| "grad_norm": 2.921875, |
| "grad_norm_var": 39.02252375284831, |
| "learning_rate": 5e-05, |
| "loss": 0.1523, |
| "loss/crossentropy": 0.4079228937625885, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.13671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.015567103400826454, |
| "step": 419 |
| }, |
| { |
| "epoch": 0.0034614338577680344, |
| "grad_norm": 4.96875, |
| "grad_norm_var": 39.04129206339518, |
| "learning_rate": 5e-05, |
| "loss": 0.1958, |
| "loss/crossentropy": 2.5757017135620117, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.031719379127025604, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.0034696753669531963, |
| "grad_norm": 2.15625, |
| "grad_norm_var": 39.65029271443685, |
| "learning_rate": 5e-05, |
| "loss": 0.2189, |
| "loss/crossentropy": 1.558744192123413, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.177734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04112962260842323, |
| "step": 421 |
| }, |
| { |
| "epoch": 0.0034779168761383586, |
| "grad_norm": 2.4375, |
| "grad_norm_var": 33.22532526652018, |
| "learning_rate": 5e-05, |
| "loss": 0.2209, |
| "loss/crossentropy": 2.4556710720062256, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04905615746974945, |
| "step": 422 |
| }, |
| { |
| "epoch": 0.0034861583853235205, |
| "grad_norm": 4.03125, |
| "grad_norm_var": 32.71692606608073, |
| "learning_rate": 5e-05, |
| "loss": 0.2253, |
| "loss/crossentropy": 2.399423360824585, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.18359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04174065962433815, |
| "step": 423 |
| }, |
| { |
| "epoch": 0.0034943998945086824, |
| "grad_norm": 6.53125, |
| "grad_norm_var": 1.9431711832682292, |
| "learning_rate": 5e-05, |
| "loss": 0.3418, |
| "loss/crossentropy": 1.4518251419067383, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.30859375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03323998302221298, |
| "step": 424 |
| }, |
| { |
| "epoch": 0.0035026414036938443, |
| "grad_norm": 2.703125, |
| "grad_norm_var": 1.87164306640625, |
| "learning_rate": 5e-05, |
| "loss": 0.2027, |
| "loss/crossentropy": 2.5503337383270264, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1611328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04152850806713104, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.0035108829128790062, |
| "grad_norm": 3.140625, |
| "grad_norm_var": 1.7813629150390624, |
| "learning_rate": 5e-05, |
| "loss": 0.21, |
| "loss/crossentropy": 2.405348539352417, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.169921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0400310643017292, |
| "step": 426 |
| }, |
| { |
| "epoch": 0.0035191244220641685, |
| "grad_norm": 2.078125, |
| "grad_norm_var": 1.694677734375, |
| "learning_rate": 5e-05, |
| "loss": 0.2251, |
| "loss/crossentropy": 2.5670955181121826, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.173828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05131090059876442, |
| "step": 427 |
| }, |
| { |
| "epoch": 0.0035273659312493304, |
| "grad_norm": 5.46875, |
| "grad_norm_var": 1.9485829671223958, |
| "learning_rate": 5e-05, |
| "loss": 0.2417, |
| "loss/crossentropy": 1.348537564277649, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.208984375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.032746195793151855, |
| "step": 428 |
| }, |
| { |
| "epoch": 0.0035356074404344923, |
| "grad_norm": 2.796875, |
| "grad_norm_var": 1.8563313802083334, |
| "learning_rate": 5e-05, |
| "loss": 0.2383, |
| "loss/crossentropy": 2.7552454471588135, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.189453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04886690154671669, |
| "step": 429 |
| }, |
| { |
| "epoch": 0.0035438489496196542, |
| "grad_norm": 2.703125, |
| "grad_norm_var": 1.83775634765625, |
| "learning_rate": 5e-05, |
| "loss": 0.3084, |
| "loss/crossentropy": 2.6097259521484375, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05057002976536751, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.003552090458804816, |
| "grad_norm": 4.3125, |
| "grad_norm_var": 1.695849609375, |
| "learning_rate": 5e-05, |
| "loss": 0.2062, |
| "loss/crossentropy": 1.8095245361328125, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.169921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.036256495863199234, |
| "step": 431 |
| }, |
| { |
| "epoch": 0.0035603319679899785, |
| "grad_norm": 6.3125, |
| "grad_norm_var": 2.12232666015625, |
| "learning_rate": 5e-05, |
| "loss": 0.2564, |
| "loss/crossentropy": 2.088921546936035, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.201171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0551944300532341, |
| "step": 432 |
| }, |
| { |
| "epoch": 0.0035685734771751403, |
| "grad_norm": 42.0, |
| "grad_norm_var": 93.143505859375, |
| "learning_rate": 5e-05, |
| "loss": 0.7318, |
| "loss/crossentropy": 2.4523119926452637, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.5859375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.14583945274353027, |
| "step": 433 |
| }, |
| { |
| "epoch": 0.0035768149863603022, |
| "grad_norm": 3.09375, |
| "grad_norm_var": 93.68137613932292, |
| "learning_rate": 5e-05, |
| "loss": 0.2037, |
| "loss/crossentropy": 2.0388007164001465, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1689453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03475724905729294, |
| "step": 434 |
| }, |
| { |
| "epoch": 0.003585056495545464, |
| "grad_norm": 4.1875, |
| "grad_norm_var": 93.24458719889323, |
| "learning_rate": 5e-05, |
| "loss": 0.2542, |
| "loss/crossentropy": 2.6730620861053467, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1953125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05885430425405502, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.0035932980047306265, |
| "grad_norm": 3.34375, |
| "grad_norm_var": 93.6726308186849, |
| "learning_rate": 5e-05, |
| "loss": 0.1555, |
| "loss/crossentropy": 0.4195747375488281, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1416015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.013927996158599854, |
| "step": 436 |
| }, |
| { |
| "epoch": 0.0036015395139157884, |
| "grad_norm": 3.421875, |
| "grad_norm_var": 93.11043294270833, |
| "learning_rate": 5e-05, |
| "loss": 0.254, |
| "loss/crossentropy": 1.7030478715896606, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.208984375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04505797103047371, |
| "step": 437 |
| }, |
| { |
| "epoch": 0.0036097810231009502, |
| "grad_norm": 1.71875, |
| "grad_norm_var": 93.49947509765624, |
| "learning_rate": 5e-05, |
| "loss": 0.1532, |
| "loss/crossentropy": 0.4963390529155731, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1357421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.017449375241994858, |
| "step": 438 |
| }, |
| { |
| "epoch": 0.003618022532286112, |
| "grad_norm": 3.125, |
| "grad_norm_var": 93.80262044270833, |
| "learning_rate": 5e-05, |
| "loss": 0.2339, |
| "loss/crossentropy": 2.744438648223877, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.181640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05223686248064041, |
| "step": 439 |
| }, |
| { |
| "epoch": 0.003626264041471274, |
| "grad_norm": 1.1875, |
| "grad_norm_var": 95.25058186848959, |
| "learning_rate": 5e-05, |
| "loss": 0.1237, |
| "loss/crossentropy": 0.5037131905555725, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.014304354786872864, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.0036345055506564364, |
| "grad_norm": 4.5, |
| "grad_norm_var": 94.72848205566406, |
| "learning_rate": 5e-05, |
| "loss": 0.1741, |
| "loss/crossentropy": 1.277227759361267, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.15234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.021763307973742485, |
| "step": 441 |
| }, |
| { |
| "epoch": 0.0036427470598415983, |
| "grad_norm": 8.5625, |
| "grad_norm_var": 94.61658528645833, |
| "learning_rate": 5e-05, |
| "loss": 0.3688, |
| "loss/crossentropy": 2.51446533203125, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07975561916828156, |
| "step": 442 |
| }, |
| { |
| "epoch": 0.00365098856902676, |
| "grad_norm": 2.515625, |
| "grad_norm_var": 94.38951822916667, |
| "learning_rate": 5e-05, |
| "loss": 0.2143, |
| "loss/crossentropy": 2.614020347595215, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.169921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.044419120997190475, |
| "step": 443 |
| }, |
| { |
| "epoch": 0.003659230078211922, |
| "grad_norm": 2.203125, |
| "grad_norm_var": 95.37579650878907, |
| "learning_rate": 5e-05, |
| "loss": 0.1516, |
| "loss/crossentropy": 0.43124791979789734, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.134765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.01679244264960289, |
| "step": 444 |
| }, |
| { |
| "epoch": 0.0036674715873970844, |
| "grad_norm": 1.4609375, |
| "grad_norm_var": 96.05772476196289, |
| "learning_rate": 5e-05, |
| "loss": 0.1449, |
| "loss/crossentropy": 1.3757065534591675, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.119140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02579795941710472, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.0036757130965822463, |
| "grad_norm": 2.859375, |
| "grad_norm_var": 95.99232559204101, |
| "learning_rate": 5e-05, |
| "loss": 0.2612, |
| "loss/crossentropy": 1.5693522691726685, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2265625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.034596264362335205, |
| "step": 446 |
| }, |
| { |
| "epoch": 0.003683954605767408, |
| "grad_norm": 2.25, |
| "grad_norm_var": 96.70171279907227, |
| "learning_rate": 5e-05, |
| "loss": 0.1481, |
| "loss/crossentropy": 1.425809621810913, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.12060546875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02751866541802883, |
| "step": 447 |
| }, |
| { |
| "epoch": 0.00369219611495257, |
| "grad_norm": 4.53125, |
| "grad_norm_var": 96.77743911743164, |
| "learning_rate": 5e-05, |
| "loss": 0.2282, |
| "loss/crossentropy": 1.8082743883132935, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04072072356939316, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.003700437624137732, |
| "grad_norm": 4.5, |
| "grad_norm_var": 3.0933570861816406, |
| "learning_rate": 5e-05, |
| "loss": 0.284, |
| "loss/crossentropy": 0.5547680854797363, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.25, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0339670293033123, |
| "step": 449 |
| }, |
| { |
| "epoch": 0.0037086791333228943, |
| "grad_norm": 4.25, |
| "grad_norm_var": 3.1387489318847654, |
| "learning_rate": 5e-05, |
| "loss": 0.2519, |
| "loss/crossentropy": 1.6840208768844604, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.205078125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04682979732751846, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.003716920642508056, |
| "grad_norm": 3.765625, |
| "grad_norm_var": 3.1063392639160154, |
| "learning_rate": 5e-05, |
| "loss": 0.2045, |
| "loss/crossentropy": 2.407160520553589, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.162109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04241305589675903, |
| "step": 451 |
| }, |
| { |
| "epoch": 0.003725162151693218, |
| "grad_norm": 5.6875, |
| "grad_norm_var": 3.4360816955566404, |
| "learning_rate": 5e-05, |
| "loss": 0.3723, |
| "loss/crossentropy": 2.6594908237457275, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.279296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.09303879737854004, |
| "step": 452 |
| }, |
| { |
| "epoch": 0.00373340366087838, |
| "grad_norm": 1.6328125, |
| "grad_norm_var": 3.6628011067708335, |
| "learning_rate": 5e-05, |
| "loss": 0.118, |
| "loss/crossentropy": 0.4520578682422638, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1044921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.01347460225224495, |
| "step": 453 |
| }, |
| { |
| "epoch": 0.003741645170063542, |
| "grad_norm": 2.15625, |
| "grad_norm_var": 3.5754150390625, |
| "learning_rate": 5e-05, |
| "loss": 0.2743, |
| "loss/crossentropy": 2.884896755218506, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2265625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04774241894483566, |
| "step": 454 |
| }, |
| { |
| "epoch": 0.003749886679248704, |
| "grad_norm": 1.8359375, |
| "grad_norm_var": 3.7349952697753905, |
| "learning_rate": 5e-05, |
| "loss": 0.1811, |
| "loss/crossentropy": 2.609929323196411, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.146484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.034633196890354156, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.003758128188433866, |
| "grad_norm": 10.6875, |
| "grad_norm_var": 6.612827301025391, |
| "learning_rate": 5e-05, |
| "loss": 0.7979, |
| "loss/crossentropy": 2.925989866256714, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.5390625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.2587950825691223, |
| "step": 456 |
| }, |
| { |
| "epoch": 0.003766369697619028, |
| "grad_norm": 3.109375, |
| "grad_norm_var": 6.634012603759766, |
| "learning_rate": 5e-05, |
| "loss": 0.2532, |
| "loss/crossentropy": 2.1211068630218506, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.193359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.059833116829395294, |
| "step": 457 |
| }, |
| { |
| "epoch": 0.00377461120680419, |
| "grad_norm": 1.8359375, |
| "grad_norm_var": 5.25826416015625, |
| "learning_rate": 5e-05, |
| "loss": 0.1663, |
| "loss/crossentropy": 2.170849323272705, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1337890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03252778202295303, |
| "step": 458 |
| }, |
| { |
| "epoch": 0.003782852715989352, |
| "grad_norm": 1.1640625, |
| "grad_norm_var": 5.541731516520183, |
| "learning_rate": 5e-05, |
| "loss": 0.1449, |
| "loss/crossentropy": 1.5572426319122314, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.123046875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.021882327273488045, |
| "step": 459 |
| }, |
| { |
| "epoch": 0.003791094225174514, |
| "grad_norm": 1.96875, |
| "grad_norm_var": 5.581648508707683, |
| "learning_rate": 5e-05, |
| "loss": 0.1801, |
| "loss/crossentropy": 2.4034504890441895, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.142578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03750108927488327, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.003799335734359676, |
| "grad_norm": 1.46875, |
| "grad_norm_var": 5.579678344726562, |
| "learning_rate": 5e-05, |
| "loss": 0.1353, |
| "loss/crossentropy": 2.3504481315612793, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1103515625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.024939250200986862, |
| "step": 461 |
| }, |
| { |
| "epoch": 0.003807577243544838, |
| "grad_norm": 4.53125, |
| "grad_norm_var": 5.643570963541666, |
| "learning_rate": 5e-05, |
| "loss": 0.1909, |
| "loss/crossentropy": 0.8316883444786072, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.16796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.022978752851486206, |
| "step": 462 |
| }, |
| { |
| "epoch": 0.0038158187527299998, |
| "grad_norm": 3.46875, |
| "grad_norm_var": 5.539628092447916, |
| "learning_rate": 5e-05, |
| "loss": 0.2104, |
| "loss/crossentropy": 2.7462053298950195, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.046341672539711, |
| "step": 463 |
| }, |
| { |
| "epoch": 0.003824060261915162, |
| "grad_norm": 2.4375, |
| "grad_norm_var": 5.536083984375, |
| "learning_rate": 5e-05, |
| "loss": 0.1901, |
| "loss/crossentropy": 1.570056676864624, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.16015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02994626574218273, |
| "step": 464 |
| }, |
| { |
| "epoch": 0.003832301771100324, |
| "grad_norm": 5.125, |
| "grad_norm_var": 5.651643880208334, |
| "learning_rate": 5e-05, |
| "loss": 0.2886, |
| "loss/crossentropy": 3.014599084854126, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05423382669687271, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.003840543280285486, |
| "grad_norm": 4.65625, |
| "grad_norm_var": 5.705546061197917, |
| "learning_rate": 5e-05, |
| "loss": 0.2568, |
| "loss/crossentropy": 2.5307607650756836, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.212890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.043907828629016876, |
| "step": 466 |
| }, |
| { |
| "epoch": 0.0038487847894706478, |
| "grad_norm": 2.65625, |
| "grad_norm_var": 5.738841756184896, |
| "learning_rate": 5e-05, |
| "loss": 0.1794, |
| "loss/crossentropy": 2.301478624343872, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.14453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.034871190786361694, |
| "step": 467 |
| }, |
| { |
| "epoch": 0.0038570262986558097, |
| "grad_norm": 2.3125, |
| "grad_norm_var": 5.421996053059896, |
| "learning_rate": 5e-05, |
| "loss": 0.1572, |
| "loss/crossentropy": 2.81413197517395, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1220703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03511942923069, |
| "step": 468 |
| }, |
| { |
| "epoch": 0.003865267807840972, |
| "grad_norm": 3.234375, |
| "grad_norm_var": 5.24969253540039, |
| "learning_rate": 5e-05, |
| "loss": 0.2422, |
| "loss/crossentropy": 2.552509069442749, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.193359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.048864759504795074, |
| "step": 469 |
| }, |
| { |
| "epoch": 0.003873509317026134, |
| "grad_norm": 2.65625, |
| "grad_norm_var": 5.189699045817057, |
| "learning_rate": 5e-05, |
| "loss": 0.2269, |
| "loss/crossentropy": 1.746779441833496, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1787109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.048161737620830536, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.0038817508262112958, |
| "grad_norm": 2.40625, |
| "grad_norm_var": 5.097041829427083, |
| "learning_rate": 5e-05, |
| "loss": 0.2224, |
| "loss/crossentropy": 1.6436595916748047, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1806640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04173795133829117, |
| "step": 471 |
| }, |
| { |
| "epoch": 0.0038899923353964577, |
| "grad_norm": 1.8828125, |
| "grad_norm_var": 1.336993153889974, |
| "learning_rate": 5e-05, |
| "loss": 0.1768, |
| "loss/crossentropy": 2.6210246086120605, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1357421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.041082605719566345, |
| "step": 472 |
| }, |
| { |
| "epoch": 0.00389823384458162, |
| "grad_norm": 7.125, |
| "grad_norm_var": 2.5066485087076824, |
| "learning_rate": 5e-05, |
| "loss": 0.2539, |
| "loss/crossentropy": 2.7276291847229004, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.203125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.050726860761642456, |
| "step": 473 |
| }, |
| { |
| "epoch": 0.003906475353766782, |
| "grad_norm": 2.0, |
| "grad_norm_var": 2.4815958658854167, |
| "learning_rate": 5e-05, |
| "loss": 0.1564, |
| "loss/crossentropy": 2.6117374897003174, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1220703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.034293532371520996, |
| "step": 474 |
| }, |
| { |
| "epoch": 0.003914716862951944, |
| "grad_norm": 10.3125, |
| "grad_norm_var": 5.389619700113932, |
| "learning_rate": 5e-05, |
| "loss": 0.3135, |
| "loss/crossentropy": 2.8067679405212402, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2451171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06842821836471558, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.003922958372137106, |
| "grad_norm": 3.1875, |
| "grad_norm_var": 5.210853830973307, |
| "learning_rate": 5e-05, |
| "loss": 0.237, |
| "loss/crossentropy": 2.265045642852783, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.049473538994789124, |
| "step": 476 |
| }, |
| { |
| "epoch": 0.003931199881322268, |
| "grad_norm": 2.5, |
| "grad_norm_var": 4.968281809488932, |
| "learning_rate": 5e-05, |
| "loss": 0.22, |
| "loss/crossentropy": 2.7731900215148926, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.169921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05008155107498169, |
| "step": 477 |
| }, |
| { |
| "epoch": 0.0039394413905074295, |
| "grad_norm": 7.75, |
| "grad_norm_var": 5.937888336181641, |
| "learning_rate": 5e-05, |
| "loss": 0.4192, |
| "loss/crossentropy": 2.859137535095215, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.333984375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.08519326895475388, |
| "step": 478 |
| }, |
| { |
| "epoch": 0.003947682899692591, |
| "grad_norm": 3.734375, |
| "grad_norm_var": 5.924122873942057, |
| "learning_rate": 5e-05, |
| "loss": 0.2565, |
| "loss/crossentropy": 1.434816598892212, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.224609375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03185056895017624, |
| "step": 479 |
| }, |
| { |
| "epoch": 0.003955924408877754, |
| "grad_norm": 1.90625, |
| "grad_norm_var": 6.052335357666015, |
| "learning_rate": 5e-05, |
| "loss": 0.207, |
| "loss/crossentropy": 2.1310391426086426, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.16015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04687424749135971, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.003964165918062916, |
| "grad_norm": 3.25, |
| "grad_norm_var": 5.982144927978515, |
| "learning_rate": 5e-05, |
| "loss": 0.2501, |
| "loss/crossentropy": 2.3727288246154785, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1982421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05187632888555527, |
| "step": 481 |
| }, |
| { |
| "epoch": 0.003972407427248078, |
| "grad_norm": 1.046875, |
| "grad_norm_var": 6.40746841430664, |
| "learning_rate": 5e-05, |
| "loss": 0.1219, |
| "loss/crossentropy": 1.4023224115371704, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.103515625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.018374113366007805, |
| "step": 482 |
| }, |
| { |
| "epoch": 0.00398064893643324, |
| "grad_norm": 4.0625, |
| "grad_norm_var": 6.349881744384765, |
| "learning_rate": 5e-05, |
| "loss": 0.2168, |
| "loss/crossentropy": 1.5084764957427979, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1669921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04980340600013733, |
| "step": 483 |
| }, |
| { |
| "epoch": 0.003988890445618402, |
| "grad_norm": 2.09375, |
| "grad_norm_var": 6.393645985921224, |
| "learning_rate": 5e-05, |
| "loss": 0.1647, |
| "loss/crossentropy": 1.9611366987228394, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1259765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.038717180490493774, |
| "step": 484 |
| }, |
| { |
| "epoch": 0.003997131954803564, |
| "grad_norm": 3.296875, |
| "grad_norm_var": 6.390036773681641, |
| "learning_rate": 5e-05, |
| "loss": 0.2526, |
| "loss/crossentropy": 2.1912283897399902, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.201171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05140523985028267, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.0040053734639887255, |
| "grad_norm": 2.28125, |
| "grad_norm_var": 6.45104751586914, |
| "learning_rate": 5e-05, |
| "loss": 0.2049, |
| "loss/crossentropy": 1.1990885734558105, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.173828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03110179677605629, |
| "step": 486 |
| }, |
| { |
| "epoch": 0.004013614973173887, |
| "grad_norm": 2.109375, |
| "grad_norm_var": 6.506866200764974, |
| "learning_rate": 5e-05, |
| "loss": 0.1932, |
| "loss/crossentropy": 2.469733715057373, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.150390625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04281339794397354, |
| "step": 487 |
| }, |
| { |
| "epoch": 0.004021856482359049, |
| "grad_norm": 318.0, |
| "grad_norm_var": 6177.285184733073, |
| "learning_rate": 5e-05, |
| "loss": 1.5086, |
| "loss/crossentropy": 1.5801646709442139, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 1.390625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.1180073618888855, |
| "step": 488 |
| }, |
| { |
| "epoch": 0.004030097991544211, |
| "grad_norm": 11.75, |
| "grad_norm_var": 6168.57597249349, |
| "learning_rate": 5e-05, |
| "loss": 0.4099, |
| "loss/crossentropy": 2.5404622554779053, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.3359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0739157497882843, |
| "step": 489 |
| }, |
| { |
| "epoch": 0.004038339500729374, |
| "grad_norm": 4.125, |
| "grad_norm_var": 6162.7084269205725, |
| "learning_rate": 5e-05, |
| "loss": 0.2788, |
| "loss/crossentropy": 2.6295320987701416, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06782936304807663, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.004046581009914536, |
| "grad_norm": 9.6875, |
| "grad_norm_var": 6163.8599568684895, |
| "learning_rate": 5e-05, |
| "loss": 0.4341, |
| "loss/crossentropy": 3.0478451251983643, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.310546875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.12352639436721802, |
| "step": 491 |
| }, |
| { |
| "epoch": 0.004054822519099698, |
| "grad_norm": 5.90625, |
| "grad_norm_var": 6156.850325520833, |
| "learning_rate": 5e-05, |
| "loss": 0.1865, |
| "loss/crossentropy": 1.3709689378738403, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.162109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02439779043197632, |
| "step": 492 |
| }, |
| { |
| "epoch": 0.00406306402828486, |
| "grad_norm": 1.625, |
| "grad_norm_var": 6159.402864583333, |
| "learning_rate": 5e-05, |
| "loss": 0.1459, |
| "loss/crossentropy": 1.324977159500122, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.12109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02481193095445633, |
| "step": 493 |
| }, |
| { |
| "epoch": 0.0040713055374700215, |
| "grad_norm": 8.4375, |
| "grad_norm_var": 6157.950699869792, |
| "learning_rate": 5e-05, |
| "loss": 0.2132, |
| "loss/crossentropy": 1.4893817901611328, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.173828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.039393968880176544, |
| "step": 494 |
| }, |
| { |
| "epoch": 0.004079547046655183, |
| "grad_norm": 2.046875, |
| "grad_norm_var": 6162.678776041666, |
| "learning_rate": 5e-05, |
| "loss": 0.132, |
| "loss/crossentropy": 1.3435920476913452, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1142578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.017779778689146042, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.004087788555840345, |
| "grad_norm": 3.65625, |
| "grad_norm_var": 6157.749609375, |
| "learning_rate": 5e-05, |
| "loss": 0.2431, |
| "loss/crossentropy": 2.082836151123047, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.185546875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.057581763714551926, |
| "step": 496 |
| }, |
| { |
| "epoch": 0.004096030065025507, |
| "grad_norm": 6.34375, |
| "grad_norm_var": 6149.804553222656, |
| "learning_rate": 5e-05, |
| "loss": 0.2089, |
| "loss/crossentropy": 1.231292486190796, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.181640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02723127231001854, |
| "step": 497 |
| }, |
| { |
| "epoch": 0.004104271574210669, |
| "grad_norm": 3.296875, |
| "grad_norm_var": 6143.188732910156, |
| "learning_rate": 5e-05, |
| "loss": 0.2633, |
| "loss/crossentropy": 2.7151858806610107, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.197265625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06607217341661453, |
| "step": 498 |
| }, |
| { |
| "epoch": 0.004112513083395832, |
| "grad_norm": 2.515625, |
| "grad_norm_var": 6147.511221313476, |
| "learning_rate": 5e-05, |
| "loss": 0.1795, |
| "loss/crossentropy": 2.5187623500823975, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.138671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.040810734033584595, |
| "step": 499 |
| }, |
| { |
| "epoch": 0.004120754592580994, |
| "grad_norm": 3.640625, |
| "grad_norm_var": 6143.101721191406, |
| "learning_rate": 5e-05, |
| "loss": 0.3104, |
| "loss/crossentropy": 2.562577962875366, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.23046875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07988132536411285, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.004128996101766156, |
| "grad_norm": 2.46875, |
| "grad_norm_var": 6145.463117472331, |
| "learning_rate": 5e-05, |
| "loss": 0.1763, |
| "loss/crossentropy": 2.076406478881836, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.134765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04152751341462135, |
| "step": 501 |
| }, |
| { |
| "epoch": 0.0041372376109513175, |
| "grad_norm": 2.15625, |
| "grad_norm_var": 6145.830125935872, |
| "learning_rate": 5e-05, |
| "loss": 0.1825, |
| "loss/crossentropy": 1.5978915691375732, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1474609375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03506774455308914, |
| "step": 502 |
| }, |
| { |
| "epoch": 0.004145479120136479, |
| "grad_norm": 6.5, |
| "grad_norm_var": 6134.082059733073, |
| "learning_rate": 5e-05, |
| "loss": 0.343, |
| "loss/crossentropy": 2.7203476428985596, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.25390625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0890902578830719, |
| "step": 503 |
| }, |
| { |
| "epoch": 0.004153720629321641, |
| "grad_norm": 2.46875, |
| "grad_norm_var": 9.201432291666666, |
| "learning_rate": 5e-05, |
| "loss": 0.197, |
| "loss/crossentropy": 1.9945552349090576, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.15625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.040720634162425995, |
| "step": 504 |
| }, |
| { |
| "epoch": 0.004161962138506803, |
| "grad_norm": 2.71875, |
| "grad_norm_var": 5.917020670572916, |
| "learning_rate": 5e-05, |
| "loss": 0.1455, |
| "loss/crossentropy": 0.7928286790847778, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.134765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.010724226012825966, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.004170203647691965, |
| "grad_norm": 1.5234375, |
| "grad_norm_var": 6.374580637613932, |
| "learning_rate": 5e-05, |
| "loss": 0.1175, |
| "loss/crossentropy": 1.454852819442749, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.0986328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.018820609897375107, |
| "step": 506 |
| }, |
| { |
| "epoch": 0.004178445156877127, |
| "grad_norm": 5.0, |
| "grad_norm_var": 4.231941477457682, |
| "learning_rate": 5e-05, |
| "loss": 0.2223, |
| "loss/crossentropy": 1.54378080368042, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1865234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03581738844513893, |
| "step": 507 |
| }, |
| { |
| "epoch": 0.00418668666606229, |
| "grad_norm": 2.28125, |
| "grad_norm_var": 4.020247141520183, |
| "learning_rate": 5e-05, |
| "loss": 0.1535, |
| "loss/crossentropy": 2.197096109390259, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1259765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02750355750322342, |
| "step": 508 |
| }, |
| { |
| "epoch": 0.004194928175247452, |
| "grad_norm": 3.890625, |
| "grad_norm_var": 3.761824289957682, |
| "learning_rate": 5e-05, |
| "loss": 0.2321, |
| "loss/crossentropy": 2.152005195617676, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.19140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.040720127522945404, |
| "step": 509 |
| }, |
| { |
| "epoch": 0.0042031696844326135, |
| "grad_norm": 2.625, |
| "grad_norm_var": 2.189497629801432, |
| "learning_rate": 5e-05, |
| "loss": 0.1794, |
| "loss/crossentropy": 1.419573187828064, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1455078125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03389629349112511, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.004211411193617775, |
| "grad_norm": 2.59375, |
| "grad_norm_var": 2.1152992248535156, |
| "learning_rate": 5e-05, |
| "loss": 0.1537, |
| "loss/crossentropy": 1.2792004346847534, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.020872898399829865, |
| "step": 511 |
| }, |
| { |
| "epoch": 0.004219652702802937, |
| "grad_norm": 1.8984375, |
| "grad_norm_var": 2.2378082275390625, |
| "learning_rate": 5e-05, |
| "loss": 0.1677, |
| "loss/crossentropy": 2.1597256660461426, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03487637639045715, |
| "step": 512 |
| }, |
| { |
| "epoch": 0.004227894211988099, |
| "grad_norm": 3.3125, |
| "grad_norm_var": 1.5597239176432292, |
| "learning_rate": 5e-05, |
| "loss": 0.2512, |
| "loss/crossentropy": 1.6841063499450684, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.19921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05194816365838051, |
| "step": 513 |
| }, |
| { |
| "epoch": 0.004236135721173261, |
| "grad_norm": 4.78125, |
| "grad_norm_var": 1.7451741536458334, |
| "learning_rate": 5e-05, |
| "loss": 0.2239, |
| "loss/crossentropy": 2.816648006439209, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.051988691091537476, |
| "step": 514 |
| }, |
| { |
| "epoch": 0.004244377230358423, |
| "grad_norm": 11.6875, |
| "grad_norm_var": 6.229002888997396, |
| "learning_rate": 5e-05, |
| "loss": 0.3317, |
| "loss/crossentropy": 2.0484461784362793, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.275390625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05626022815704346, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.004252618739543585, |
| "grad_norm": 3.546875, |
| "grad_norm_var": 6.2305653889973955, |
| "learning_rate": 5e-05, |
| "loss": 0.1705, |
| "loss/crossentropy": 1.2704802751541138, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.029856139793992043, |
| "step": 516 |
| }, |
| { |
| "epoch": 0.004260860248728747, |
| "grad_norm": 2.859375, |
| "grad_norm_var": 6.175150553385417, |
| "learning_rate": 5e-05, |
| "loss": 0.2187, |
| "loss/crossentropy": 1.2983540296554565, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1826171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03611702471971512, |
| "step": 517 |
| }, |
| { |
| "epoch": 0.0042691017579139096, |
| "grad_norm": 2.390625, |
| "grad_norm_var": 6.1290842692057295, |
| "learning_rate": 5e-05, |
| "loss": 0.1968, |
| "loss/crossentropy": 2.924328088760376, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.150390625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04644050449132919, |
| "step": 518 |
| }, |
| { |
| "epoch": 0.0042773432670990714, |
| "grad_norm": 2.453125, |
| "grad_norm_var": 5.671439615885417, |
| "learning_rate": 5e-05, |
| "loss": 0.2026, |
| "loss/crossentropy": 2.4892868995666504, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1611328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0414496548473835, |
| "step": 519 |
| }, |
| { |
| "epoch": 0.004285584776284233, |
| "grad_norm": 2.453125, |
| "grad_norm_var": 5.673607381184896, |
| "learning_rate": 5e-05, |
| "loss": 0.243, |
| "loss/crossentropy": 0.931300163269043, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06334017217159271, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.004293826285469395, |
| "grad_norm": 1.5390625, |
| "grad_norm_var": 5.883624013264974, |
| "learning_rate": 5e-05, |
| "loss": 0.1502, |
| "loss/crossentropy": 1.3612920045852661, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.12109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02915302664041519, |
| "step": 521 |
| }, |
| { |
| "epoch": 0.004302067794654557, |
| "grad_norm": 1.4609375, |
| "grad_norm_var": 5.899733225504558, |
| "learning_rate": 5e-05, |
| "loss": 0.1209, |
| "loss/crossentropy": 0.7956821918487549, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.10498046875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.01589544117450714, |
| "step": 522 |
| }, |
| { |
| "epoch": 0.004310309303839719, |
| "grad_norm": 8.9375, |
| "grad_norm_var": 7.696473948160807, |
| "learning_rate": 5e-05, |
| "loss": 0.466, |
| "loss/crossentropy": 2.8766582012176514, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.40234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06366438418626785, |
| "step": 523 |
| }, |
| { |
| "epoch": 0.004318550813024881, |
| "grad_norm": 10.75, |
| "grad_norm_var": 10.611466217041016, |
| "learning_rate": 5e-05, |
| "loss": 0.2406, |
| "loss/crossentropy": 1.433667540550232, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.201171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03946308791637421, |
| "step": 524 |
| }, |
| { |
| "epoch": 0.004326792322210043, |
| "grad_norm": 1.8828125, |
| "grad_norm_var": 10.94590555826823, |
| "learning_rate": 5e-05, |
| "loss": 0.1674, |
| "loss/crossentropy": 0.8868244290351868, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.134765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03266747295856476, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.004335033831395205, |
| "grad_norm": 10.9375, |
| "grad_norm_var": 13.659373982747395, |
| "learning_rate": 5e-05, |
| "loss": 0.2533, |
| "loss/crossentropy": 1.5202522277832031, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2236328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02963770553469658, |
| "step": 526 |
| }, |
| { |
| "epoch": 0.0043432753405803675, |
| "grad_norm": 2.5, |
| "grad_norm_var": 13.684911092122396, |
| "learning_rate": 5e-05, |
| "loss": 0.1905, |
| "loss/crossentropy": 2.2358787059783936, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1494140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.041103295981884, |
| "step": 527 |
| }, |
| { |
| "epoch": 0.004351516849765529, |
| "grad_norm": 5.65625, |
| "grad_norm_var": 13.220444488525391, |
| "learning_rate": 5e-05, |
| "loss": 0.2986, |
| "loss/crossentropy": 2.475597381591797, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2333984375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06522452086210251, |
| "step": 528 |
| }, |
| { |
| "epoch": 0.004359758358950691, |
| "grad_norm": 6.125, |
| "grad_norm_var": 13.148850250244141, |
| "learning_rate": 5e-05, |
| "loss": 0.2751, |
| "loss/crossentropy": 2.1268441677093506, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2275390625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.047553326934576035, |
| "step": 529 |
| }, |
| { |
| "epoch": 0.004367999868135853, |
| "grad_norm": 4.1875, |
| "grad_norm_var": 13.188008371988932, |
| "learning_rate": 5e-05, |
| "loss": 0.2687, |
| "loss/crossentropy": 1.8196269273757935, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.22265625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04601012170314789, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.004376241377321015, |
| "grad_norm": 5.96875, |
| "grad_norm_var": 10.102638498942058, |
| "learning_rate": 5e-05, |
| "loss": 0.4053, |
| "loss/crossentropy": 2.056028127670288, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.291015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.11430098116397858, |
| "step": 531 |
| }, |
| { |
| "epoch": 0.004384482886506177, |
| "grad_norm": 2.828125, |
| "grad_norm_var": 10.236140696207682, |
| "learning_rate": 5e-05, |
| "loss": 0.1589, |
| "loss/crossentropy": 1.7495375871658325, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1298828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02899114228785038, |
| "step": 532 |
| }, |
| { |
| "epoch": 0.004392724395691339, |
| "grad_norm": 2.65625, |
| "grad_norm_var": 10.28472671508789, |
| "learning_rate": 5e-05, |
| "loss": 0.177, |
| "loss/crossentropy": 1.6073510646820068, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.142578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.034396156668663025, |
| "step": 533 |
| }, |
| { |
| "epoch": 0.004400965904876501, |
| "grad_norm": 2.484375, |
| "grad_norm_var": 10.258341217041016, |
| "learning_rate": 5e-05, |
| "loss": 0.1938, |
| "loss/crossentropy": 1.503987193107605, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.154296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03948179632425308, |
| "step": 534 |
| }, |
| { |
| "epoch": 0.004409207414061663, |
| "grad_norm": 1.140625, |
| "grad_norm_var": 10.733182525634765, |
| "learning_rate": 5e-05, |
| "loss": 0.1132, |
| "loss/crossentropy": 1.3690646886825562, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.09375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.019410330802202225, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.004417448923246825, |
| "grad_norm": 1.3671875, |
| "grad_norm_var": 11.098802693684895, |
| "learning_rate": 5e-05, |
| "loss": 0.1406, |
| "loss/crossentropy": 1.3874551057815552, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.023373104631900787, |
| "step": 536 |
| }, |
| { |
| "epoch": 0.004425690432431987, |
| "grad_norm": 2.078125, |
| "grad_norm_var": 10.911236317952474, |
| "learning_rate": 5e-05, |
| "loss": 0.1453, |
| "loss/crossentropy": 1.0550464391708374, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.12353515625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.021797355264425278, |
| "step": 537 |
| }, |
| { |
| "epoch": 0.004433931941617149, |
| "grad_norm": 4.65625, |
| "grad_norm_var": 10.28226318359375, |
| "learning_rate": 5e-05, |
| "loss": 0.229, |
| "loss/crossentropy": 2.7873692512512207, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.177734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05122203379869461, |
| "step": 538 |
| }, |
| { |
| "epoch": 0.004442173450802311, |
| "grad_norm": 4.40625, |
| "grad_norm_var": 8.965958658854166, |
| "learning_rate": 5e-05, |
| "loss": 0.3081, |
| "loss/crossentropy": 2.410203695297241, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.25, |
| "loss/idx": 0.0, |
| "loss/logits": 0.058051109313964844, |
| "step": 539 |
| }, |
| { |
| "epoch": 0.004450414959987473, |
| "grad_norm": 2.671875, |
| "grad_norm_var": 6.152814737955729, |
| "learning_rate": 5e-05, |
| "loss": 0.1837, |
| "loss/crossentropy": 1.0460572242736816, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1591796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.024545256048440933, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.004458656469172635, |
| "grad_norm": 2.25, |
| "grad_norm_var": 6.065093739827474, |
| "learning_rate": 5e-05, |
| "loss": 0.1368, |
| "loss/crossentropy": 0.29145750403404236, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.126953125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.00982777401804924, |
| "step": 541 |
| }, |
| { |
| "epoch": 0.004466897978357797, |
| "grad_norm": 3.53125, |
| "grad_norm_var": 2.513854726155599, |
| "learning_rate": 5e-05, |
| "loss": 0.1929, |
| "loss/crossentropy": 1.3329319953918457, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.162109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.030750418081879616, |
| "step": 542 |
| }, |
| { |
| "epoch": 0.004475139487542959, |
| "grad_norm": 3.265625, |
| "grad_norm_var": 2.457928212483724, |
| "learning_rate": 5e-05, |
| "loss": 0.2152, |
| "loss/crossentropy": 2.0753843784332275, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.169921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.045312896370887756, |
| "step": 543 |
| }, |
| { |
| "epoch": 0.0044833809967281205, |
| "grad_norm": 3.1875, |
| "grad_norm_var": 2.114135487874349, |
| "learning_rate": 5e-05, |
| "loss": 0.1793, |
| "loss/crossentropy": 1.0670427083969116, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1455078125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.033770665526390076, |
| "step": 544 |
| }, |
| { |
| "epoch": 0.004491622505913282, |
| "grad_norm": 1.6328125, |
| "grad_norm_var": 1.6834879557291667, |
| "learning_rate": 5e-05, |
| "loss": 0.1622, |
| "loss/crossentropy": 1.4867587089538574, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1337890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02845672518014908, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.004499864015098445, |
| "grad_norm": 2.859375, |
| "grad_norm_var": 1.5869049072265624, |
| "learning_rate": 5e-05, |
| "loss": 0.1723, |
| "loss/crossentropy": 1.544826865196228, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.146484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.025784984230995178, |
| "step": 546 |
| }, |
| { |
| "epoch": 0.004508105524283607, |
| "grad_norm": 2.1875, |
| "grad_norm_var": 0.9517730712890625, |
| "learning_rate": 5e-05, |
| "loss": 0.2222, |
| "loss/crossentropy": 2.5712733268737793, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.16796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05422845110297203, |
| "step": 547 |
| }, |
| { |
| "epoch": 0.004516347033468769, |
| "grad_norm": 1.6875, |
| "grad_norm_var": 1.0136311848958333, |
| "learning_rate": 5e-05, |
| "loss": 0.1554, |
| "loss/crossentropy": 1.6466069221496582, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1279296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.027493983507156372, |
| "step": 548 |
| }, |
| { |
| "epoch": 0.004524588542653931, |
| "grad_norm": 1.5546875, |
| "grad_norm_var": 1.085455067952474, |
| "learning_rate": 5e-05, |
| "loss": 0.1578, |
| "loss/crossentropy": 2.487321615219116, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.12451171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03331330791115761, |
| "step": 549 |
| }, |
| { |
| "epoch": 0.004532830051839093, |
| "grad_norm": 2.6875, |
| "grad_norm_var": 1.0859840393066407, |
| "learning_rate": 5e-05, |
| "loss": 0.2285, |
| "loss/crossentropy": 2.7870802879333496, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.173828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.054664455354213715, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.004541071561024255, |
| "grad_norm": 1.6875, |
| "grad_norm_var": 1.000249989827474, |
| "learning_rate": 5e-05, |
| "loss": 0.1342, |
| "loss/crossentropy": 1.667282223701477, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1123046875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02186622843146324, |
| "step": 551 |
| }, |
| { |
| "epoch": 0.0045493130702094165, |
| "grad_norm": 1.265625, |
| "grad_norm_var": 1.0176829020182292, |
| "learning_rate": 5e-05, |
| "loss": 0.1424, |
| "loss/crossentropy": 2.4660463333129883, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.11328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.029071927070617676, |
| "step": 552 |
| }, |
| { |
| "epoch": 0.0045575545793945784, |
| "grad_norm": 1.4609375, |
| "grad_norm_var": 1.084484608968099, |
| "learning_rate": 5e-05, |
| "loss": 0.1525, |
| "loss/crossentropy": 2.228982925415039, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1220703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.030380506068468094, |
| "step": 553 |
| }, |
| { |
| "epoch": 0.00456579608857974, |
| "grad_norm": 1.4296875, |
| "grad_norm_var": 0.8341949462890625, |
| "learning_rate": 5e-05, |
| "loss": 0.1534, |
| "loss/crossentropy": 2.0212230682373047, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0362619124352932, |
| "step": 554 |
| }, |
| { |
| "epoch": 0.004574037597764903, |
| "grad_norm": 2.890625, |
| "grad_norm_var": 0.5643229166666667, |
| "learning_rate": 5e-05, |
| "loss": 0.171, |
| "loss/crossentropy": 1.549071192741394, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1455078125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02545047551393509, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.004582279106950065, |
| "grad_norm": 4.3125, |
| "grad_norm_var": 0.8214182535807292, |
| "learning_rate": 5e-05, |
| "loss": 0.3118, |
| "loss/crossentropy": 2.652635097503662, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.25390625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.057920753955841064, |
| "step": 556 |
| }, |
| { |
| "epoch": 0.004590520616135227, |
| "grad_norm": 1.140625, |
| "grad_norm_var": 0.9158162434895833, |
| "learning_rate": 5e-05, |
| "loss": 0.1287, |
| "loss/crossentropy": 1.6453478336334229, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1064453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.022242678329348564, |
| "step": 557 |
| }, |
| { |
| "epoch": 0.004598762125320389, |
| "grad_norm": 3.265625, |
| "grad_norm_var": 0.8765777587890625, |
| "learning_rate": 5e-05, |
| "loss": 0.2774, |
| "loss/crossentropy": 1.622141718864441, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2373046875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04007745534181595, |
| "step": 558 |
| }, |
| { |
| "epoch": 0.004607003634505551, |
| "grad_norm": 1.859375, |
| "grad_norm_var": 0.8157867431640625, |
| "learning_rate": 5e-05, |
| "loss": 0.1513, |
| "loss/crossentropy": 1.737197995185852, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.123046875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0283003281801939, |
| "step": 559 |
| }, |
| { |
| "epoch": 0.0046152451436907126, |
| "grad_norm": 2.328125, |
| "grad_norm_var": 0.74814453125, |
| "learning_rate": 5e-05, |
| "loss": 0.1046, |
| "loss/crossentropy": 0.2462574690580368, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.0966796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.00796109065413475, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.0046234866528758745, |
| "grad_norm": 3.421875, |
| "grad_norm_var": 0.8270566304524739, |
| "learning_rate": 5e-05, |
| "loss": 0.2059, |
| "loss/crossentropy": 2.6141371726989746, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.162109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.043790802359580994, |
| "step": 561 |
| }, |
| { |
| "epoch": 0.004631728162061036, |
| "grad_norm": 3.421875, |
| "grad_norm_var": 0.892352040608724, |
| "learning_rate": 5e-05, |
| "loss": 0.1974, |
| "loss/crossentropy": 1.422098994255066, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.15625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0411381721496582, |
| "step": 562 |
| }, |
| { |
| "epoch": 0.004639969671246198, |
| "grad_norm": 6.125, |
| "grad_norm_var": 1.808794911702474, |
| "learning_rate": 5e-05, |
| "loss": 0.3067, |
| "loss/crossentropy": 2.6955533027648926, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.232421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07427150756120682, |
| "step": 563 |
| }, |
| { |
| "epoch": 0.004648211180431361, |
| "grad_norm": 3.28125, |
| "grad_norm_var": 1.787731679280599, |
| "learning_rate": 5e-05, |
| "loss": 0.2236, |
| "loss/crossentropy": 2.841552972793579, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.181640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.041970379650592804, |
| "step": 564 |
| }, |
| { |
| "epoch": 0.004656452689616523, |
| "grad_norm": 3.453125, |
| "grad_norm_var": 1.7399617513020833, |
| "learning_rate": 5e-05, |
| "loss": 0.1892, |
| "loss/crossentropy": 2.6695666313171387, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1494140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.039814580231904984, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.004664694198801685, |
| "grad_norm": 23.0, |
| "grad_norm_var": 27.352754720052083, |
| "learning_rate": 5e-05, |
| "loss": 0.3797, |
| "loss/crossentropy": 2.7561914920806885, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.306640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0730535015463829, |
| "step": 566 |
| }, |
| { |
| "epoch": 0.004672935707986847, |
| "grad_norm": 4.125, |
| "grad_norm_var": 26.965547688802083, |
| "learning_rate": 5e-05, |
| "loss": 0.2569, |
| "loss/crossentropy": 1.6983141899108887, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.212890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04405267536640167, |
| "step": 567 |
| }, |
| { |
| "epoch": 0.004681177217172009, |
| "grad_norm": 3.28125, |
| "grad_norm_var": 26.437889607747397, |
| "learning_rate": 5e-05, |
| "loss": 0.1744, |
| "loss/crossentropy": 1.50763738155365, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.14453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.029890574514865875, |
| "step": 568 |
| }, |
| { |
| "epoch": 0.0046894187263571705, |
| "grad_norm": 4.1875, |
| "grad_norm_var": 25.870477040608723, |
| "learning_rate": 5e-05, |
| "loss": 0.2786, |
| "loss/crossentropy": 2.469428300857544, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.216796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06181073188781738, |
| "step": 569 |
| }, |
| { |
| "epoch": 0.004697660235542332, |
| "grad_norm": 1.4140625, |
| "grad_norm_var": 25.87682673136393, |
| "learning_rate": 5e-05, |
| "loss": 0.1614, |
| "loss/crossentropy": 1.521830439567566, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02856556512415409, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.004705901744727494, |
| "grad_norm": 3.109375, |
| "grad_norm_var": 25.83377456665039, |
| "learning_rate": 5e-05, |
| "loss": 0.1871, |
| "loss/crossentropy": 2.5410780906677246, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.038688138127326965, |
| "step": 571 |
| }, |
| { |
| "epoch": 0.004714143253912656, |
| "grad_norm": 20.25, |
| "grad_norm_var": 41.34689712524414, |
| "learning_rate": 5e-05, |
| "loss": 0.3154, |
| "loss/crossentropy": 0.9852694272994995, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.27734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0380379781126976, |
| "step": 572 |
| }, |
| { |
| "epoch": 0.004722384763097818, |
| "grad_norm": 6.46875, |
| "grad_norm_var": 40.0391476949056, |
| "learning_rate": 5e-05, |
| "loss": 0.3102, |
| "loss/crossentropy": 2.0467264652252197, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.25, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06023257598280907, |
| "step": 573 |
| }, |
| { |
| "epoch": 0.004730626272282981, |
| "grad_norm": 4.40625, |
| "grad_norm_var": 39.73319880167643, |
| "learning_rate": 5e-05, |
| "loss": 0.2448, |
| "loss/crossentropy": 2.768284320831299, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.057292141020298004, |
| "step": 574 |
| }, |
| { |
| "epoch": 0.004738867781468143, |
| "grad_norm": 3.25, |
| "grad_norm_var": 39.10796076456706, |
| "learning_rate": 5e-05, |
| "loss": 0.1371, |
| "loss/crossentropy": 0.4907649755477905, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.01206925604492426, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.004747109290653305, |
| "grad_norm": 1.1328125, |
| "grad_norm_var": 39.77771708170573, |
| "learning_rate": 5e-05, |
| "loss": 0.1305, |
| "loss/crossentropy": 1.4613217115402222, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.021137792617082596, |
| "step": 576 |
| }, |
| { |
| "epoch": 0.0047553507998384665, |
| "grad_norm": 1.4453125, |
| "grad_norm_var": 40.673797353108725, |
| "learning_rate": 5e-05, |
| "loss": 0.1593, |
| "loss/crossentropy": 2.2154600620269775, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.12890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.030367335304617882, |
| "step": 577 |
| }, |
| { |
| "epoch": 0.004763592309023628, |
| "grad_norm": 2.40625, |
| "grad_norm_var": 41.05650812784831, |
| "learning_rate": 5e-05, |
| "loss": 0.1951, |
| "loss/crossentropy": 2.493523120880127, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1513671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04371439293026924, |
| "step": 578 |
| }, |
| { |
| "epoch": 0.00477183381820879, |
| "grad_norm": 2.28125, |
| "grad_norm_var": 41.76645075480143, |
| "learning_rate": 5e-05, |
| "loss": 0.1766, |
| "loss/crossentropy": 2.818694829940796, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.134765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04187324270606041, |
| "step": 579 |
| }, |
| { |
| "epoch": 0.004780075327393952, |
| "grad_norm": 2.5625, |
| "grad_norm_var": 42.00832697550456, |
| "learning_rate": 5e-05, |
| "loss": 0.213, |
| "loss/crossentropy": 3.1531453132629395, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1630859375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.049924593418836594, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.004788316836579114, |
| "grad_norm": 2.359375, |
| "grad_norm_var": 42.370418039957684, |
| "learning_rate": 5e-05, |
| "loss": 0.1859, |
| "loss/crossentropy": 2.076953172683716, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.14453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.041379667818546295, |
| "step": 581 |
| }, |
| { |
| "epoch": 0.004796558345764276, |
| "grad_norm": 2.328125, |
| "grad_norm_var": 20.444233957926432, |
| "learning_rate": 5e-05, |
| "loss": 0.1769, |
| "loss/crossentropy": 2.5741026401519775, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1396484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03728485107421875, |
| "step": 582 |
| }, |
| { |
| "epoch": 0.004804799854949439, |
| "grad_norm": 2.875, |
| "grad_norm_var": 20.5315549214681, |
| "learning_rate": 5e-05, |
| "loss": 0.1681, |
| "loss/crossentropy": 2.4309182167053223, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.134765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.033366359770298004, |
| "step": 583 |
| }, |
| { |
| "epoch": 0.004813041364134601, |
| "grad_norm": 0.94921875, |
| "grad_norm_var": 21.090232785542806, |
| "learning_rate": 5e-05, |
| "loss": 0.1001, |
| "loss/crossentropy": 0.35457542538642883, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.09228515625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.007841967046260834, |
| "step": 584 |
| }, |
| { |
| "epoch": 0.0048212828733197625, |
| "grad_norm": 6.71875, |
| "grad_norm_var": 21.60826562245687, |
| "learning_rate": 5e-05, |
| "loss": 0.3594, |
| "loss/crossentropy": 2.4798266887664795, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.28515625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07420751452445984, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.004829524382504924, |
| "grad_norm": 4.15625, |
| "grad_norm_var": 21.133738644917806, |
| "learning_rate": 5e-05, |
| "loss": 0.2067, |
| "loss/crossentropy": 2.589935064315796, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.166015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04071066156029701, |
| "step": 586 |
| }, |
| { |
| "epoch": 0.004837765891690086, |
| "grad_norm": 2.640625, |
| "grad_norm_var": 21.213679440816243, |
| "learning_rate": 5e-05, |
| "loss": 0.1794, |
| "loss/crossentropy": 2.3674118518829346, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1396484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03970456123352051, |
| "step": 587 |
| }, |
| { |
| "epoch": 0.004846007400875248, |
| "grad_norm": 3.921875, |
| "grad_norm_var": 2.8025491714477537, |
| "learning_rate": 5e-05, |
| "loss": 0.1419, |
| "loss/crossentropy": 0.37833818793296814, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.126953125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.01495468057692051, |
| "step": 588 |
| }, |
| { |
| "epoch": 0.00485424891006041, |
| "grad_norm": 5.21875, |
| "grad_norm_var": 2.3418965021769207, |
| "learning_rate": 5e-05, |
| "loss": 0.2142, |
| "loss/crossentropy": 2.5115513801574707, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04236599802970886, |
| "step": 589 |
| }, |
| { |
| "epoch": 0.004862490419245572, |
| "grad_norm": 1.421875, |
| "grad_norm_var": 2.3552057266235353, |
| "learning_rate": 5e-05, |
| "loss": 0.1423, |
| "loss/crossentropy": 1.5357518196105957, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1162109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.026050515472888947, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.004870731928430734, |
| "grad_norm": 2.59375, |
| "grad_norm_var": 2.3474939346313475, |
| "learning_rate": 5e-05, |
| "loss": 0.1597, |
| "loss/crossentropy": 2.9540340900421143, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03472492843866348, |
| "step": 591 |
| }, |
| { |
| "epoch": 0.004878973437615897, |
| "grad_norm": 1.40625, |
| "grad_norm_var": 2.2909016291300457, |
| "learning_rate": 5e-05, |
| "loss": 0.127, |
| "loss/crossentropy": 2.3022918701171875, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.10302734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.024019837379455566, |
| "step": 592 |
| }, |
| { |
| "epoch": 0.0048872149468010585, |
| "grad_norm": 1.6171875, |
| "grad_norm_var": 2.261008135477702, |
| "learning_rate": 5e-05, |
| "loss": 0.1432, |
| "loss/crossentropy": 0.7338389158248901, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.12451171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.018692631274461746, |
| "step": 593 |
| }, |
| { |
| "epoch": 0.00489545645598622, |
| "grad_norm": 2.046875, |
| "grad_norm_var": 2.2899148941040037, |
| "learning_rate": 5e-05, |
| "loss": 0.1657, |
| "loss/crossentropy": 2.504016160964966, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.12890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.036834657192230225, |
| "step": 594 |
| }, |
| { |
| "epoch": 0.004903697965171382, |
| "grad_norm": 3.953125, |
| "grad_norm_var": 2.344827715555827, |
| "learning_rate": 5e-05, |
| "loss": 0.1572, |
| "loss/crossentropy": 1.7626408338546753, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1279296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.029316924512386322, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.004911939474356544, |
| "grad_norm": 1.953125, |
| "grad_norm_var": 2.3973347345987954, |
| "learning_rate": 5e-05, |
| "loss": 0.141, |
| "loss/crossentropy": 1.9684767723083496, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1142578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.026772357523441315, |
| "step": 596 |
| }, |
| { |
| "epoch": 0.004920180983541706, |
| "grad_norm": 10.625, |
| "grad_norm_var": 6.088076210021972, |
| "learning_rate": 5e-05, |
| "loss": 0.2187, |
| "loss/crossentropy": 1.9169155359268188, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1865234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03213420510292053, |
| "step": 597 |
| }, |
| { |
| "epoch": 0.004928422492726868, |
| "grad_norm": 4.5625, |
| "grad_norm_var": 6.080293718973795, |
| "learning_rate": 5e-05, |
| "loss": 0.1759, |
| "loss/crossentropy": 1.367024540901184, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.142578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03334027901291847, |
| "step": 598 |
| }, |
| { |
| "epoch": 0.00493666400191203, |
| "grad_norm": 2.4375, |
| "grad_norm_var": 6.131121762593588, |
| "learning_rate": 5e-05, |
| "loss": 0.2373, |
| "loss/crossentropy": 2.452223300933838, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.19921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.038047999143600464, |
| "step": 599 |
| }, |
| { |
| "epoch": 0.004944905511097192, |
| "grad_norm": 3.859375, |
| "grad_norm_var": 5.665278879801432, |
| "learning_rate": 5e-05, |
| "loss": 0.2239, |
| "loss/crossentropy": 1.2548903226852417, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1982421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.025662653148174286, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.004953147020282354, |
| "grad_norm": 3.265625, |
| "grad_norm_var": 5.018717193603516, |
| "learning_rate": 5e-05, |
| "loss": 0.2458, |
| "loss/crossentropy": 2.334947347640991, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.058346427977085114, |
| "step": 601 |
| }, |
| { |
| "epoch": 0.0049613885294675164, |
| "grad_norm": 1.140625, |
| "grad_norm_var": 5.315175120035807, |
| "learning_rate": 5e-05, |
| "loss": 0.0935, |
| "loss/crossentropy": 0.5309077501296997, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.0849609375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.008535758592188358, |
| "step": 602 |
| }, |
| { |
| "epoch": 0.004969630038652678, |
| "grad_norm": 2.859375, |
| "grad_norm_var": 5.299181874593099, |
| "learning_rate": 5e-05, |
| "loss": 0.2036, |
| "loss/crossentropy": 2.4964077472686768, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.158203125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.045372504740953445, |
| "step": 603 |
| }, |
| { |
| "epoch": 0.00497787154783784, |
| "grad_norm": 1.515625, |
| "grad_norm_var": 5.463201649983724, |
| "learning_rate": 5e-05, |
| "loss": 0.1444, |
| "loss/crossentropy": 1.4457087516784668, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.119140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.025238193571567535, |
| "step": 604 |
| }, |
| { |
| "epoch": 0.004986113057023002, |
| "grad_norm": 2.34375, |
| "grad_norm_var": 5.188616689046224, |
| "learning_rate": 5e-05, |
| "loss": 0.164, |
| "loss/crossentropy": 1.5325767993927002, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.130859375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.033134825527668, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.004994354566208164, |
| "grad_norm": 2.875, |
| "grad_norm_var": 5.019653065999349, |
| "learning_rate": 5e-05, |
| "loss": 0.2116, |
| "loss/crossentropy": 1.964009404182434, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1494140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06218406930565834, |
| "step": 606 |
| }, |
| { |
| "epoch": 0.005002596075393326, |
| "grad_norm": 1.796875, |
| "grad_norm_var": 5.109509023030599, |
| "learning_rate": 5e-05, |
| "loss": 0.1477, |
| "loss/crossentropy": 1.4914216995239258, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.12353515625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.024151228368282318, |
| "step": 607 |
| }, |
| { |
| "epoch": 0.005010837584578488, |
| "grad_norm": 2.6875, |
| "grad_norm_var": 4.937090810139974, |
| "learning_rate": 5e-05, |
| "loss": 0.167, |
| "loss/crossentropy": 1.5185096263885498, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02637227438390255, |
| "step": 608 |
| }, |
| { |
| "epoch": 0.00501907909376365, |
| "grad_norm": 3.21875, |
| "grad_norm_var": 4.781574503580729, |
| "learning_rate": 5e-05, |
| "loss": 0.2422, |
| "loss/crossentropy": 2.671537160873413, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1904296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05179464817047119, |
| "step": 609 |
| }, |
| { |
| "epoch": 0.005027320602948812, |
| "grad_norm": 2.640625, |
| "grad_norm_var": 4.712612915039062, |
| "learning_rate": 5e-05, |
| "loss": 0.1652, |
| "loss/crossentropy": 1.1440718173980713, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.14453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.020696066319942474, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.005035562112133974, |
| "grad_norm": 2.265625, |
| "grad_norm_var": 4.7286529541015625, |
| "learning_rate": 5e-05, |
| "loss": 0.1605, |
| "loss/crossentropy": 0.47643014788627625, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.14453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.016013499349355698, |
| "step": 611 |
| }, |
| { |
| "epoch": 0.005043803621319136, |
| "grad_norm": 1.7421875, |
| "grad_norm_var": 4.764475250244141, |
| "learning_rate": 5e-05, |
| "loss": 0.1235, |
| "loss/crossentropy": 1.5909593105316162, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.10546875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.018061984330415726, |
| "step": 612 |
| }, |
| { |
| "epoch": 0.005052045130504298, |
| "grad_norm": 1.4765625, |
| "grad_norm_var": 0.8343994140625, |
| "learning_rate": 5e-05, |
| "loss": 0.1189, |
| "loss/crossentropy": 2.336538076400757, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.0966796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.022249765694141388, |
| "step": 613 |
| }, |
| { |
| "epoch": 0.00506028663968946, |
| "grad_norm": 4.9375, |
| "grad_norm_var": 0.9441650390625, |
| "learning_rate": 5e-05, |
| "loss": 0.2263, |
| "loss/crossentropy": 1.665325403213501, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.189453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.036880407482385635, |
| "step": 614 |
| }, |
| { |
| "epoch": 0.005068528148874622, |
| "grad_norm": 3.765625, |
| "grad_norm_var": 1.0315826416015625, |
| "learning_rate": 5e-05, |
| "loss": 0.2465, |
| "loss/crossentropy": 1.6349374055862427, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.185546875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06092265248298645, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.005076769658059784, |
| "grad_norm": 3.8125, |
| "grad_norm_var": 1.02415771484375, |
| "learning_rate": 5e-05, |
| "loss": 0.2903, |
| "loss/crossentropy": 2.2787890434265137, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2177734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07248455286026001, |
| "step": 616 |
| }, |
| { |
| "epoch": 0.005085011167244946, |
| "grad_norm": 1.421875, |
| "grad_norm_var": 1.0844156901041666, |
| "learning_rate": 5e-05, |
| "loss": 0.1222, |
| "loss/crossentropy": 0.9886749982833862, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.10595703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.016225244849920273, |
| "step": 617 |
| }, |
| { |
| "epoch": 0.005093252676430108, |
| "grad_norm": 2.703125, |
| "grad_norm_var": 0.9472900390625, |
| "learning_rate": 5e-05, |
| "loss": 0.217, |
| "loss/crossentropy": 2.012622594833374, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0451560840010643, |
| "step": 618 |
| }, |
| { |
| "epoch": 0.0051014941856152695, |
| "grad_norm": 1.9375, |
| "grad_norm_var": 0.9720774332682292, |
| "learning_rate": 5e-05, |
| "loss": 0.1792, |
| "loss/crossentropy": 2.2141380310058594, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1435546875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03566785901784897, |
| "step": 619 |
| }, |
| { |
| "epoch": 0.005109735694800432, |
| "grad_norm": 1.9765625, |
| "grad_norm_var": 0.9204770406087239, |
| "learning_rate": 5e-05, |
| "loss": 0.1819, |
| "loss/crossentropy": 2.5171334743499756, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1474609375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03444764018058777, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.005117977203985594, |
| "grad_norm": 2.171875, |
| "grad_norm_var": 0.9281979878743489, |
| "learning_rate": 5e-05, |
| "loss": 0.1252, |
| "loss/crossentropy": 1.3747243881225586, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1025390625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.022622188553214073, |
| "step": 621 |
| }, |
| { |
| "epoch": 0.005126218713170756, |
| "grad_norm": 3.5625, |
| "grad_norm_var": 0.9839230855305989, |
| "learning_rate": 5e-05, |
| "loss": 0.1941, |
| "loss/crossentropy": 1.6743167638778687, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.15625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03780867159366608, |
| "step": 622 |
| }, |
| { |
| "epoch": 0.005134460222355918, |
| "grad_norm": 1.5703125, |
| "grad_norm_var": 1.0123687744140626, |
| "learning_rate": 5e-05, |
| "loss": 0.1265, |
| "loss/crossentropy": 2.2040176391601562, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.09814453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.028337322175502777, |
| "step": 623 |
| }, |
| { |
| "epoch": 0.00514270173154108, |
| "grad_norm": 2.5625, |
| "grad_norm_var": 1.0121897379557292, |
| "learning_rate": 5e-05, |
| "loss": 0.1402, |
| "loss/crossentropy": 0.7447776794433594, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1240234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.016176920384168625, |
| "step": 624 |
| }, |
| { |
| "epoch": 0.005150943240726242, |
| "grad_norm": 1.6953125, |
| "grad_norm_var": 1.0336626688639323, |
| "learning_rate": 5e-05, |
| "loss": 0.1657, |
| "loss/crossentropy": 1.4739638566970825, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.134765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03092530183494091, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.005159184749911404, |
| "grad_norm": 2.484375, |
| "grad_norm_var": 1.032574208577474, |
| "learning_rate": 5e-05, |
| "loss": 0.1703, |
| "loss/crossentropy": 2.6135871410369873, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1318359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03844151645898819, |
| "step": 626 |
| }, |
| { |
| "epoch": 0.0051674262590965655, |
| "grad_norm": 2.75, |
| "grad_norm_var": 1.0317543029785157, |
| "learning_rate": 5e-05, |
| "loss": 0.1908, |
| "loss/crossentropy": 2.5848896503448486, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.15234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03841109946370125, |
| "step": 627 |
| }, |
| { |
| "epoch": 0.005175667768281727, |
| "grad_norm": 1.9375, |
| "grad_norm_var": 1.0134755452473958, |
| "learning_rate": 5e-05, |
| "loss": 0.1499, |
| "loss/crossentropy": 0.32518985867500305, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1396484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.010256130248308182, |
| "step": 628 |
| }, |
| { |
| "epoch": 0.005183909277466889, |
| "grad_norm": 3.296875, |
| "grad_norm_var": 0.9605608622233073, |
| "learning_rate": 5e-05, |
| "loss": 0.1996, |
| "loss/crossentropy": 2.3065407276153564, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.035511888563632965, |
| "step": 629 |
| }, |
| { |
| "epoch": 0.005192150786652052, |
| "grad_norm": 1.828125, |
| "grad_norm_var": 0.6212827046712239, |
| "learning_rate": 5e-05, |
| "loss": 0.1813, |
| "loss/crossentropy": 2.5903186798095703, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.040679510682821274, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.005200392295837214, |
| "grad_norm": 2.828125, |
| "grad_norm_var": 0.5139218648274739, |
| "learning_rate": 5e-05, |
| "loss": 0.1537, |
| "loss/crossentropy": 1.5172139406204224, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1240234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.029673364013433456, |
| "step": 631 |
| }, |
| { |
| "epoch": 0.005208633805022376, |
| "grad_norm": 2.765625, |
| "grad_norm_var": 0.3864702860514323, |
| "learning_rate": 5e-05, |
| "loss": 0.1728, |
| "loss/crossentropy": 1.9679391384124756, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1396484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03311806917190552, |
| "step": 632 |
| }, |
| { |
| "epoch": 0.005216875314207538, |
| "grad_norm": 1.453125, |
| "grad_norm_var": 0.38269220987955727, |
| "learning_rate": 5e-05, |
| "loss": 0.1564, |
| "loss/crossentropy": 1.8976471424102783, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1259765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.030387457460165024, |
| "step": 633 |
| }, |
| { |
| "epoch": 0.0052251168233927, |
| "grad_norm": 1.4375, |
| "grad_norm_var": 0.4224077860514323, |
| "learning_rate": 5e-05, |
| "loss": 0.1537, |
| "loss/crossentropy": 2.396242618560791, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1220703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03165833652019501, |
| "step": 634 |
| }, |
| { |
| "epoch": 0.0052333583325778615, |
| "grad_norm": 3.296875, |
| "grad_norm_var": 0.4783404032389323, |
| "learning_rate": 5e-05, |
| "loss": 0.2148, |
| "loss/crossentropy": 2.8331291675567627, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1669921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04783923923969269, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.005241599841763023, |
| "grad_norm": 2.640625, |
| "grad_norm_var": 0.47274169921875, |
| "learning_rate": 5e-05, |
| "loss": 0.1847, |
| "loss/crossentropy": 0.8493193984031677, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.15625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.028483962640166283, |
| "step": 636 |
| }, |
| { |
| "epoch": 0.005249841350948185, |
| "grad_norm": 2.203125, |
| "grad_norm_var": 0.4718831380208333, |
| "learning_rate": 5e-05, |
| "loss": 0.157, |
| "loss/crossentropy": 2.3299150466918945, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.032000426203012466, |
| "step": 637 |
| }, |
| { |
| "epoch": 0.005258082860133347, |
| "grad_norm": 2.796875, |
| "grad_norm_var": 0.3892893473307292, |
| "learning_rate": 5e-05, |
| "loss": 0.181, |
| "loss/crossentropy": 1.8806333541870117, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.14453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03642831742763519, |
| "step": 638 |
| }, |
| { |
| "epoch": 0.00526632436931851, |
| "grad_norm": 3.828125, |
| "grad_norm_var": 0.4741778055826823, |
| "learning_rate": 5e-05, |
| "loss": 0.2628, |
| "loss/crossentropy": 2.58074951171875, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2021484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06061544269323349, |
| "step": 639 |
| }, |
| { |
| "epoch": 0.005274565878503672, |
| "grad_norm": 3.0625, |
| "grad_norm_var": 0.49478327433268227, |
| "learning_rate": 5e-05, |
| "loss": 0.2085, |
| "loss/crossentropy": 2.6464812755584717, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1630859375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04545789211988449, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.005282807387688834, |
| "grad_norm": 32.0, |
| "grad_norm_var": 54.56477762858073, |
| "learning_rate": 5e-05, |
| "loss": 0.8256, |
| "loss/crossentropy": 3.0567638874053955, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.546875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.27873265743255615, |
| "step": 641 |
| }, |
| { |
| "epoch": 0.005291048896873996, |
| "grad_norm": 3.09375, |
| "grad_norm_var": 54.431278483072916, |
| "learning_rate": 5e-05, |
| "loss": 0.2481, |
| "loss/crossentropy": 1.814937949180603, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.197265625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.050825513899326324, |
| "step": 642 |
| }, |
| { |
| "epoch": 0.0052992904060591576, |
| "grad_norm": 3.5, |
| "grad_norm_var": 54.29631754557292, |
| "learning_rate": 5e-05, |
| "loss": 0.1537, |
| "loss/crossentropy": 1.2851777076721191, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1337890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.019936833530664444, |
| "step": 643 |
| }, |
| { |
| "epoch": 0.0053075319152443194, |
| "grad_norm": 7.25, |
| "grad_norm_var": 54.24651285807292, |
| "learning_rate": 5e-05, |
| "loss": 0.2315, |
| "loss/crossentropy": 1.3410425186157227, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2001953125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03128086030483246, |
| "step": 644 |
| }, |
| { |
| "epoch": 0.005315773424429481, |
| "grad_norm": 2.34375, |
| "grad_norm_var": 54.49813537597656, |
| "learning_rate": 5e-05, |
| "loss": 0.1844, |
| "loss/crossentropy": 2.3179078102111816, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1435546875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04083427041769028, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.005324014933614643, |
| "grad_norm": 3.90625, |
| "grad_norm_var": 53.952762858072916, |
| "learning_rate": 5e-05, |
| "loss": 0.2196, |
| "loss/crossentropy": 2.614328384399414, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1689453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05064046010375023, |
| "step": 646 |
| }, |
| { |
| "epoch": 0.005332256442799805, |
| "grad_norm": 3088.0, |
| "grad_norm_var": 594094.3569895426, |
| "learning_rate": 5e-05, |
| "loss": 74.0435, |
| "loss/crossentropy": 5.280629634857178, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 70.5, |
| "loss/idx": 0.0, |
| "loss/logits": 3.543522834777832, |
| "step": 647 |
| }, |
| { |
| "epoch": 0.005340497951984968, |
| "grad_norm": 6.65625, |
| "grad_norm_var": 593994.1685831706, |
| "learning_rate": 5e-05, |
| "loss": 0.3181, |
| "loss/crossentropy": 2.1775877475738525, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.26171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05634221434593201, |
| "step": 648 |
| }, |
| { |
| "epoch": 0.00534873946117013, |
| "grad_norm": 3.375, |
| "grad_norm_var": 593944.0428049724, |
| "learning_rate": 5e-05, |
| "loss": 0.2034, |
| "loss/crossentropy": 1.4645721912384033, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.169921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03352264687418938, |
| "step": 649 |
| }, |
| { |
| "epoch": 0.005356980970355292, |
| "grad_norm": 1.15625, |
| "grad_norm_var": 593951.4221018474, |
| "learning_rate": 5e-05, |
| "loss": 0.1173, |
| "loss/crossentropy": 1.4656540155410767, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.0986328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.018643483519554138, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.005365222479540454, |
| "grad_norm": 1.1953125, |
| "grad_norm_var": 594006.2750038147, |
| "learning_rate": 5e-05, |
| "loss": 0.1105, |
| "loss/crossentropy": 0.5469728708267212, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.095703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.014785300940275192, |
| "step": 651 |
| }, |
| { |
| "epoch": 0.0053734639887256155, |
| "grad_norm": 1.796875, |
| "grad_norm_var": 594028.2904518128, |
| "learning_rate": 5e-05, |
| "loss": 0.1277, |
| "loss/crossentropy": 1.5997320413589478, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.103515625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.024179790169000626, |
| "step": 652 |
| }, |
| { |
| "epoch": 0.005381705497910777, |
| "grad_norm": 2.546875, |
| "grad_norm_var": 594019.3290728251, |
| "learning_rate": 5e-05, |
| "loss": 0.2144, |
| "loss/crossentropy": 1.897312045097351, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.16796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04644346237182617, |
| "step": 653 |
| }, |
| { |
| "epoch": 0.005389947007095939, |
| "grad_norm": 7.25, |
| "grad_norm_var": 593904.7219866435, |
| "learning_rate": 5e-05, |
| "loss": 0.1896, |
| "loss/crossentropy": 0.3107888996601105, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.16796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.021600116044282913, |
| "step": 654 |
| }, |
| { |
| "epoch": 0.005398188516281101, |
| "grad_norm": 3.40625, |
| "grad_norm_var": 593915.6656878154, |
| "learning_rate": 5e-05, |
| "loss": 0.2583, |
| "loss/crossentropy": 2.793339967727661, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.201171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.057117462158203125, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.005406430025466263, |
| "grad_norm": 2.328125, |
| "grad_norm_var": 593934.8025632222, |
| "learning_rate": 5e-05, |
| "loss": 0.18, |
| "loss/crossentropy": 1.4796594381332397, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.15234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.027638476341962814, |
| "step": 656 |
| }, |
| { |
| "epoch": 0.005414671534651425, |
| "grad_norm": 1.6875, |
| "grad_norm_var": 594663.6030799865, |
| "learning_rate": 5e-05, |
| "loss": 0.1305, |
| "loss/crossentropy": 1.4301519393920898, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.021160051226615906, |
| "step": 657 |
| }, |
| { |
| "epoch": 0.005422913043836588, |
| "grad_norm": 1.8203125, |
| "grad_norm_var": 594696.4953653972, |
| "learning_rate": 5e-05, |
| "loss": 0.1742, |
| "loss/crossentropy": 1.6321804523468018, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1376953125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03655288740992546, |
| "step": 658 |
| }, |
| { |
| "epoch": 0.00543115455302175, |
| "grad_norm": 3.46875, |
| "grad_norm_var": 594697.2980875651, |
| "learning_rate": 5e-05, |
| "loss": 0.263, |
| "loss/crossentropy": 2.723104476928711, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.19921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0637696385383606, |
| "step": 659 |
| }, |
| { |
| "epoch": 0.0054393960622069115, |
| "grad_norm": 3.140625, |
| "grad_norm_var": 594801.8477040608, |
| "learning_rate": 5e-05, |
| "loss": 0.1395, |
| "loss/crossentropy": 1.4695775508880615, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1181640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.021351780742406845, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.005447637571392073, |
| "grad_norm": 1.9375, |
| "grad_norm_var": 594812.3412261963, |
| "learning_rate": 5e-05, |
| "loss": 0.1788, |
| "loss/crossentropy": 2.657424211502075, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.138671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.040120575577020645, |
| "step": 661 |
| }, |
| { |
| "epoch": 0.005455879080577235, |
| "grad_norm": 5.0, |
| "grad_norm_var": 594784.4235422771, |
| "learning_rate": 5e-05, |
| "loss": 0.1576, |
| "loss/crossentropy": 0.4258406162261963, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1396484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.01790551282465458, |
| "step": 662 |
| }, |
| { |
| "epoch": 0.005464120589762397, |
| "grad_norm": 3.109375, |
| "grad_norm_var": 3.251456705729167, |
| "learning_rate": 5e-05, |
| "loss": 0.2469, |
| "loss/crossentropy": 1.6977794170379639, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.208984375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03788114711642265, |
| "step": 663 |
| }, |
| { |
| "epoch": 0.005472362098947559, |
| "grad_norm": 1.8828125, |
| "grad_norm_var": 2.4230974833170573, |
| "learning_rate": 5e-05, |
| "loss": 0.1297, |
| "loss/crossentropy": 1.3648561239242554, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1083984375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.021297637373209, |
| "step": 664 |
| }, |
| { |
| "epoch": 0.005480603608132721, |
| "grad_norm": 1.828125, |
| "grad_norm_var": 2.4579424540201824, |
| "learning_rate": 5e-05, |
| "loss": 0.1722, |
| "loss/crossentropy": 2.6582846641540527, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1337890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03840293735265732, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.005488845117317883, |
| "grad_norm": 1.703125, |
| "grad_norm_var": 2.3624529520670574, |
| "learning_rate": 5e-05, |
| "loss": 0.1233, |
| "loss/crossentropy": 1.4073052406311035, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.103515625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.019805913791060448, |
| "step": 666 |
| }, |
| { |
| "epoch": 0.005497086626503046, |
| "grad_norm": 3.015625, |
| "grad_norm_var": 2.1906728108723956, |
| "learning_rate": 5e-05, |
| "loss": 0.1421, |
| "loss/crossentropy": 1.2232915163040161, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1181640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02389139123260975, |
| "step": 667 |
| }, |
| { |
| "epoch": 0.0055053281356882075, |
| "grad_norm": 2.078125, |
| "grad_norm_var": 2.155370076497396, |
| "learning_rate": 5e-05, |
| "loss": 0.1769, |
| "loss/crossentropy": 2.633976697921753, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.138671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03825639933347702, |
| "step": 668 |
| }, |
| { |
| "epoch": 0.005513569644873369, |
| "grad_norm": 1.3984375, |
| "grad_norm_var": 2.289989980061849, |
| "learning_rate": 5e-05, |
| "loss": 0.1432, |
| "loss/crossentropy": 2.7594661712646484, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1123046875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.030905555933713913, |
| "step": 669 |
| }, |
| { |
| "epoch": 0.005521811154058531, |
| "grad_norm": 2.875, |
| "grad_norm_var": 0.899731190999349, |
| "learning_rate": 5e-05, |
| "loss": 0.2301, |
| "loss/crossentropy": 2.9316701889038086, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.058252155780792236, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.005530052663243693, |
| "grad_norm": 2.109375, |
| "grad_norm_var": 0.8554888407389323, |
| "learning_rate": 5e-05, |
| "loss": 0.1341, |
| "loss/crossentropy": 1.2458611726760864, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.111328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.022815225645899773, |
| "step": 671 |
| }, |
| { |
| "epoch": 0.005538294172428855, |
| "grad_norm": 1.859375, |
| "grad_norm_var": 0.8775530497233073, |
| "learning_rate": 5e-05, |
| "loss": 0.1773, |
| "loss/crossentropy": 1.787272572517395, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.138671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03861699998378754, |
| "step": 672 |
| }, |
| { |
| "epoch": 0.005546535681614017, |
| "grad_norm": 5.40625, |
| "grad_norm_var": 1.3726600646972655, |
| "learning_rate": 5e-05, |
| "loss": 0.2194, |
| "loss/crossentropy": 1.330421805381775, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1943359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.025075137615203857, |
| "step": 673 |
| }, |
| { |
| "epoch": 0.005554777190799179, |
| "grad_norm": 0.9609375, |
| "grad_norm_var": 1.5155535380045573, |
| "learning_rate": 5e-05, |
| "loss": 0.1263, |
| "loss/crossentropy": 2.6272873878479004, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.024767953902482986, |
| "step": 674 |
| }, |
| { |
| "epoch": 0.005563018699984341, |
| "grad_norm": 5.375, |
| "grad_norm_var": 1.9607175191243489, |
| "learning_rate": 5e-05, |
| "loss": 0.3144, |
| "loss/crossentropy": 3.5015265941619873, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.23828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07607264816761017, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.0055712602091695035, |
| "grad_norm": 2.546875, |
| "grad_norm_var": 1.9502418518066407, |
| "learning_rate": 5e-05, |
| "loss": 0.2386, |
| "loss/crossentropy": 2.711198091506958, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05106600373983383, |
| "step": 676 |
| }, |
| { |
| "epoch": 0.005579501718354665, |
| "grad_norm": 2.765625, |
| "grad_norm_var": 1.909698232014974, |
| "learning_rate": 5e-05, |
| "loss": 0.203, |
| "loss/crossentropy": 1.9062882661819458, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03898348659276962, |
| "step": 677 |
| }, |
| { |
| "epoch": 0.005587743227539827, |
| "grad_norm": 1.796875, |
| "grad_norm_var": 1.5877174377441405, |
| "learning_rate": 5e-05, |
| "loss": 0.1617, |
| "loss/crossentropy": 2.5185117721557617, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.126953125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03473159298300743, |
| "step": 678 |
| }, |
| { |
| "epoch": 0.005595984736724989, |
| "grad_norm": 3.71875, |
| "grad_norm_var": 1.6568275451660157, |
| "learning_rate": 5e-05, |
| "loss": 0.1797, |
| "loss/crossentropy": 1.0520906448364258, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.015656642615795135, |
| "step": 679 |
| }, |
| { |
| "epoch": 0.005604226245910151, |
| "grad_norm": 3.34375, |
| "grad_norm_var": 1.6539265950520834, |
| "learning_rate": 5e-05, |
| "loss": 0.1532, |
| "loss/crossentropy": 1.6271475553512573, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.130859375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.022353362292051315, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.005612467755095313, |
| "grad_norm": 1.6796875, |
| "grad_norm_var": 1.6720415751139324, |
| "learning_rate": 5e-05, |
| "loss": 0.1228, |
| "loss/crossentropy": 0.8219252228736877, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1083984375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.014401828870177269, |
| "step": 681 |
| }, |
| { |
| "epoch": 0.005620709264280475, |
| "grad_norm": 2.46875, |
| "grad_norm_var": 1.6105323791503907, |
| "learning_rate": 5e-05, |
| "loss": 0.1697, |
| "loss/crossentropy": 1.7836802005767822, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03692241013050079, |
| "step": 682 |
| }, |
| { |
| "epoch": 0.005628950773465637, |
| "grad_norm": 5.0, |
| "grad_norm_var": 1.9368690490722655, |
| "learning_rate": 5e-05, |
| "loss": 0.2266, |
| "loss/crossentropy": 2.947277784347534, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04695526510477066, |
| "step": 683 |
| }, |
| { |
| "epoch": 0.005637192282650799, |
| "grad_norm": 3.171875, |
| "grad_norm_var": 1.9010515848795573, |
| "learning_rate": 5e-05, |
| "loss": 0.1932, |
| "loss/crossentropy": 2.684128999710083, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.15234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04086027294397354, |
| "step": 684 |
| }, |
| { |
| "epoch": 0.005645433791835961, |
| "grad_norm": 3.90625, |
| "grad_norm_var": 1.7904368082682292, |
| "learning_rate": 5e-05, |
| "loss": 0.3706, |
| "loss/crossentropy": 2.2005116939544678, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.30078125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06984560191631317, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.005653675301021123, |
| "grad_norm": 4.5625, |
| "grad_norm_var": 1.9264475504557292, |
| "learning_rate": 5e-05, |
| "loss": 0.2539, |
| "loss/crossentropy": 2.7152457237243652, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.19921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05464334040880203, |
| "step": 686 |
| }, |
| { |
| "epoch": 0.005661916810206285, |
| "grad_norm": 3.6875, |
| "grad_norm_var": 1.8595621744791666, |
| "learning_rate": 5e-05, |
| "loss": 0.2398, |
| "loss/crossentropy": 3.029996633529663, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.060077205300331116, |
| "step": 687 |
| }, |
| { |
| "epoch": 0.005670158319391447, |
| "grad_norm": 2.3125, |
| "grad_norm_var": 1.787433878580729, |
| "learning_rate": 5e-05, |
| "loss": 0.1699, |
| "loss/crossentropy": 2.006049156188965, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1376953125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.032181136310100555, |
| "step": 688 |
| }, |
| { |
| "epoch": 0.005678399828576609, |
| "grad_norm": 3.296875, |
| "grad_norm_var": 1.4714396158854166, |
| "learning_rate": 5e-05, |
| "loss": 0.176, |
| "loss/crossentropy": 2.032977819442749, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.138671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.037300050258636475, |
| "step": 689 |
| }, |
| { |
| "epoch": 0.005686641337761771, |
| "grad_norm": 2.0, |
| "grad_norm_var": 1.2339637756347657, |
| "learning_rate": 5e-05, |
| "loss": 0.1712, |
| "loss/crossentropy": 1.572161316871643, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1474609375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.023761317133903503, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.005694882846946933, |
| "grad_norm": 1.578125, |
| "grad_norm_var": 1.0475807189941406, |
| "learning_rate": 5e-05, |
| "loss": 0.1608, |
| "loss/crossentropy": 0.5517882704734802, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.138671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02215992659330368, |
| "step": 691 |
| }, |
| { |
| "epoch": 0.005703124356132095, |
| "grad_norm": 2.1875, |
| "grad_norm_var": 1.076873524983724, |
| "learning_rate": 5e-05, |
| "loss": 0.1669, |
| "loss/crossentropy": 2.8021109104156494, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.130859375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.036080196499824524, |
| "step": 692 |
| }, |
| { |
| "epoch": 0.005711365865317257, |
| "grad_norm": 1.140625, |
| "grad_norm_var": 1.2856056213378906, |
| "learning_rate": 5e-05, |
| "loss": 0.1113, |
| "loss/crossentropy": 0.46256670355796814, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.09716796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.014169261790812016, |
| "step": 693 |
| }, |
| { |
| "epoch": 0.0057196073745024185, |
| "grad_norm": 1.8046875, |
| "grad_norm_var": 1.284496053059896, |
| "learning_rate": 5e-05, |
| "loss": 0.145, |
| "loss/crossentropy": 1.454952359199524, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.12109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.023933004587888718, |
| "step": 694 |
| }, |
| { |
| "epoch": 0.005727848883687581, |
| "grad_norm": 1.34375, |
| "grad_norm_var": 1.3670644124348958, |
| "learning_rate": 5e-05, |
| "loss": 0.1255, |
| "loss/crossentropy": 1.4848099946975708, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.103515625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.021940922364592552, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.005736090392872743, |
| "grad_norm": 4.125, |
| "grad_norm_var": 1.470417277018229, |
| "learning_rate": 5e-05, |
| "loss": 0.2243, |
| "loss/crossentropy": 2.6562836170196533, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1748046875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04948100447654724, |
| "step": 696 |
| }, |
| { |
| "epoch": 0.005744331902057905, |
| "grad_norm": 3.71875, |
| "grad_norm_var": 1.4347735087076823, |
| "learning_rate": 5e-05, |
| "loss": 0.1446, |
| "loss/crossentropy": 1.2862632274627686, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.019616402685642242, |
| "step": 697 |
| }, |
| { |
| "epoch": 0.005752573411243067, |
| "grad_norm": 2.53125, |
| "grad_norm_var": 1.4314735412597657, |
| "learning_rate": 5e-05, |
| "loss": 0.1774, |
| "loss/crossentropy": 2.1273880004882812, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.036726418882608414, |
| "step": 698 |
| }, |
| { |
| "epoch": 0.005760814920428229, |
| "grad_norm": 1.7421875, |
| "grad_norm_var": 1.1817291259765625, |
| "learning_rate": 5e-05, |
| "loss": 0.1739, |
| "loss/crossentropy": 2.4990146160125732, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1357421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03811194375157356, |
| "step": 699 |
| }, |
| { |
| "epoch": 0.005769056429613391, |
| "grad_norm": 4.15625, |
| "grad_norm_var": 1.30496826171875, |
| "learning_rate": 5e-05, |
| "loss": 0.252, |
| "loss/crossentropy": 2.5132620334625244, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2041015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04793284088373184, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.005777297938798553, |
| "grad_norm": 3.359375, |
| "grad_norm_var": 1.2397776285807292, |
| "learning_rate": 5e-05, |
| "loss": 0.2582, |
| "loss/crossentropy": 2.5892200469970703, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.203125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05511770024895668, |
| "step": 701 |
| }, |
| { |
| "epoch": 0.0057855394479837145, |
| "grad_norm": 3.328125, |
| "grad_norm_var": 1.0320393880208334, |
| "learning_rate": 5e-05, |
| "loss": 0.2048, |
| "loss/crossentropy": 1.6278858184814453, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1650390625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03974189609289169, |
| "step": 702 |
| }, |
| { |
| "epoch": 0.005793780957168876, |
| "grad_norm": 5.4375, |
| "grad_norm_var": 1.4668050130208334, |
| "learning_rate": 5e-05, |
| "loss": 0.2686, |
| "loss/crossentropy": 1.9843369722366333, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.220703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.047865502536296844, |
| "step": 703 |
| }, |
| { |
| "epoch": 0.005802022466354039, |
| "grad_norm": 1.1796875, |
| "grad_norm_var": 1.6136797587076823, |
| "learning_rate": 5e-05, |
| "loss": 0.1149, |
| "loss/crossentropy": 1.3343570232391357, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.095703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.019155774265527725, |
| "step": 704 |
| }, |
| { |
| "epoch": 0.005810263975539201, |
| "grad_norm": 2.78125, |
| "grad_norm_var": 1.5880999247233072, |
| "learning_rate": 5e-05, |
| "loss": 0.2146, |
| "loss/crossentropy": 1.3166966438293457, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03489375486969948, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.005818505484724363, |
| "grad_norm": 3.109375, |
| "grad_norm_var": 1.568743642171224, |
| "learning_rate": 5e-05, |
| "loss": 0.2285, |
| "loss/crossentropy": 1.6788283586502075, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1845703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.043936047703027725, |
| "step": 706 |
| }, |
| { |
| "epoch": 0.005826746993909525, |
| "grad_norm": 3.3125, |
| "grad_norm_var": 1.4926389058430989, |
| "learning_rate": 5e-05, |
| "loss": 0.1728, |
| "loss/crossentropy": 2.161801815032959, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.138671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.034114591777324677, |
| "step": 707 |
| }, |
| { |
| "epoch": 0.005834988503094687, |
| "grad_norm": 3.015625, |
| "grad_norm_var": 1.4647112528483073, |
| "learning_rate": 5e-05, |
| "loss": 0.1577, |
| "loss/crossentropy": 2.5755579471588135, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.032723598182201385, |
| "step": 708 |
| }, |
| { |
| "epoch": 0.005843230012279849, |
| "grad_norm": 3.015625, |
| "grad_norm_var": 1.2495012919108073, |
| "learning_rate": 5e-05, |
| "loss": 0.2659, |
| "loss/crossentropy": 2.3093197345733643, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.21484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05105920881032944, |
| "step": 709 |
| }, |
| { |
| "epoch": 0.0058514715214650105, |
| "grad_norm": 3.3125, |
| "grad_norm_var": 1.1517781575520833, |
| "learning_rate": 5e-05, |
| "loss": 0.2048, |
| "loss/crossentropy": 1.972427248954773, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04072684049606323, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.005859713030650172, |
| "grad_norm": 1.375, |
| "grad_norm_var": 1.1445556640625, |
| "learning_rate": 5e-05, |
| "loss": 0.1302, |
| "loss/crossentropy": 1.6267015933990479, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.107421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.022783808410167694, |
| "step": 711 |
| }, |
| { |
| "epoch": 0.005867954539835334, |
| "grad_norm": 3.046875, |
| "grad_norm_var": 1.0689605712890624, |
| "learning_rate": 5e-05, |
| "loss": 0.1712, |
| "loss/crossentropy": 2.76011323928833, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1337890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.037429310381412506, |
| "step": 712 |
| }, |
| { |
| "epoch": 0.005876196049020497, |
| "grad_norm": 1.8125, |
| "grad_norm_var": 1.1200917561848958, |
| "learning_rate": 5e-05, |
| "loss": 0.1373, |
| "loss/crossentropy": 1.5537292957305908, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.11572265625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.021540062502026558, |
| "step": 713 |
| }, |
| { |
| "epoch": 0.005884437558205659, |
| "grad_norm": 1.90625, |
| "grad_norm_var": 1.1758371988932292, |
| "learning_rate": 5e-05, |
| "loss": 0.1585, |
| "loss/crossentropy": 2.612987995147705, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1220703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.036412760615348816, |
| "step": 714 |
| }, |
| { |
| "epoch": 0.005892679067390821, |
| "grad_norm": 1.8671875, |
| "grad_norm_var": 1.1580474853515625, |
| "learning_rate": 5e-05, |
| "loss": 0.1586, |
| "loss/crossentropy": 2.5890583992004395, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.033603958785533905, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.005900920576575983, |
| "grad_norm": 1.671875, |
| "grad_norm_var": 1.11971435546875, |
| "learning_rate": 5e-05, |
| "loss": 0.1545, |
| "loss/crossentropy": 2.612929344177246, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1181640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0363757461309433, |
| "step": 716 |
| }, |
| { |
| "epoch": 0.005909162085761145, |
| "grad_norm": 3.078125, |
| "grad_norm_var": 1.1007080078125, |
| "learning_rate": 5e-05, |
| "loss": 0.1632, |
| "loss/crossentropy": 0.4507390856742859, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1416015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.021595872938632965, |
| "step": 717 |
| }, |
| { |
| "epoch": 0.0059174035949463065, |
| "grad_norm": 8.25, |
| "grad_norm_var": 3.0249176025390625, |
| "learning_rate": 5e-05, |
| "loss": 0.1985, |
| "loss/crossentropy": 0.8891280889511108, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1748046875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.023697488009929657, |
| "step": 718 |
| }, |
| { |
| "epoch": 0.005925645104131468, |
| "grad_norm": 0.90625, |
| "grad_norm_var": 2.842015584309896, |
| "learning_rate": 5e-05, |
| "loss": 0.1113, |
| "loss/crossentropy": 1.2778152227401733, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.09423828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.017108086496591568, |
| "step": 719 |
| }, |
| { |
| "epoch": 0.00593388661331663, |
| "grad_norm": 3.625, |
| "grad_norm_var": 2.711073557535807, |
| "learning_rate": 5e-05, |
| "loss": 0.1612, |
| "loss/crossentropy": 2.4469552040100098, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1318359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.029361439868807793, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.005942128122501792, |
| "grad_norm": 1.4453125, |
| "grad_norm_var": 2.8402750651041666, |
| "learning_rate": 5e-05, |
| "loss": 0.1502, |
| "loss/crossentropy": 2.1400833129882812, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.11669921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.033529091626405716, |
| "step": 721 |
| }, |
| { |
| "epoch": 0.005950369631686954, |
| "grad_norm": 2.109375, |
| "grad_norm_var": 2.8611083984375, |
| "learning_rate": 5e-05, |
| "loss": 0.1749, |
| "loss/crossentropy": 1.3967795372009277, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.142578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03236193209886551, |
| "step": 722 |
| }, |
| { |
| "epoch": 0.005958611140872117, |
| "grad_norm": 2.15625, |
| "grad_norm_var": 2.855537923177083, |
| "learning_rate": 5e-05, |
| "loss": 0.2143, |
| "loss/crossentropy": 2.475792646408081, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.166015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.048240065574645996, |
| "step": 723 |
| }, |
| { |
| "epoch": 0.005966852650057279, |
| "grad_norm": 2.359375, |
| "grad_norm_var": 2.8515218098958335, |
| "learning_rate": 5e-05, |
| "loss": 0.1735, |
| "loss/crossentropy": 2.5644333362579346, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04069886356592178, |
| "step": 724 |
| }, |
| { |
| "epoch": 0.005975094159242441, |
| "grad_norm": 2.0625, |
| "grad_norm_var": 2.8581614176432293, |
| "learning_rate": 5e-05, |
| "loss": 0.1677, |
| "loss/crossentropy": 2.2065787315368652, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.130859375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0367942750453949, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.0059833356684276025, |
| "grad_norm": 2.734375, |
| "grad_norm_var": 2.8211629231770834, |
| "learning_rate": 5e-05, |
| "loss": 0.2124, |
| "loss/crossentropy": 2.876722574234009, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1630859375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.049362972378730774, |
| "step": 726 |
| }, |
| { |
| "epoch": 0.0059915771776127644, |
| "grad_norm": 8.0, |
| "grad_norm_var": 4.548148600260417, |
| "learning_rate": 5e-05, |
| "loss": 0.1838, |
| "loss/crossentropy": 1.2283939123153687, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.15234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03141150623559952, |
| "step": 727 |
| }, |
| { |
| "epoch": 0.005999818686797926, |
| "grad_norm": 1.8671875, |
| "grad_norm_var": 4.618230946858724, |
| "learning_rate": 5e-05, |
| "loss": 0.1644, |
| "loss/crossentropy": 2.0428686141967773, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03161695599555969, |
| "step": 728 |
| }, |
| { |
| "epoch": 0.006008060195983088, |
| "grad_norm": 7.65625, |
| "grad_norm_var": 5.931933339436849, |
| "learning_rate": 5e-05, |
| "loss": 0.3075, |
| "loss/crossentropy": 2.5533790588378906, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.240234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0672769546508789, |
| "step": 729 |
| }, |
| { |
| "epoch": 0.00601630170516825, |
| "grad_norm": 2.46875, |
| "grad_norm_var": 5.852355702718099, |
| "learning_rate": 5e-05, |
| "loss": 0.1774, |
| "loss/crossentropy": 1.7351176738739014, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.150390625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.027032926678657532, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.006024543214353412, |
| "grad_norm": 1.9453125, |
| "grad_norm_var": 5.838165028889974, |
| "learning_rate": 5e-05, |
| "loss": 0.1815, |
| "loss/crossentropy": 2.6725094318389893, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.142578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.038890693336725235, |
| "step": 731 |
| }, |
| { |
| "epoch": 0.006032784723538575, |
| "grad_norm": 2.28125, |
| "grad_norm_var": 5.731445058186849, |
| "learning_rate": 5e-05, |
| "loss": 0.1753, |
| "loss/crossentropy": 1.6290442943572998, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1435546875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.031768545508384705, |
| "step": 732 |
| }, |
| { |
| "epoch": 0.006041026232723737, |
| "grad_norm": 5.59375, |
| "grad_norm_var": 6.049501291910807, |
| "learning_rate": 5e-05, |
| "loss": 0.2555, |
| "loss/crossentropy": 2.4887194633483887, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.205078125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05038648098707199, |
| "step": 733 |
| }, |
| { |
| "epoch": 0.0060492677419088986, |
| "grad_norm": 1.859375, |
| "grad_norm_var": 4.525903065999349, |
| "learning_rate": 5e-05, |
| "loss": 0.1672, |
| "loss/crossentropy": 1.4916491508483887, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.134765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03243900462985039, |
| "step": 734 |
| }, |
| { |
| "epoch": 0.0060575092510940605, |
| "grad_norm": 6.28125, |
| "grad_norm_var": 4.783105214436849, |
| "learning_rate": 5e-05, |
| "loss": 0.2469, |
| "loss/crossentropy": 1.9904972314834595, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.193359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05358533933758736, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.006065750760279222, |
| "grad_norm": 4.90625, |
| "grad_norm_var": 4.923659006754558, |
| "learning_rate": 5e-05, |
| "loss": 0.3124, |
| "loss/crossentropy": 3.1595237255096436, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.240234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07215666770935059, |
| "step": 736 |
| }, |
| { |
| "epoch": 0.006073992269464384, |
| "grad_norm": 2.296875, |
| "grad_norm_var": 4.737629191080729, |
| "learning_rate": 5e-05, |
| "loss": 0.1351, |
| "loss/crossentropy": 0.5982239842414856, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.11767578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.017444239929318428, |
| "step": 737 |
| }, |
| { |
| "epoch": 0.006082233778649546, |
| "grad_norm": 2.1875, |
| "grad_norm_var": 4.723148600260417, |
| "learning_rate": 5e-05, |
| "loss": 0.1614, |
| "loss/crossentropy": 1.076785922050476, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.13671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02469494380056858, |
| "step": 738 |
| }, |
| { |
| "epoch": 0.006090475287834708, |
| "grad_norm": 2.609375, |
| "grad_norm_var": 4.6523183186848955, |
| "learning_rate": 5e-05, |
| "loss": 0.2057, |
| "loss/crossentropy": 2.2849977016448975, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1513671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05430486798286438, |
| "step": 739 |
| }, |
| { |
| "epoch": 0.00609871679701987, |
| "grad_norm": 2.578125, |
| "grad_norm_var": 4.620018513997396, |
| "learning_rate": 5e-05, |
| "loss": 0.2088, |
| "loss/crossentropy": 1.5881438255310059, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1708984375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.037879034876823425, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.006106958306205033, |
| "grad_norm": 1.671875, |
| "grad_norm_var": 4.708748372395833, |
| "learning_rate": 5e-05, |
| "loss": 0.1337, |
| "loss/crossentropy": 1.5729397535324097, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1142578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.01942865364253521, |
| "step": 741 |
| }, |
| { |
| "epoch": 0.006115199815390195, |
| "grad_norm": 1.7578125, |
| "grad_norm_var": 4.875673166910807, |
| "learning_rate": 5e-05, |
| "loss": 0.1473, |
| "loss/crossentropy": 2.5460681915283203, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.11865234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.028684455901384354, |
| "step": 742 |
| }, |
| { |
| "epoch": 0.0061234413245753565, |
| "grad_norm": 3.484375, |
| "grad_norm_var": 3.4392575581868488, |
| "learning_rate": 5e-05, |
| "loss": 0.197, |
| "loss/crossentropy": 2.502234697341919, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.158203125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03875020891427994, |
| "step": 743 |
| }, |
| { |
| "epoch": 0.006131682833760518, |
| "grad_norm": 1.5625, |
| "grad_norm_var": 3.499828084309896, |
| "learning_rate": 5e-05, |
| "loss": 0.1167, |
| "loss/crossentropy": 0.5112316608428955, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.10546875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.011273887008428574, |
| "step": 744 |
| }, |
| { |
| "epoch": 0.00613992434294568, |
| "grad_norm": 1.734375, |
| "grad_norm_var": 2.17010498046875, |
| "learning_rate": 5e-05, |
| "loss": 0.1642, |
| "loss/crossentropy": 1.5730743408203125, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.134765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.029403435066342354, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.006148165852130842, |
| "grad_norm": 5.875, |
| "grad_norm_var": 2.7329345703125, |
| "learning_rate": 5e-05, |
| "loss": 0.2765, |
| "loss/crossentropy": 2.3317244052886963, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2236328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05287738889455795, |
| "step": 746 |
| }, |
| { |
| "epoch": 0.006156407361316004, |
| "grad_norm": 4.65625, |
| "grad_norm_var": 2.7969134012858072, |
| "learning_rate": 5e-05, |
| "loss": 0.3948, |
| "loss/crossentropy": 2.6969289779663086, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.287109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.10772477090358734, |
| "step": 747 |
| }, |
| { |
| "epoch": 0.006164648870501166, |
| "grad_norm": 2.34375, |
| "grad_norm_var": 2.7894304911295573, |
| "learning_rate": 5e-05, |
| "loss": 0.1563, |
| "loss/crossentropy": 2.7726356983184814, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1240234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03231953829526901, |
| "step": 748 |
| }, |
| { |
| "epoch": 0.006172890379686328, |
| "grad_norm": 2.3125, |
| "grad_norm_var": 2.4205034891764323, |
| "learning_rate": 5e-05, |
| "loss": 0.1669, |
| "loss/crossentropy": 2.8912436962127686, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.130859375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03605188801884651, |
| "step": 749 |
| }, |
| { |
| "epoch": 0.00618113188887149, |
| "grad_norm": 1.6796875, |
| "grad_norm_var": 2.4500244140625, |
| "learning_rate": 5e-05, |
| "loss": 0.1556, |
| "loss/crossentropy": 1.5787498950958252, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1318359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02375878393650055, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.0061893733980566525, |
| "grad_norm": 6.1875, |
| "grad_norm_var": 2.40950927734375, |
| "learning_rate": 5e-05, |
| "loss": 0.2622, |
| "loss/crossentropy": 1.5021111965179443, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.212890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04928001016378403, |
| "step": 751 |
| }, |
| { |
| "epoch": 0.006197614907241814, |
| "grad_norm": 1.828125, |
| "grad_norm_var": 2.2153228759765624, |
| "learning_rate": 5e-05, |
| "loss": 0.1812, |
| "loss/crossentropy": 2.771916627883911, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.13671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.044479530304670334, |
| "step": 752 |
| }, |
| { |
| "epoch": 0.006205856416426976, |
| "grad_norm": 2.265625, |
| "grad_norm_var": 2.2174713134765627, |
| "learning_rate": 5e-05, |
| "loss": 0.1752, |
| "loss/crossentropy": 1.6487168073654175, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.142578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03264114260673523, |
| "step": 753 |
| }, |
| { |
| "epoch": 0.006214097925612138, |
| "grad_norm": 2.90625, |
| "grad_norm_var": 2.191454060872396, |
| "learning_rate": 5e-05, |
| "loss": 0.2076, |
| "loss/crossentropy": 3.0564393997192383, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.158203125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04940104857087135, |
| "step": 754 |
| }, |
| { |
| "epoch": 0.0062223394347973, |
| "grad_norm": 1.7109375, |
| "grad_norm_var": 2.269628651936849, |
| "learning_rate": 5e-05, |
| "loss": 0.1733, |
| "loss/crossentropy": 2.259493112564087, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1337890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03946584463119507, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.006230580943982462, |
| "grad_norm": 1.8046875, |
| "grad_norm_var": 2.328316243489583, |
| "learning_rate": 5e-05, |
| "loss": 0.1575, |
| "loss/crossentropy": 2.8051905632019043, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1201171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.037370190024375916, |
| "step": 756 |
| }, |
| { |
| "epoch": 0.006238822453167624, |
| "grad_norm": 2.65625, |
| "grad_norm_var": 2.2491689046223957, |
| "learning_rate": 5e-05, |
| "loss": 0.193, |
| "loss/crossentropy": 2.3121681213378906, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1494140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.043607860803604126, |
| "step": 757 |
| }, |
| { |
| "epoch": 0.006247063962352786, |
| "grad_norm": 1.7265625, |
| "grad_norm_var": 2.2535634358723957, |
| "learning_rate": 5e-05, |
| "loss": 0.1707, |
| "loss/crossentropy": 2.660825490951538, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.130859375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03984237462282181, |
| "step": 758 |
| }, |
| { |
| "epoch": 0.006255305471537948, |
| "grad_norm": 2.46875, |
| "grad_norm_var": 2.224800618489583, |
| "learning_rate": 5e-05, |
| "loss": 0.1356, |
| "loss/crossentropy": 1.1409167051315308, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.11279296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02284255623817444, |
| "step": 759 |
| }, |
| { |
| "epoch": 0.00626354698072311, |
| "grad_norm": 1.96875, |
| "grad_norm_var": 2.171744791666667, |
| "learning_rate": 5e-05, |
| "loss": 0.1432, |
| "loss/crossentropy": 1.4278790950775146, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.115234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02792993187904358, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.006271788489908272, |
| "grad_norm": 2.875, |
| "grad_norm_var": 2.0974110921223956, |
| "learning_rate": 5e-05, |
| "loss": 0.1644, |
| "loss/crossentropy": 1.693648099899292, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1357421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.028608174994587898, |
| "step": 761 |
| }, |
| { |
| "epoch": 0.006280029999093434, |
| "grad_norm": 1.4140625, |
| "grad_norm_var": 1.5294837951660156, |
| "learning_rate": 5e-05, |
| "loss": 0.1372, |
| "loss/crossentropy": 1.6515194177627563, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.020010514184832573, |
| "step": 762 |
| }, |
| { |
| "epoch": 0.006288271508278596, |
| "grad_norm": 3.578125, |
| "grad_norm_var": 1.2993995666503906, |
| "learning_rate": 5e-05, |
| "loss": 0.228, |
| "loss/crossentropy": 2.8086752891540527, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.17578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.052195869386196136, |
| "step": 763 |
| }, |
| { |
| "epoch": 0.006296513017463758, |
| "grad_norm": 2.515625, |
| "grad_norm_var": 1.2980567932128906, |
| "learning_rate": 5e-05, |
| "loss": 0.1333, |
| "loss/crossentropy": 2.0333292484283447, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.023911556228995323, |
| "step": 764 |
| }, |
| { |
| "epoch": 0.00630475452664892, |
| "grad_norm": 1.703125, |
| "grad_norm_var": 1.3359840393066407, |
| "learning_rate": 5e-05, |
| "loss": 0.1194, |
| "loss/crossentropy": 1.3590047359466553, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.10302734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.016341259703040123, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.006312996035834082, |
| "grad_norm": 1.875, |
| "grad_norm_var": 1.3181630452473958, |
| "learning_rate": 5e-05, |
| "loss": 0.1641, |
| "loss/crossentropy": 2.541020631790161, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.12890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03523167222738266, |
| "step": 766 |
| }, |
| { |
| "epoch": 0.006321237545019244, |
| "grad_norm": 2.15625, |
| "grad_norm_var": 0.3344960530598958, |
| "learning_rate": 5e-05, |
| "loss": 0.1642, |
| "loss/crossentropy": 3.1116065979003906, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1259765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.038257598876953125, |
| "step": 767 |
| }, |
| { |
| "epoch": 0.0063294790542044056, |
| "grad_norm": 3.984375, |
| "grad_norm_var": 0.5136220296223958, |
| "learning_rate": 5e-05, |
| "loss": 0.2127, |
| "loss/crossentropy": 0.4452613890171051, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1884765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.024267811328172684, |
| "step": 768 |
| }, |
| { |
| "epoch": 0.006337720563389568, |
| "grad_norm": 2.421875, |
| "grad_norm_var": 0.5133778889973958, |
| "learning_rate": 5e-05, |
| "loss": 0.1991, |
| "loss/crossentropy": 2.1372740268707275, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1572265625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04187183082103729, |
| "step": 769 |
| }, |
| { |
| "epoch": 0.00634596207257473, |
| "grad_norm": 2.1875, |
| "grad_norm_var": 0.4933502197265625, |
| "learning_rate": 5e-05, |
| "loss": 0.147, |
| "loss/crossentropy": 1.2034099102020264, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1220703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.024930456653237343, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.006354203581759892, |
| "grad_norm": 1.640625, |
| "grad_norm_var": 0.4993263244628906, |
| "learning_rate": 5e-05, |
| "loss": 0.154, |
| "loss/crossentropy": 1.4210480451583862, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0289776474237442, |
| "step": 771 |
| }, |
| { |
| "epoch": 0.006362445090945054, |
| "grad_norm": 2.90625, |
| "grad_norm_var": 0.5007965087890625, |
| "learning_rate": 5e-05, |
| "loss": 0.1915, |
| "loss/crossentropy": 2.461935520172119, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1494140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.042080432176589966, |
| "step": 772 |
| }, |
| { |
| "epoch": 0.006370686600130216, |
| "grad_norm": 3.984375, |
| "grad_norm_var": 0.6599812825520833, |
| "learning_rate": 5e-05, |
| "loss": 0.2646, |
| "loss/crossentropy": 2.870908260345459, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2041015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.06047297269105911, |
| "step": 773 |
| }, |
| { |
| "epoch": 0.006378928109315378, |
| "grad_norm": 2.015625, |
| "grad_norm_var": 0.6368242899576823, |
| "learning_rate": 5e-05, |
| "loss": 0.1692, |
| "loss/crossentropy": 2.4187848567962646, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.134765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03446546941995621, |
| "step": 774 |
| }, |
| { |
| "epoch": 0.00638716961850054, |
| "grad_norm": 3.140625, |
| "grad_norm_var": 0.6639442443847656, |
| "learning_rate": 5e-05, |
| "loss": 0.2993, |
| "loss/crossentropy": 2.4693610668182373, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.21484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.08449941873550415, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.006395411127685702, |
| "grad_norm": 1.7578125, |
| "grad_norm_var": 0.68231201171875, |
| "learning_rate": 5e-05, |
| "loss": 0.1579, |
| "loss/crossentropy": 2.635270833969116, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1259765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.031969744712114334, |
| "step": 776 |
| }, |
| { |
| "epoch": 0.0064036526368708635, |
| "grad_norm": 1.203125, |
| "grad_norm_var": 0.7755930582682292, |
| "learning_rate": 5e-05, |
| "loss": 0.1124, |
| "loss/crossentropy": 1.4390558004379272, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.0966796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.015761706978082657, |
| "step": 777 |
| }, |
| { |
| "epoch": 0.006411894146056025, |
| "grad_norm": 4.09375, |
| "grad_norm_var": 0.8702369689941406, |
| "learning_rate": 5e-05, |
| "loss": 0.3235, |
| "loss/crossentropy": 1.998376727104187, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.263671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05982797592878342, |
| "step": 778 |
| }, |
| { |
| "epoch": 0.006420135655241188, |
| "grad_norm": 2.015625, |
| "grad_norm_var": 0.813372548421224, |
| "learning_rate": 5e-05, |
| "loss": 0.2049, |
| "loss/crossentropy": 1.550020694732666, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.16015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04472289979457855, |
| "step": 779 |
| }, |
| { |
| "epoch": 0.00642837716442635, |
| "grad_norm": 1.2578125, |
| "grad_norm_var": 0.90545654296875, |
| "learning_rate": 5e-05, |
| "loss": 0.1262, |
| "loss/crossentropy": 2.6197104454040527, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.099609375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02659156545996666, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.006436618673611512, |
| "grad_norm": 2.25, |
| "grad_norm_var": 0.8735911051432291, |
| "learning_rate": 5e-05, |
| "loss": 0.1781, |
| "loss/crossentropy": 2.597325563430786, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.045299261808395386, |
| "step": 781 |
| }, |
| { |
| "epoch": 0.006444860182796674, |
| "grad_norm": 2.09375, |
| "grad_norm_var": 0.8603749593098958, |
| "learning_rate": 5e-05, |
| "loss": 0.1826, |
| "loss/crossentropy": 1.9669688940048218, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1455078125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03709650784730911, |
| "step": 782 |
| }, |
| { |
| "epoch": 0.006453101691981836, |
| "grad_norm": 1.703125, |
| "grad_norm_var": 0.89061279296875, |
| "learning_rate": 5e-05, |
| "loss": 0.1355, |
| "loss/crossentropy": 1.5600085258483887, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.111328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02414374053478241, |
| "step": 783 |
| }, |
| { |
| "epoch": 0.006461343201166998, |
| "grad_norm": 3.65625, |
| "grad_norm_var": 0.8287261962890625, |
| "learning_rate": 5e-05, |
| "loss": 0.2038, |
| "loss/crossentropy": 2.052290678024292, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.169921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.033876098692417145, |
| "step": 784 |
| }, |
| { |
| "epoch": 0.0064695847103521595, |
| "grad_norm": 1.78125, |
| "grad_norm_var": 0.8521240234375, |
| "learning_rate": 5e-05, |
| "loss": 0.1581, |
| "loss/crossentropy": 0.9798577427864075, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.134765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.023312989622354507, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.006477826219537321, |
| "grad_norm": 2.296875, |
| "grad_norm_var": 0.8504221598307292, |
| "learning_rate": 5e-05, |
| "loss": 0.147, |
| "loss/crossentropy": 2.4323980808258057, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.11865234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.028353385627269745, |
| "step": 786 |
| }, |
| { |
| "epoch": 0.006486067728722483, |
| "grad_norm": 3.484375, |
| "grad_norm_var": 0.8854726155598959, |
| "learning_rate": 5e-05, |
| "loss": 0.2295, |
| "loss/crossentropy": 2.9406378269195557, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.17578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05376865714788437, |
| "step": 787 |
| }, |
| { |
| "epoch": 0.006494309237907646, |
| "grad_norm": 1.71875, |
| "grad_norm_var": 0.9057281494140625, |
| "learning_rate": 5e-05, |
| "loss": 0.1408, |
| "loss/crossentropy": 2.3078272342681885, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.111328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.029444556683301926, |
| "step": 788 |
| }, |
| { |
| "epoch": 0.006502550747092808, |
| "grad_norm": 1.125, |
| "grad_norm_var": 0.81395263671875, |
| "learning_rate": 5e-05, |
| "loss": 0.1168, |
| "loss/crossentropy": 1.5063586235046387, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.095703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.021064041182398796, |
| "step": 789 |
| }, |
| { |
| "epoch": 0.00651079225627797, |
| "grad_norm": 2.09375, |
| "grad_norm_var": 0.8121571858723958, |
| "learning_rate": 5e-05, |
| "loss": 0.1659, |
| "loss/crossentropy": 2.5850398540496826, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1337890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03208760544657707, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.006519033765463132, |
| "grad_norm": 3.03125, |
| "grad_norm_var": 0.7996175130208333, |
| "learning_rate": 5e-05, |
| "loss": 0.1852, |
| "loss/crossentropy": 1.5392999649047852, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.146484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.038750022649765015, |
| "step": 791 |
| }, |
| { |
| "epoch": 0.006527275274648294, |
| "grad_norm": 1.7890625, |
| "grad_norm_var": 0.79774169921875, |
| "learning_rate": 5e-05, |
| "loss": 0.1902, |
| "loss/crossentropy": 2.854396343231201, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.146484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.043708011507987976, |
| "step": 792 |
| }, |
| { |
| "epoch": 0.0065355167838334555, |
| "grad_norm": 3.671875, |
| "grad_norm_var": 0.8424235026041667, |
| "learning_rate": 5e-05, |
| "loss": 0.2264, |
| "loss/crossentropy": 2.2643284797668457, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.18359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04277587682008743, |
| "step": 793 |
| }, |
| { |
| "epoch": 0.006543758293018617, |
| "grad_norm": 2.96875, |
| "grad_norm_var": 0.6642985026041667, |
| "learning_rate": 5e-05, |
| "loss": 0.161, |
| "loss/crossentropy": 2.5523831844329834, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1279296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0331122949719429, |
| "step": 794 |
| }, |
| { |
| "epoch": 0.006551999802203779, |
| "grad_norm": 2.328125, |
| "grad_norm_var": 0.6581949869791667, |
| "learning_rate": 5e-05, |
| "loss": 0.177, |
| "loss/crossentropy": 3.067322015762329, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1357421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.041216038167476654, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.006560241311388941, |
| "grad_norm": 1.171875, |
| "grad_norm_var": 0.6709205627441406, |
| "learning_rate": 5e-05, |
| "loss": 0.1258, |
| "loss/crossentropy": 1.477942943572998, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1044921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.021268734708428383, |
| "step": 796 |
| }, |
| { |
| "epoch": 0.006568482820574104, |
| "grad_norm": 7.375, |
| "grad_norm_var": 2.2628069559733075, |
| "learning_rate": 5e-05, |
| "loss": 0.2407, |
| "loss/crossentropy": 2.4961607456207275, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.19140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.049283482134342194, |
| "step": 797 |
| }, |
| { |
| "epoch": 0.006576724329759266, |
| "grad_norm": 2.828125, |
| "grad_norm_var": 2.2427263895670575, |
| "learning_rate": 5e-05, |
| "loss": 0.2416, |
| "loss/crossentropy": 2.435840129852295, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1826171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05896880850195885, |
| "step": 798 |
| }, |
| { |
| "epoch": 0.006584965838944428, |
| "grad_norm": 1.5703125, |
| "grad_norm_var": 2.261286417643229, |
| "learning_rate": 5e-05, |
| "loss": 0.147, |
| "loss/crossentropy": 1.4684247970581055, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.119140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.027891069650650024, |
| "step": 799 |
| }, |
| { |
| "epoch": 0.00659320734812959, |
| "grad_norm": 1.296875, |
| "grad_norm_var": 2.302298990885417, |
| "learning_rate": 5e-05, |
| "loss": 0.122, |
| "loss/crossentropy": 1.3427636623382568, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.020389681681990623, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.0066014488573147515, |
| "grad_norm": 2.46875, |
| "grad_norm_var": 2.2629109700520833, |
| "learning_rate": 5e-05, |
| "loss": 0.0932, |
| "loss/crossentropy": 0.27775296568870544, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.08642578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.006820976734161377, |
| "step": 801 |
| }, |
| { |
| "epoch": 0.006609690366499913, |
| "grad_norm": 1.4140625, |
| "grad_norm_var": 2.344496409098307, |
| "learning_rate": 5e-05, |
| "loss": 0.1262, |
| "loss/crossentropy": 0.8798704147338867, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.10791015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.018274936825037003, |
| "step": 802 |
| }, |
| { |
| "epoch": 0.006617931875685075, |
| "grad_norm": 1.625, |
| "grad_norm_var": 2.321738433837891, |
| "learning_rate": 5e-05, |
| "loss": 0.1389, |
| "loss/crossentropy": 2.3500454425811768, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1103515625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02855532616376877, |
| "step": 803 |
| }, |
| { |
| "epoch": 0.006626173384870237, |
| "grad_norm": 3.21875, |
| "grad_norm_var": 2.325156402587891, |
| "learning_rate": 5e-05, |
| "loss": 0.2325, |
| "loss/crossentropy": 2.58161997795105, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.177734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05475683510303497, |
| "step": 804 |
| }, |
| { |
| "epoch": 0.006634414894055399, |
| "grad_norm": 2.265625, |
| "grad_norm_var": 2.1975786844889322, |
| "learning_rate": 5e-05, |
| "loss": 0.2116, |
| "loss/crossentropy": 2.6687753200531006, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.16015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.051458559930324554, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.006642656403240561, |
| "grad_norm": 3.078125, |
| "grad_norm_var": 2.1956560770670572, |
| "learning_rate": 5e-05, |
| "loss": 0.1836, |
| "loss/crossentropy": 2.341993808746338, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.150390625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.033185191452503204, |
| "step": 806 |
| }, |
| { |
| "epoch": 0.006650897912425724, |
| "grad_norm": 1.8515625, |
| "grad_norm_var": 2.2197336832682293, |
| "learning_rate": 5e-05, |
| "loss": 0.1828, |
| "loss/crossentropy": 2.465346336364746, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1416015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.041189346462488174, |
| "step": 807 |
| }, |
| { |
| "epoch": 0.006659139421610886, |
| "grad_norm": 1.09375, |
| "grad_norm_var": 2.3212013244628906, |
| "learning_rate": 5e-05, |
| "loss": 0.1117, |
| "loss/crossentropy": 1.3634965419769287, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.0927734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.01888800971210003, |
| "step": 808 |
| }, |
| { |
| "epoch": 0.0066673809307960475, |
| "grad_norm": 1.046875, |
| "grad_norm_var": 2.3466651916503904, |
| "learning_rate": 5e-05, |
| "loss": 0.095, |
| "loss/crossentropy": 1.5679908990859985, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.08251953125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.012483851984143257, |
| "step": 809 |
| }, |
| { |
| "epoch": 0.006675622439981209, |
| "grad_norm": 2.046875, |
| "grad_norm_var": 2.3237383524576822, |
| "learning_rate": 5e-05, |
| "loss": 0.1209, |
| "loss/crossentropy": 1.3377238512039185, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.10107421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.01983119174838066, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.006683863949166371, |
| "grad_norm": 2.875, |
| "grad_norm_var": 2.3450294494628907, |
| "learning_rate": 5e-05, |
| "loss": 0.2099, |
| "loss/crossentropy": 1.367640495300293, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1826171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.027279119938611984, |
| "step": 811 |
| }, |
| { |
| "epoch": 0.006692105458351533, |
| "grad_norm": 2.453125, |
| "grad_norm_var": 2.2503537495930988, |
| "learning_rate": 5e-05, |
| "loss": 0.2034, |
| "loss/crossentropy": 2.2119548320770264, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.158203125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.045244939625263214, |
| "step": 812 |
| }, |
| { |
| "epoch": 0.006700346967536695, |
| "grad_norm": 2.140625, |
| "grad_norm_var": 0.4953386942545573, |
| "learning_rate": 5e-05, |
| "loss": 0.1415, |
| "loss/crossentropy": 1.21071457862854, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.12109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.020365629345178604, |
| "step": 813 |
| }, |
| { |
| "epoch": 0.006708588476721857, |
| "grad_norm": 3.8125, |
| "grad_norm_var": 0.6541460673014323, |
| "learning_rate": 5e-05, |
| "loss": 0.2487, |
| "loss/crossentropy": 2.8512344360351562, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.193359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.055374931544065475, |
| "step": 814 |
| }, |
| { |
| "epoch": 0.006716829985907019, |
| "grad_norm": 2.671875, |
| "grad_norm_var": 0.6461496988932292, |
| "learning_rate": 5e-05, |
| "loss": 0.1403, |
| "loss/crossentropy": 0.623081624507904, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.12158203125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.01867399737238884, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.006725071495092182, |
| "grad_norm": 1.2421875, |
| "grad_norm_var": 0.6529945373535156, |
| "learning_rate": 5e-05, |
| "loss": 0.1044, |
| "loss/crossentropy": 0.35599735379219055, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.0966796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.007739436347037554, |
| "step": 816 |
| }, |
| { |
| "epoch": 0.0067333130042773436, |
| "grad_norm": 2.0625, |
| "grad_norm_var": 0.6491065979003906, |
| "learning_rate": 5e-05, |
| "loss": 0.1802, |
| "loss/crossentropy": 2.3499081134796143, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.039534226059913635, |
| "step": 817 |
| }, |
| { |
| "epoch": 0.0067415545134625054, |
| "grad_norm": 2.953125, |
| "grad_norm_var": 0.6397379557291667, |
| "learning_rate": 5e-05, |
| "loss": 0.1885, |
| "loss/crossentropy": 1.8850847482681274, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.16015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02836015075445175, |
| "step": 818 |
| }, |
| { |
| "epoch": 0.006749796022647667, |
| "grad_norm": 2.703125, |
| "grad_norm_var": 0.6186106363932292, |
| "learning_rate": 5e-05, |
| "loss": 0.216, |
| "loss/crossentropy": 1.4330132007598877, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04408085718750954, |
| "step": 819 |
| }, |
| { |
| "epoch": 0.006758037531832829, |
| "grad_norm": 3.296875, |
| "grad_norm_var": 0.6280965169270833, |
| "learning_rate": 5e-05, |
| "loss": 0.2694, |
| "loss/crossentropy": 1.9407634735107422, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.20703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.062371157109737396, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.006766279041017991, |
| "grad_norm": 1.53125, |
| "grad_norm_var": 0.6700266520182292, |
| "learning_rate": 5e-05, |
| "loss": 0.1509, |
| "loss/crossentropy": 1.891184687614441, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1181640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.032700151205062866, |
| "step": 821 |
| }, |
| { |
| "epoch": 0.006774520550203153, |
| "grad_norm": 2.921875, |
| "grad_norm_var": 0.6554189046223958, |
| "learning_rate": 5e-05, |
| "loss": 0.212, |
| "loss/crossentropy": 2.192823648452759, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1650390625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04695805162191391, |
| "step": 822 |
| }, |
| { |
| "epoch": 0.006782762059388315, |
| "grad_norm": 1.2734375, |
| "grad_norm_var": 0.7104085286458334, |
| "learning_rate": 5e-05, |
| "loss": 0.1329, |
| "loss/crossentropy": 1.4268293380737305, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.023486068472266197, |
| "step": 823 |
| }, |
| { |
| "epoch": 0.006791003568573477, |
| "grad_norm": 5.03125, |
| "grad_norm_var": 1.0682698567708333, |
| "learning_rate": 5e-05, |
| "loss": 0.4061, |
| "loss/crossentropy": 2.2408840656280518, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.30078125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.10528542101383209, |
| "step": 824 |
| }, |
| { |
| "epoch": 0.00679924507775864, |
| "grad_norm": 1.6328125, |
| "grad_norm_var": 0.9758969624837239, |
| "learning_rate": 5e-05, |
| "loss": 0.1375, |
| "loss/crossentropy": 2.6388261318206787, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1083984375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.029141269624233246, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.0068074865869438015, |
| "grad_norm": 1.328125, |
| "grad_norm_var": 1.055492909749349, |
| "learning_rate": 5e-05, |
| "loss": 0.1284, |
| "loss/crossentropy": 1.3783127069473267, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1103515625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.01809128187596798, |
| "step": 826 |
| }, |
| { |
| "epoch": 0.006815728096128963, |
| "grad_norm": 1.140625, |
| "grad_norm_var": 1.155761464436849, |
| "learning_rate": 5e-05, |
| "loss": 0.1161, |
| "loss/crossentropy": 1.5458354949951172, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.0966796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.01938330940902233, |
| "step": 827 |
| }, |
| { |
| "epoch": 0.006823969605314125, |
| "grad_norm": 3.234375, |
| "grad_norm_var": 1.200774892171224, |
| "learning_rate": 5e-05, |
| "loss": 0.2329, |
| "loss/crossentropy": 1.8702012300491333, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.045410916209220886, |
| "step": 828 |
| }, |
| { |
| "epoch": 0.006832211114499287, |
| "grad_norm": 5.4375, |
| "grad_norm_var": 1.7502540588378905, |
| "learning_rate": 5e-05, |
| "loss": 0.3512, |
| "loss/crossentropy": 2.7246875762939453, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.28125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0699634999036789, |
| "step": 829 |
| }, |
| { |
| "epoch": 0.006840452623684449, |
| "grad_norm": 2.265625, |
| "grad_norm_var": 1.6584083557128906, |
| "learning_rate": 5e-05, |
| "loss": 0.1294, |
| "loss/crossentropy": 1.506727695465088, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.10791015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.021486353129148483, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.006848694132869611, |
| "grad_norm": 2.25, |
| "grad_norm_var": 1.6624183654785156, |
| "learning_rate": 5e-05, |
| "loss": 0.1725, |
| "loss/crossentropy": 2.507237434387207, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1357421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03672666847705841, |
| "step": 831 |
| }, |
| { |
| "epoch": 0.006856935642054773, |
| "grad_norm": 1.96875, |
| "grad_norm_var": 1.57171630859375, |
| "learning_rate": 5e-05, |
| "loss": 0.1601, |
| "loss/crossentropy": 1.883087158203125, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.13671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.023375539109110832, |
| "step": 832 |
| }, |
| { |
| "epoch": 0.006865177151239935, |
| "grad_norm": 1.296875, |
| "grad_norm_var": 1.6595937093098958, |
| "learning_rate": 5e-05, |
| "loss": 0.1151, |
| "loss/crossentropy": 2.2904350757598877, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.09375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02137480303645134, |
| "step": 833 |
| }, |
| { |
| "epoch": 0.006873418660425097, |
| "grad_norm": 1.546875, |
| "grad_norm_var": 1.7013417561848958, |
| "learning_rate": 5e-05, |
| "loss": 0.1588, |
| "loss/crossentropy": 2.104323625564575, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1279296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03088623285293579, |
| "step": 834 |
| }, |
| { |
| "epoch": 0.006881660169610259, |
| "grad_norm": 1.40625, |
| "grad_norm_var": 1.7590087890625, |
| "learning_rate": 5e-05, |
| "loss": 0.1215, |
| "loss/crossentropy": 1.4084559679031372, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.103515625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.017994871363043785, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.006889901678795421, |
| "grad_norm": 1.5, |
| "grad_norm_var": 1.7333892822265624, |
| "learning_rate": 5e-05, |
| "loss": 0.1111, |
| "loss/crossentropy": 1.1881444454193115, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.095703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.01541186310350895, |
| "step": 836 |
| }, |
| { |
| "epoch": 0.006898143187980583, |
| "grad_norm": 3.4375, |
| "grad_norm_var": 1.7815419514973958, |
| "learning_rate": 5e-05, |
| "loss": 0.1738, |
| "loss/crossentropy": 2.4373316764831543, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1357421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.038025081157684326, |
| "step": 837 |
| }, |
| { |
| "epoch": 0.006906384697165745, |
| "grad_norm": 1.34375, |
| "grad_norm_var": 1.81781005859375, |
| "learning_rate": 5e-05, |
| "loss": 0.1448, |
| "loss/crossentropy": 2.4372975826263428, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1142578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.030569259077310562, |
| "step": 838 |
| }, |
| { |
| "epoch": 0.006914626206350907, |
| "grad_norm": 2.171875, |
| "grad_norm_var": 1.7505734761555989, |
| "learning_rate": 5e-05, |
| "loss": 0.1729, |
| "loss/crossentropy": 1.551772117614746, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.14453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.028406476601958275, |
| "step": 839 |
| }, |
| { |
| "epoch": 0.006922867715536069, |
| "grad_norm": 1.78125, |
| "grad_norm_var": 1.2323931376139323, |
| "learning_rate": 5e-05, |
| "loss": 0.1473, |
| "loss/crossentropy": 2.1337997913360596, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1142578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.033026814460754395, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.006931109224721231, |
| "grad_norm": 2.453125, |
| "grad_norm_var": 1.2223795572916667, |
| "learning_rate": 5e-05, |
| "loss": 0.1839, |
| "loss/crossentropy": 2.638575792312622, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1416015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04230286926031113, |
| "step": 841 |
| }, |
| { |
| "epoch": 0.006939350733906393, |
| "grad_norm": 2.390625, |
| "grad_norm_var": 1.1750651041666667, |
| "learning_rate": 5e-05, |
| "loss": 0.2205, |
| "loss/crossentropy": 2.267352342605591, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.169921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05056004598736763, |
| "step": 842 |
| }, |
| { |
| "epoch": 0.0069475922430915545, |
| "grad_norm": 4.34375, |
| "grad_norm_var": 1.3525299072265624, |
| "learning_rate": 5e-05, |
| "loss": 0.3381, |
| "loss/crossentropy": 2.6636710166931152, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.248046875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.09001342952251434, |
| "step": 843 |
| }, |
| { |
| "epoch": 0.006955833752276717, |
| "grad_norm": 3.109375, |
| "grad_norm_var": 1.340046183268229, |
| "learning_rate": 5e-05, |
| "loss": 0.1769, |
| "loss/crossentropy": 1.5471034049987793, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02849499136209488, |
| "step": 844 |
| }, |
| { |
| "epoch": 0.006964075261461879, |
| "grad_norm": 2.546875, |
| "grad_norm_var": 0.698876953125, |
| "learning_rate": 5e-05, |
| "loss": 0.1365, |
| "loss/crossentropy": 2.5193700790405273, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.107421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.029105795547366142, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.006972316770647041, |
| "grad_norm": 2.28125, |
| "grad_norm_var": 0.6989491780598959, |
| "learning_rate": 5e-05, |
| "loss": 0.1387, |
| "loss/crossentropy": 1.5674251317977905, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.11376953125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.024912243708968163, |
| "step": 846 |
| }, |
| { |
| "epoch": 0.006980558279832203, |
| "grad_norm": 3.28125, |
| "grad_norm_var": 0.7668935139973958, |
| "learning_rate": 5e-05, |
| "loss": 0.1924, |
| "loss/crossentropy": 1.3732967376708984, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.15625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.036177463829517365, |
| "step": 847 |
| }, |
| { |
| "epoch": 0.006988799789017365, |
| "grad_norm": 4.0625, |
| "grad_norm_var": 0.9473704020182292, |
| "learning_rate": 5e-05, |
| "loss": 0.2466, |
| "loss/crossentropy": 1.4407217502593994, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.19921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04741185903549194, |
| "step": 848 |
| }, |
| { |
| "epoch": 0.006997041298202527, |
| "grad_norm": 1.5078125, |
| "grad_norm_var": 0.9181536356608073, |
| "learning_rate": 5e-05, |
| "loss": 0.1338, |
| "loss/crossentropy": 2.713207721710205, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1064453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.027391444891691208, |
| "step": 849 |
| }, |
| { |
| "epoch": 0.007005282807387689, |
| "grad_norm": 2.5, |
| "grad_norm_var": 0.8604448954264323, |
| "learning_rate": 5e-05, |
| "loss": 0.2414, |
| "loss/crossentropy": 1.8647682666778564, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.203125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.038255929946899414, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.0070135243165728505, |
| "grad_norm": 3.640625, |
| "grad_norm_var": 0.8444435119628906, |
| "learning_rate": 5e-05, |
| "loss": 0.1953, |
| "loss/crossentropy": 3.0968027114868164, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1513671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04395551607012749, |
| "step": 851 |
| }, |
| { |
| "epoch": 0.0070217658257580124, |
| "grad_norm": 1.1015625, |
| "grad_norm_var": 0.9152984619140625, |
| "learning_rate": 5e-05, |
| "loss": 0.1179, |
| "loss/crossentropy": 1.364888072013855, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.09765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.020214572548866272, |
| "step": 852 |
| }, |
| { |
| "epoch": 0.007030007334943175, |
| "grad_norm": 4.03125, |
| "grad_norm_var": 1.0018870035807292, |
| "learning_rate": 5e-05, |
| "loss": 0.2262, |
| "loss/crossentropy": 1.5738481283187866, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1923828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03383617848157883, |
| "step": 853 |
| }, |
| { |
| "epoch": 0.007038248844128337, |
| "grad_norm": 2.375, |
| "grad_norm_var": 0.8874827067057292, |
| "learning_rate": 5e-05, |
| "loss": 0.147, |
| "loss/crossentropy": 1.7854039669036865, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02201111428439617, |
| "step": 854 |
| }, |
| { |
| "epoch": 0.007046490353313499, |
| "grad_norm": 1.078125, |
| "grad_norm_var": 1.0427154541015624, |
| "learning_rate": 5e-05, |
| "loss": 0.1042, |
| "loss/crossentropy": 0.8679842352867126, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.0908203125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.013427493162453175, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.007054731862498661, |
| "grad_norm": 2.0, |
| "grad_norm_var": 1.020213826497396, |
| "learning_rate": 5e-05, |
| "loss": 0.1693, |
| "loss/crossentropy": 1.9278184175491333, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03644675761461258, |
| "step": 856 |
| }, |
| { |
| "epoch": 0.007062973371683823, |
| "grad_norm": 2.265625, |
| "grad_norm_var": 1.0278065999348958, |
| "learning_rate": 5e-05, |
| "loss": 0.1806, |
| "loss/crossentropy": 1.801297664642334, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.142578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03802645206451416, |
| "step": 857 |
| }, |
| { |
| "epoch": 0.007071214880868985, |
| "grad_norm": 2.390625, |
| "grad_norm_var": 1.0278065999348958, |
| "learning_rate": 5e-05, |
| "loss": 0.1861, |
| "loss/crossentropy": 2.6730916500091553, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.142578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04350658878684044, |
| "step": 858 |
| }, |
| { |
| "epoch": 0.0070794563900541466, |
| "grad_norm": 1.625, |
| "grad_norm_var": 0.8784169514973958, |
| "learning_rate": 5e-05, |
| "loss": 0.1474, |
| "loss/crossentropy": 2.8773488998413086, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1142578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.033148590475320816, |
| "step": 859 |
| }, |
| { |
| "epoch": 0.0070876978992393085, |
| "grad_norm": 2.171875, |
| "grad_norm_var": 0.8555898030598958, |
| "learning_rate": 5e-05, |
| "loss": 0.1824, |
| "loss/crossentropy": 2.271367311477661, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1435546875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03879944607615471, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.00709593940842447, |
| "grad_norm": 2.078125, |
| "grad_norm_var": 0.8619374593098958, |
| "learning_rate": 5e-05, |
| "loss": 0.1606, |
| "loss/crossentropy": 2.637540340423584, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.126953125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.033611979335546494, |
| "step": 861 |
| }, |
| { |
| "epoch": 0.007104180917609632, |
| "grad_norm": 1.375, |
| "grad_norm_var": 0.9275461832682291, |
| "learning_rate": 5e-05, |
| "loss": 0.1324, |
| "loss/crossentropy": 1.5838335752487183, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02299325168132782, |
| "step": 862 |
| }, |
| { |
| "epoch": 0.007112422426794795, |
| "grad_norm": 1.1484375, |
| "grad_norm_var": 0.9449724833170573, |
| "learning_rate": 5e-05, |
| "loss": 0.1209, |
| "loss/crossentropy": 1.4155738353729248, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.019340351223945618, |
| "step": 863 |
| }, |
| { |
| "epoch": 0.007120663935979957, |
| "grad_norm": 29.625, |
| "grad_norm_var": 48.10079523722331, |
| "learning_rate": 5e-05, |
| "loss": 0.374, |
| "loss/crossentropy": 2.0414719581604004, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.32421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.049798041582107544, |
| "step": 864 |
| }, |
| { |
| "epoch": 0.007128905445165119, |
| "grad_norm": 1.3515625, |
| "grad_norm_var": 48.15022354125976, |
| "learning_rate": 5e-05, |
| "loss": 0.1175, |
| "loss/crossentropy": 1.4641659259796143, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.0986328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.018840216100215912, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.007137146954350281, |
| "grad_norm": 1.2890625, |
| "grad_norm_var": 48.451341756184895, |
| "learning_rate": 5e-05, |
| "loss": 0.1129, |
| "loss/crossentropy": 0.8852246403694153, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.09619140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.01674000360071659, |
| "step": 866 |
| }, |
| { |
| "epoch": 0.007145388463535443, |
| "grad_norm": 1.4296875, |
| "grad_norm_var": 48.78075129191081, |
| "learning_rate": 5e-05, |
| "loss": 0.1627, |
| "loss/crossentropy": 2.618283748626709, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1240234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03862760215997696, |
| "step": 867 |
| }, |
| { |
| "epoch": 0.0071536299727206045, |
| "grad_norm": 2.390625, |
| "grad_norm_var": 48.45802408854167, |
| "learning_rate": 5e-05, |
| "loss": 0.1756, |
| "loss/crossentropy": 1.5870869159698486, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.15234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0232391357421875, |
| "step": 868 |
| }, |
| { |
| "epoch": 0.007161871481905766, |
| "grad_norm": 2.375, |
| "grad_norm_var": 48.54838460286458, |
| "learning_rate": 5e-05, |
| "loss": 0.1535, |
| "loss/crossentropy": 1.9420498609542847, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1220703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03145188093185425, |
| "step": 869 |
| }, |
| { |
| "epoch": 0.007170112991090928, |
| "grad_norm": 1.375, |
| "grad_norm_var": 48.76895751953125, |
| "learning_rate": 5e-05, |
| "loss": 0.1167, |
| "loss/crossentropy": 1.6243377923965454, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.0947265625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.021999340504407883, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.00717835450027609, |
| "grad_norm": 1.6484375, |
| "grad_norm_var": 48.60527114868164, |
| "learning_rate": 5e-05, |
| "loss": 0.1465, |
| "loss/crossentropy": 0.5207220911979675, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.130859375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.015599271282553673, |
| "step": 871 |
| }, |
| { |
| "epoch": 0.007186596009461253, |
| "grad_norm": 1.6015625, |
| "grad_norm_var": 48.69667053222656, |
| "learning_rate": 5e-05, |
| "loss": 0.1485, |
| "loss/crossentropy": 2.410845994949341, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1181640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.030334439128637314, |
| "step": 872 |
| }, |
| { |
| "epoch": 0.007194837518646415, |
| "grad_norm": 2.46875, |
| "grad_norm_var": 48.66558024088542, |
| "learning_rate": 5e-05, |
| "loss": 0.1478, |
| "loss/crossentropy": 1.4278953075408936, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.12890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.01892838627099991, |
| "step": 873 |
| }, |
| { |
| "epoch": 0.007203079027831577, |
| "grad_norm": 2.015625, |
| "grad_norm_var": 48.73091227213542, |
| "learning_rate": 5e-05, |
| "loss": 0.1412, |
| "loss/crossentropy": 2.3847408294677734, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.111328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.029860273003578186, |
| "step": 874 |
| }, |
| { |
| "epoch": 0.007211320537016739, |
| "grad_norm": 2.4375, |
| "grad_norm_var": 48.56925862630208, |
| "learning_rate": 5e-05, |
| "loss": 0.2361, |
| "loss/crossentropy": 2.355764627456665, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.185546875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05050516501069069, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.0072195620462019005, |
| "grad_norm": 1.6875, |
| "grad_norm_var": 48.67285054524739, |
| "learning_rate": 5e-05, |
| "loss": 0.2005, |
| "loss/crossentropy": 2.284627676010132, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.158203125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04234454780817032, |
| "step": 876 |
| }, |
| { |
| "epoch": 0.007227803555387062, |
| "grad_norm": 3.296875, |
| "grad_norm_var": 48.53161519368489, |
| "learning_rate": 5e-05, |
| "loss": 0.2251, |
| "loss/crossentropy": 2.510892152786255, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.18359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.041470736265182495, |
| "step": 877 |
| }, |
| { |
| "epoch": 0.007236045064572224, |
| "grad_norm": 1.90625, |
| "grad_norm_var": 48.39202372233073, |
| "learning_rate": 5e-05, |
| "loss": 0.1637, |
| "loss/crossentropy": 1.8948522806167603, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.030879024416208267, |
| "step": 878 |
| }, |
| { |
| "epoch": 0.007244286573757386, |
| "grad_norm": 2.078125, |
| "grad_norm_var": 48.13868993123372, |
| "learning_rate": 5e-05, |
| "loss": 0.1593, |
| "loss/crossentropy": 1.3858805894851685, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1259765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03331441059708595, |
| "step": 879 |
| }, |
| { |
| "epoch": 0.007252528082942548, |
| "grad_norm": 2.171875, |
| "grad_norm_var": 0.29590021769205727, |
| "learning_rate": 5e-05, |
| "loss": 0.1446, |
| "loss/crossentropy": 1.4740588665008545, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1181640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02642858400940895, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.007260769592127711, |
| "grad_norm": 3.1875, |
| "grad_norm_var": 0.3551259358723958, |
| "learning_rate": 5e-05, |
| "loss": 0.1482, |
| "loss/crossentropy": 1.0553547143936157, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.12890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.01929531991481781, |
| "step": 881 |
| }, |
| { |
| "epoch": 0.007269011101312873, |
| "grad_norm": 2.375, |
| "grad_norm_var": 0.31359024047851564, |
| "learning_rate": 5e-05, |
| "loss": 0.1372, |
| "loss/crossentropy": 1.9523429870605469, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.027855783700942993, |
| "step": 882 |
| }, |
| { |
| "epoch": 0.007277252610498035, |
| "grad_norm": 1.390625, |
| "grad_norm_var": 0.3174519856770833, |
| "learning_rate": 5e-05, |
| "loss": 0.0988, |
| "loss/crossentropy": 0.5503354072570801, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.0888671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.009944621473550797, |
| "step": 883 |
| }, |
| { |
| "epoch": 0.0072854941196831965, |
| "grad_norm": 1.2578125, |
| "grad_norm_var": 0.36137059529622395, |
| "learning_rate": 5e-05, |
| "loss": 0.1208, |
| "loss/crossentropy": 1.7299476861953735, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.10009765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.020662667229771614, |
| "step": 884 |
| }, |
| { |
| "epoch": 0.007293735628868358, |
| "grad_norm": 3.65625, |
| "grad_norm_var": 0.5144365946451823, |
| "learning_rate": 5e-05, |
| "loss": 0.1408, |
| "loss/crossentropy": 1.2476385831832886, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.115234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.025581957772374153, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.00730197713805352, |
| "grad_norm": 1.078125, |
| "grad_norm_var": 0.5510047912597656, |
| "learning_rate": 5e-05, |
| "loss": 0.103, |
| "loss/crossentropy": 1.68153715133667, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.0849609375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.018041210249066353, |
| "step": 886 |
| }, |
| { |
| "epoch": 0.007310218647238682, |
| "grad_norm": 2.4375, |
| "grad_norm_var": 0.5380849202473958, |
| "learning_rate": 5e-05, |
| "loss": 0.1718, |
| "loss/crossentropy": 2.690136671066284, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1318359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03993324190378189, |
| "step": 887 |
| }, |
| { |
| "epoch": 0.007318460156423844, |
| "grad_norm": 1.59375, |
| "grad_norm_var": 0.5387021382649739, |
| "learning_rate": 5e-05, |
| "loss": 0.1553, |
| "loss/crossentropy": 1.3326576948165894, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1240234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03129717335104942, |
| "step": 888 |
| }, |
| { |
| "epoch": 0.007326701665609006, |
| "grad_norm": 2.109375, |
| "grad_norm_var": 0.5334144592285156, |
| "learning_rate": 5e-05, |
| "loss": 0.1613, |
| "loss/crossentropy": 2.293314218521118, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.12890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03244052827358246, |
| "step": 889 |
| }, |
| { |
| "epoch": 0.007334943174794169, |
| "grad_norm": 2.3125, |
| "grad_norm_var": 0.5329119364420573, |
| "learning_rate": 5e-05, |
| "loss": 0.1668, |
| "loss/crossentropy": 2.6757304668426514, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.12890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03789503872394562, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.007343184683979331, |
| "grad_norm": 2.265625, |
| "grad_norm_var": 0.5289955139160156, |
| "learning_rate": 5e-05, |
| "loss": 0.172, |
| "loss/crossentropy": 1.9997798204421997, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.134765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03719984367489815, |
| "step": 891 |
| }, |
| { |
| "epoch": 0.0073514261931644925, |
| "grad_norm": 1.5703125, |
| "grad_norm_var": 0.5374755859375, |
| "learning_rate": 5e-05, |
| "loss": 0.1209, |
| "loss/crossentropy": 0.4429371654987335, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.019379278644919395, |
| "step": 892 |
| }, |
| { |
| "epoch": 0.007359667702349654, |
| "grad_norm": 1.3515625, |
| "grad_norm_var": 0.48118057250976565, |
| "learning_rate": 5e-05, |
| "loss": 0.0994, |
| "loss/crossentropy": 0.2325073629617691, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.09375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.005671404767781496, |
| "step": 893 |
| }, |
| { |
| "epoch": 0.007367909211534816, |
| "grad_norm": 1.328125, |
| "grad_norm_var": 0.512872060139974, |
| "learning_rate": 5e-05, |
| "loss": 0.1444, |
| "loss/crossentropy": 2.507219076156616, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.035071056336164474, |
| "step": 894 |
| }, |
| { |
| "epoch": 0.007376150720719978, |
| "grad_norm": 2.234375, |
| "grad_norm_var": 0.5158119201660156, |
| "learning_rate": 5e-05, |
| "loss": 0.1701, |
| "loss/crossentropy": 2.556119918823242, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.037296053022146225, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.00738439222990514, |
| "grad_norm": 2.3125, |
| "grad_norm_var": 0.5198951721191406, |
| "learning_rate": 5e-05, |
| "loss": 0.2003, |
| "loss/crossentropy": 2.8151919841766357, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.15234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04793722182512283, |
| "step": 896 |
| }, |
| { |
| "epoch": 0.007392633739090302, |
| "grad_norm": 1.9296875, |
| "grad_norm_var": 0.4244537353515625, |
| "learning_rate": 5e-05, |
| "loss": 0.2205, |
| "loss/crossentropy": 2.4031951427459717, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.16796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05256333947181702, |
| "step": 897 |
| }, |
| { |
| "epoch": 0.007400875248275464, |
| "grad_norm": 2.21875, |
| "grad_norm_var": 0.4171295166015625, |
| "learning_rate": 5e-05, |
| "loss": 0.1722, |
| "loss/crossentropy": 3.179619312286377, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0393570140004158, |
| "step": 898 |
| }, |
| { |
| "epoch": 0.007409116757460626, |
| "grad_norm": 4.625, |
| "grad_norm_var": 0.83385009765625, |
| "learning_rate": 5e-05, |
| "loss": 0.2282, |
| "loss/crossentropy": 1.429465651512146, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.19921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.028934892266988754, |
| "step": 899 |
| }, |
| { |
| "epoch": 0.0074173582666457885, |
| "grad_norm": 1.515625, |
| "grad_norm_var": 0.8075904846191406, |
| "learning_rate": 5e-05, |
| "loss": 0.1484, |
| "loss/crossentropy": 1.4451292753219604, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1240234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.024369023740291595, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.0074255997758309504, |
| "grad_norm": 103.5, |
| "grad_norm_var": 643.7922401428223, |
| "learning_rate": 5e-05, |
| "loss": 0.5508, |
| "loss/crossentropy": 0.928047239780426, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.50390625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04687977582216263, |
| "step": 901 |
| }, |
| { |
| "epoch": 0.007433841285016112, |
| "grad_norm": 1.0078125, |
| "grad_norm_var": 643.861181640625, |
| "learning_rate": 5e-05, |
| "loss": 0.1275, |
| "loss/crossentropy": 2.235487461090088, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1044921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.022974800318479538, |
| "step": 902 |
| }, |
| { |
| "epoch": 0.007442082794201274, |
| "grad_norm": 2.421875, |
| "grad_norm_var": 643.8736073811849, |
| "learning_rate": 5e-05, |
| "loss": 0.1372, |
| "loss/crossentropy": 1.5220298767089844, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.11328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02395622618496418, |
| "step": 903 |
| }, |
| { |
| "epoch": 0.007450324303386436, |
| "grad_norm": 2.34375, |
| "grad_norm_var": 643.2287831624349, |
| "learning_rate": 5e-05, |
| "loss": 0.1307, |
| "loss/crossentropy": 1.4191033840179443, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.10595703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.024734167382121086, |
| "step": 904 |
| }, |
| { |
| "epoch": 0.007458565812571598, |
| "grad_norm": 5.125, |
| "grad_norm_var": 641.2515462239584, |
| "learning_rate": 5e-05, |
| "loss": 0.3705, |
| "loss/crossentropy": 1.2650396823883057, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.26953125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.10096491873264313, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.00746680732175676, |
| "grad_norm": 3.234375, |
| "grad_norm_var": 640.5282704671224, |
| "learning_rate": 5e-05, |
| "loss": 0.1926, |
| "loss/crossentropy": 1.6492811441421509, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.154296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0383470356464386, |
| "step": 906 |
| }, |
| { |
| "epoch": 0.007475048830941922, |
| "grad_norm": 2.671875, |
| "grad_norm_var": 640.190786743164, |
| "learning_rate": 5e-05, |
| "loss": 0.1486, |
| "loss/crossentropy": 1.6101174354553223, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.12353515625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02505500242114067, |
| "step": 907 |
| }, |
| { |
| "epoch": 0.007483290340127084, |
| "grad_norm": 3.21875, |
| "grad_norm_var": 638.7909563700358, |
| "learning_rate": 5e-05, |
| "loss": 0.1805, |
| "loss/crossentropy": 1.6961467266082764, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.150390625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.030135734006762505, |
| "step": 908 |
| }, |
| { |
| "epoch": 0.0074915318493122465, |
| "grad_norm": 3.28125, |
| "grad_norm_var": 637.1034220377604, |
| "learning_rate": 5e-05, |
| "loss": 0.2268, |
| "loss/crossentropy": 2.749086618423462, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1787109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.048118021339178085, |
| "step": 909 |
| }, |
| { |
| "epoch": 0.007499773358497408, |
| "grad_norm": 1.3515625, |
| "grad_norm_var": 637.0796831766764, |
| "learning_rate": 5e-05, |
| "loss": 0.1485, |
| "loss/crossentropy": 1.4836238622665405, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1220703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02646125853061676, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.00750801486768257, |
| "grad_norm": 2.21875, |
| "grad_norm_var": 637.0936622619629, |
| "learning_rate": 5e-05, |
| "loss": 0.16, |
| "loss/crossentropy": 1.5685328245162964, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.126953125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.033031269907951355, |
| "step": 911 |
| }, |
| { |
| "epoch": 0.007516256376867732, |
| "grad_norm": 1.7890625, |
| "grad_norm_var": 637.5730539957682, |
| "learning_rate": 5e-05, |
| "loss": 0.1701, |
| "loss/crossentropy": 0.7678622007369995, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1435546875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02650710754096508, |
| "step": 912 |
| }, |
| { |
| "epoch": 0.007524497886052894, |
| "grad_norm": 2.5625, |
| "grad_norm_var": 637.0096819559733, |
| "learning_rate": 5e-05, |
| "loss": 0.1833, |
| "loss/crossentropy": 2.773200273513794, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.138671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04464181140065193, |
| "step": 913 |
| }, |
| { |
| "epoch": 0.007532739395238056, |
| "grad_norm": 1.703125, |
| "grad_norm_var": 637.4885821024577, |
| "learning_rate": 5e-05, |
| "loss": 0.1809, |
| "loss/crossentropy": 2.4580299854278564, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.138671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04220724105834961, |
| "step": 914 |
| }, |
| { |
| "epoch": 0.007540980904423218, |
| "grad_norm": 1.6796875, |
| "grad_norm_var": 639.71376953125, |
| "learning_rate": 5e-05, |
| "loss": 0.1522, |
| "loss/crossentropy": 2.1130568981170654, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1201171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03203842043876648, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.00754922241360838, |
| "grad_norm": 1.84375, |
| "grad_norm_var": 639.4050201416015, |
| "learning_rate": 5e-05, |
| "loss": 0.1293, |
| "loss/crossentropy": 1.8496575355529785, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.103515625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.025790153071284294, |
| "step": 916 |
| }, |
| { |
| "epoch": 0.007557463922793542, |
| "grad_norm": 3.078125, |
| "grad_norm_var": 0.9873331705729167, |
| "learning_rate": 5e-05, |
| "loss": 0.1361, |
| "loss/crossentropy": 1.5800864696502686, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1123046875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.023767339065670967, |
| "step": 917 |
| }, |
| { |
| "epoch": 0.007565705431978704, |
| "grad_norm": 1.8359375, |
| "grad_norm_var": 0.8686676025390625, |
| "learning_rate": 5e-05, |
| "loss": 0.1117, |
| "loss/crossentropy": 1.1425081491470337, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.09375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.01797986589372158, |
| "step": 918 |
| }, |
| { |
| "epoch": 0.007573946941163866, |
| "grad_norm": 3.578125, |
| "grad_norm_var": 0.9367177327473958, |
| "learning_rate": 5e-05, |
| "loss": 0.1796, |
| "loss/crossentropy": 2.826726198196411, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.142578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.036978572607040405, |
| "step": 919 |
| }, |
| { |
| "epoch": 0.007582188450349028, |
| "grad_norm": 1.8671875, |
| "grad_norm_var": 0.9668596903483073, |
| "learning_rate": 5e-05, |
| "loss": 0.1869, |
| "loss/crossentropy": 2.1164968013763428, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03847302123904228, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.00759042995953419, |
| "grad_norm": 4.40625, |
| "grad_norm_var": 0.753808339436849, |
| "learning_rate": 5e-05, |
| "loss": 0.1586, |
| "loss/crossentropy": 2.5811195373535156, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1240234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.034622643142938614, |
| "step": 921 |
| }, |
| { |
| "epoch": 0.007598671468719352, |
| "grad_norm": 1.65625, |
| "grad_norm_var": 0.7591509501139323, |
| "learning_rate": 5e-05, |
| "loss": 0.1544, |
| "loss/crossentropy": 2.408280611038208, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1181640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.036218732595443726, |
| "step": 922 |
| }, |
| { |
| "epoch": 0.007606912977904514, |
| "grad_norm": 1.78125, |
| "grad_norm_var": 0.7789812723795573, |
| "learning_rate": 5e-05, |
| "loss": 0.1502, |
| "loss/crossentropy": 1.7369745969772339, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.123046875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.027189793065190315, |
| "step": 923 |
| }, |
| { |
| "epoch": 0.007615154487089676, |
| "grad_norm": 4.75, |
| "grad_norm_var": 1.0996864318847657, |
| "learning_rate": 5e-05, |
| "loss": 0.4286, |
| "loss/crossentropy": 2.7182440757751465, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.3671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.061443451792001724, |
| "step": 924 |
| }, |
| { |
| "epoch": 0.007623395996274838, |
| "grad_norm": 1.671875, |
| "grad_norm_var": 1.0856463114420574, |
| "learning_rate": 5e-05, |
| "loss": 0.1476, |
| "loss/crossentropy": 2.5658042430877686, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1171875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.030445091426372528, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.0076316375054599995, |
| "grad_norm": 1.578125, |
| "grad_norm_var": 1.0583658854166667, |
| "learning_rate": 5e-05, |
| "loss": 0.1394, |
| "loss/crossentropy": 1.2689893245697021, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.11279296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02664242871105671, |
| "step": 926 |
| }, |
| { |
| "epoch": 0.007639879014645161, |
| "grad_norm": 1.8125, |
| "grad_norm_var": 1.0771443684895834, |
| "learning_rate": 5e-05, |
| "loss": 0.1498, |
| "loss/crossentropy": 2.381871223449707, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.119140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03070569597184658, |
| "step": 927 |
| }, |
| { |
| "epoch": 0.007648120523830324, |
| "grad_norm": 2.25, |
| "grad_norm_var": 1.0559730529785156, |
| "learning_rate": 5e-05, |
| "loss": 0.1529, |
| "loss/crossentropy": 2.887047529220581, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.11962890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.033232178539037704, |
| "step": 928 |
| }, |
| { |
| "epoch": 0.007656362033015486, |
| "grad_norm": 11.875, |
| "grad_norm_var": 6.704707590738932, |
| "learning_rate": 5e-05, |
| "loss": 0.8791, |
| "loss/crossentropy": 1.3170801401138306, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.14475420117378235, |
| "step": 929 |
| }, |
| { |
| "epoch": 0.007664603542200648, |
| "grad_norm": 1.6484375, |
| "grad_norm_var": 6.7140625, |
| "learning_rate": 5e-05, |
| "loss": 0.127, |
| "loss/crossentropy": 1.6071490049362183, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.10107421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.025960184633731842, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.00767284505138581, |
| "grad_norm": 1.8984375, |
| "grad_norm_var": 6.67979736328125, |
| "learning_rate": 5e-05, |
| "loss": 0.1926, |
| "loss/crossentropy": 2.840394973754883, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1455078125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.047091417014598846, |
| "step": 931 |
| }, |
| { |
| "epoch": 0.007681086560570972, |
| "grad_norm": 1.9921875, |
| "grad_norm_var": 6.658870188395182, |
| "learning_rate": 5e-05, |
| "loss": 0.155, |
| "loss/crossentropy": 1.9618257284164429, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1240234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03101358562707901, |
| "step": 932 |
| }, |
| { |
| "epoch": 0.007689328069756134, |
| "grad_norm": 1.6953125, |
| "grad_norm_var": 6.760285441080729, |
| "learning_rate": 5e-05, |
| "loss": 0.1729, |
| "loss/crossentropy": 0.5429065823554993, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1552734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.017611898481845856, |
| "step": 933 |
| }, |
| { |
| "epoch": 0.0076975695789412955, |
| "grad_norm": 1.953125, |
| "grad_norm_var": 6.744618479410807, |
| "learning_rate": 5e-05, |
| "loss": 0.1526, |
| "loss/crossentropy": 2.004976749420166, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.12255859375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.030033178627490997, |
| "step": 934 |
| }, |
| { |
| "epoch": 0.007705811088126457, |
| "grad_norm": 1.3046875, |
| "grad_norm_var": 6.8623606363932295, |
| "learning_rate": 5e-05, |
| "loss": 0.106, |
| "loss/crossentropy": 1.3220717906951904, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.08984375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.01619834452867508, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.007714052597311619, |
| "grad_norm": 1.46875, |
| "grad_norm_var": 6.919648996988932, |
| "learning_rate": 5e-05, |
| "loss": 0.1283, |
| "loss/crossentropy": 1.6176024675369263, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1044921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.023826373741030693, |
| "step": 936 |
| }, |
| { |
| "epoch": 0.007722294106496782, |
| "grad_norm": 2.296875, |
| "grad_norm_var": 6.727388254801432, |
| "learning_rate": 5e-05, |
| "loss": 0.1905, |
| "loss/crossentropy": 2.7859342098236084, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.14453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04592683166265488, |
| "step": 937 |
| }, |
| { |
| "epoch": 0.007730535615681944, |
| "grad_norm": 2.28125, |
| "grad_norm_var": 6.672985585530599, |
| "learning_rate": 5e-05, |
| "loss": 0.1824, |
| "loss/crossentropy": 2.764125108718872, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1416015625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04075000435113907, |
| "step": 938 |
| }, |
| { |
| "epoch": 0.007738777124867106, |
| "grad_norm": 1.8203125, |
| "grad_norm_var": 6.668602498372396, |
| "learning_rate": 5e-05, |
| "loss": 0.1234, |
| "loss/crossentropy": 2.6147968769073486, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.099609375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.023791346698999405, |
| "step": 939 |
| }, |
| { |
| "epoch": 0.007747018634052268, |
| "grad_norm": 1.8828125, |
| "grad_norm_var": 6.377123769124349, |
| "learning_rate": 5e-05, |
| "loss": 0.155, |
| "loss/crossentropy": 1.4540939331054688, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.029985029250383377, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.00775526014323743, |
| "grad_norm": 2.0625, |
| "grad_norm_var": 6.345385487874349, |
| "learning_rate": 5e-05, |
| "loss": 0.1772, |
| "loss/crossentropy": 2.8355026245117188, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.138671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.038478825241327286, |
| "step": 941 |
| }, |
| { |
| "epoch": 0.0077635016524225916, |
| "grad_norm": 2.6875, |
| "grad_norm_var": 6.287605539957682, |
| "learning_rate": 5e-05, |
| "loss": 0.1997, |
| "loss/crossentropy": 2.347745895385742, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.154296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0454033724963665, |
| "step": 942 |
| }, |
| { |
| "epoch": 0.0077717431616077534, |
| "grad_norm": 1.03125, |
| "grad_norm_var": 6.403419748942057, |
| "learning_rate": 5e-05, |
| "loss": 0.098, |
| "loss/crossentropy": 0.4349134564399719, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.087890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.010073849000036716, |
| "step": 943 |
| }, |
| { |
| "epoch": 0.007779984670792915, |
| "grad_norm": 1.5, |
| "grad_norm_var": 6.464503733317057, |
| "learning_rate": 5e-05, |
| "loss": 0.132, |
| "loss/crossentropy": 2.370471239089966, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.10546875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02652416005730629, |
| "step": 944 |
| }, |
| { |
| "epoch": 0.007788226179978077, |
| "grad_norm": 2.3125, |
| "grad_norm_var": 0.1785296122233073, |
| "learning_rate": 5e-05, |
| "loss": 0.1954, |
| "loss/crossentropy": 2.6951100826263428, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04693574458360672, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.00779646768916324, |
| "grad_norm": 2.9375, |
| "grad_norm_var": 0.24520670572916667, |
| "learning_rate": 5e-05, |
| "loss": 0.1673, |
| "loss/crossentropy": 1.1609289646148682, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.134765625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03258271515369415, |
| "step": 946 |
| }, |
| { |
| "epoch": 0.007804709198348402, |
| "grad_norm": 1.8125, |
| "grad_norm_var": 0.2462053934733073, |
| "learning_rate": 5e-05, |
| "loss": 0.1578, |
| "loss/crossentropy": 2.407763957977295, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.12353515625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.034231819212436676, |
| "step": 947 |
| }, |
| { |
| "epoch": 0.007812950707533564, |
| "grad_norm": 1.703125, |
| "grad_norm_var": 0.2494140625, |
| "learning_rate": 5e-05, |
| "loss": 0.155, |
| "loss/crossentropy": 1.5243741273880005, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.126953125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02800397202372551, |
| "step": 948 |
| }, |
| { |
| "epoch": 0.007821192216718726, |
| "grad_norm": 5.0625, |
| "grad_norm_var": 0.8563189188639323, |
| "learning_rate": 5e-05, |
| "loss": 0.1713, |
| "loss/crossentropy": 1.9425218105316162, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03065253421664238, |
| "step": 949 |
| }, |
| { |
| "epoch": 0.007829433725903888, |
| "grad_norm": 3.8125, |
| "grad_norm_var": 1.027972157796224, |
| "learning_rate": 5e-05, |
| "loss": 0.2875, |
| "loss/crossentropy": 2.2058985233306885, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.228515625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0590139701962471, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.00783767523508905, |
| "grad_norm": 2.171875, |
| "grad_norm_var": 0.9658406575520834, |
| "learning_rate": 5e-05, |
| "loss": 0.2211, |
| "loss/crossentropy": 2.6648082733154297, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1630859375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05802769958972931, |
| "step": 951 |
| }, |
| { |
| "epoch": 0.007845916744274211, |
| "grad_norm": 22.875, |
| "grad_norm_var": 27.2247314453125, |
| "learning_rate": 5e-05, |
| "loss": 0.3126, |
| "loss/crossentropy": 2.362283945083618, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05479743331670761, |
| "step": 952 |
| }, |
| { |
| "epoch": 0.007854158253459373, |
| "grad_norm": 1.3046875, |
| "grad_norm_var": 27.4640256245931, |
| "learning_rate": 5e-05, |
| "loss": 0.1007, |
| "loss/crossentropy": 0.38303616642951965, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.08935546875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.011384121142327785, |
| "step": 953 |
| }, |
| { |
| "epoch": 0.007862399762644535, |
| "grad_norm": 1.1484375, |
| "grad_norm_var": 27.740185546875, |
| "learning_rate": 5e-05, |
| "loss": 0.1253, |
| "loss/crossentropy": 1.6580368280410767, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.0986328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.026700211688876152, |
| "step": 954 |
| }, |
| { |
| "epoch": 0.007870641271829697, |
| "grad_norm": 1.7734375, |
| "grad_norm_var": 27.750869750976562, |
| "learning_rate": 5e-05, |
| "loss": 0.1519, |
| "loss/crossentropy": 2.318824529647827, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.11767578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03420557081699371, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.007878882781014859, |
| "grad_norm": 3.03125, |
| "grad_norm_var": 27.58492202758789, |
| "learning_rate": 5e-05, |
| "loss": 0.1829, |
| "loss/crossentropy": 2.9506826400756836, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.042233243584632874, |
| "step": 956 |
| }, |
| { |
| "epoch": 0.00788712429020002, |
| "grad_norm": 2.75, |
| "grad_norm_var": 27.475665028889974, |
| "learning_rate": 5e-05, |
| "loss": 0.1622, |
| "loss/crossentropy": 1.327090859413147, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.13671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02552720718085766, |
| "step": 957 |
| }, |
| { |
| "epoch": 0.007895365799385183, |
| "grad_norm": 2.046875, |
| "grad_norm_var": 27.580934397379558, |
| "learning_rate": 5e-05, |
| "loss": 0.1882, |
| "loss/crossentropy": 2.5473806858062744, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1455078125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04271348565816879, |
| "step": 958 |
| }, |
| { |
| "epoch": 0.007903607308570345, |
| "grad_norm": 1.4453125, |
| "grad_norm_var": 27.450960286458333, |
| "learning_rate": 5e-05, |
| "loss": 0.1266, |
| "loss/crossentropy": 1.5830978155136108, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.10400390625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.022615976631641388, |
| "step": 959 |
| }, |
| { |
| "epoch": 0.007911848817755508, |
| "grad_norm": 2.53125, |
| "grad_norm_var": 27.227925618489582, |
| "learning_rate": 5e-05, |
| "loss": 0.1684, |
| "loss/crossentropy": 2.8910608291625977, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1298828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03847365081310272, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.00792009032694067, |
| "grad_norm": 2.109375, |
| "grad_norm_var": 27.26726786295573, |
| "learning_rate": 5e-05, |
| "loss": 0.1292, |
| "loss/crossentropy": 1.506900429725647, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.111328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.017824511975049973, |
| "step": 961 |
| }, |
| { |
| "epoch": 0.007928331836125832, |
| "grad_norm": 5.3125, |
| "grad_norm_var": 27.391893513997395, |
| "learning_rate": 5e-05, |
| "loss": 0.3447, |
| "loss/crossentropy": 2.996657133102417, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.07124556601047516, |
| "step": 962 |
| }, |
| { |
| "epoch": 0.007936573345310994, |
| "grad_norm": 1.703125, |
| "grad_norm_var": 27.421708170572916, |
| "learning_rate": 5e-05, |
| "loss": 0.1274, |
| "loss/crossentropy": 1.4830890893936157, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1064453125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02091062068939209, |
| "step": 963 |
| }, |
| { |
| "epoch": 0.007944814854496156, |
| "grad_norm": 1.0390625, |
| "grad_norm_var": 27.6348264058431, |
| "learning_rate": 5e-05, |
| "loss": 0.0988, |
| "loss/crossentropy": 0.4731054902076721, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.08740234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.011356725357472897, |
| "step": 964 |
| }, |
| { |
| "epoch": 0.007953056363681318, |
| "grad_norm": 3.203125, |
| "grad_norm_var": 27.5273312886556, |
| "learning_rate": 5e-05, |
| "loss": 0.2465, |
| "loss/crossentropy": 2.7651779651641846, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.203125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.043382175266742706, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.00796129787286648, |
| "grad_norm": 2.953125, |
| "grad_norm_var": 27.553851064046224, |
| "learning_rate": 5e-05, |
| "loss": 0.2593, |
| "loss/crossentropy": 1.5127090215682983, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.212890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04636671021580696, |
| "step": 966 |
| }, |
| { |
| "epoch": 0.007969539382051642, |
| "grad_norm": 3.0, |
| "grad_norm_var": 27.44041519165039, |
| "learning_rate": 5e-05, |
| "loss": 0.1922, |
| "loss/crossentropy": 1.4316555261611938, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.16796875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.024203313514590263, |
| "step": 967 |
| }, |
| { |
| "epoch": 0.007977780891236803, |
| "grad_norm": 1.6875, |
| "grad_norm_var": 1.1560523986816407, |
| "learning_rate": 5e-05, |
| "loss": 0.1196, |
| "loss/crossentropy": 1.2071629762649536, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1025390625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.017063483595848083, |
| "step": 968 |
| }, |
| { |
| "epoch": 0.007986022400421965, |
| "grad_norm": 2.375, |
| "grad_norm_var": 1.0834788004557292, |
| "learning_rate": 5e-05, |
| "loss": 0.1698, |
| "loss/crossentropy": 2.020332098007202, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1396484375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.030112620443105698, |
| "step": 969 |
| }, |
| { |
| "epoch": 0.007994263909607127, |
| "grad_norm": 3.421875, |
| "grad_norm_var": 1.0326372782389324, |
| "learning_rate": 5e-05, |
| "loss": 0.2121, |
| "loss/crossentropy": 1.4469772577285767, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1767578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03534634783864021, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.008002505418792289, |
| "grad_norm": 2.8125, |
| "grad_norm_var": 0.9961415608723958, |
| "learning_rate": 5e-05, |
| "loss": 0.198, |
| "loss/crossentropy": 2.7077815532684326, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.15234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.045610323548316956, |
| "step": 971 |
| }, |
| { |
| "epoch": 0.008010746927977451, |
| "grad_norm": 2.21875, |
| "grad_norm_var": 0.9894765218098959, |
| "learning_rate": 5e-05, |
| "loss": 0.1874, |
| "loss/crossentropy": 1.9022347927093506, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.154296875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.033076584339141846, |
| "step": 972 |
| }, |
| { |
| "epoch": 0.008018988437162613, |
| "grad_norm": 2.703125, |
| "grad_norm_var": 0.9882893880208333, |
| "learning_rate": 5e-05, |
| "loss": 0.1256, |
| "loss/crossentropy": 0.7915277481079102, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.11181640625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.013769976794719696, |
| "step": 973 |
| }, |
| { |
| "epoch": 0.008027229946347775, |
| "grad_norm": 4.875, |
| "grad_norm_var": 1.3040598551432292, |
| "learning_rate": 5e-05, |
| "loss": 0.3439, |
| "loss/crossentropy": 2.4664230346679688, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.28515625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05875328183174133, |
| "step": 974 |
| }, |
| { |
| "epoch": 0.008035471455532937, |
| "grad_norm": 1.609375, |
| "grad_norm_var": 1.2780352274576823, |
| "learning_rate": 5e-05, |
| "loss": 0.1361, |
| "loss/crossentropy": 2.0170304775238037, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.111328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.024749569594860077, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.008043712964718099, |
| "grad_norm": 2.078125, |
| "grad_norm_var": 1.3024024963378906, |
| "learning_rate": 5e-05, |
| "loss": 0.1647, |
| "loss/crossentropy": 2.516977310180664, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.130859375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.033810123801231384, |
| "step": 976 |
| }, |
| { |
| "epoch": 0.00805195447390326, |
| "grad_norm": 2.421875, |
| "grad_norm_var": 1.2841529846191406, |
| "learning_rate": 5e-05, |
| "loss": 0.1797, |
| "loss/crossentropy": 1.6354763507843018, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.142578125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03713398799300194, |
| "step": 977 |
| }, |
| { |
| "epoch": 0.008060195983088422, |
| "grad_norm": 2.609375, |
| "grad_norm_var": 0.8040667215983073, |
| "learning_rate": 5e-05, |
| "loss": 0.1307, |
| "loss/crossentropy": 2.52553653717041, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.10546875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.025194775313138962, |
| "step": 978 |
| }, |
| { |
| "epoch": 0.008068437492273586, |
| "grad_norm": 1.15625, |
| "grad_norm_var": 0.8841041564941406, |
| "learning_rate": 5e-05, |
| "loss": 0.1438, |
| "loss/crossentropy": 2.593212127685547, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.11328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.030498359352350235, |
| "step": 979 |
| }, |
| { |
| "epoch": 0.008076679001458748, |
| "grad_norm": 1.3203125, |
| "grad_norm_var": 0.8338783264160157, |
| "learning_rate": 5e-05, |
| "loss": 0.1281, |
| "loss/crossentropy": 2.2054710388183594, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.10107421875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02705022320151329, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.00808492051064391, |
| "grad_norm": 1.390625, |
| "grad_norm_var": 0.8760047912597656, |
| "learning_rate": 5e-05, |
| "loss": 0.1358, |
| "loss/crossentropy": 1.5589760541915894, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1103515625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.025421902537345886, |
| "step": 981 |
| }, |
| { |
| "epoch": 0.008093162019829072, |
| "grad_norm": 2.4375, |
| "grad_norm_var": 0.8555946350097656, |
| "learning_rate": 5e-05, |
| "loss": 0.1685, |
| "loss/crossentropy": 2.5976901054382324, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.130859375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03766857087612152, |
| "step": 982 |
| }, |
| { |
| "epoch": 0.008101403529014234, |
| "grad_norm": 1.703125, |
| "grad_norm_var": 0.853905995686849, |
| "learning_rate": 5e-05, |
| "loss": 0.1405, |
| "loss/crossentropy": 1.4126673936843872, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.115234375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.025234002619981766, |
| "step": 983 |
| }, |
| { |
| "epoch": 0.008109645038199395, |
| "grad_norm": 3.0625, |
| "grad_norm_var": 0.859545644124349, |
| "learning_rate": 5e-05, |
| "loss": 0.1731, |
| "loss/crossentropy": 1.74605131149292, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1376953125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.035446591675281525, |
| "step": 984 |
| }, |
| { |
| "epoch": 0.008117886547384557, |
| "grad_norm": 2.90625, |
| "grad_norm_var": 0.8763201395670573, |
| "learning_rate": 5e-05, |
| "loss": 0.124, |
| "loss/crossentropy": 1.0640939474105835, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1044921875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.019489990547299385, |
| "step": 985 |
| }, |
| { |
| "epoch": 0.00812612805656972, |
| "grad_norm": 1.6171875, |
| "grad_norm_var": 0.8388987223307292, |
| "learning_rate": 5e-05, |
| "loss": 0.142, |
| "loss/crossentropy": 1.6699655055999756, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.11474609375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02729114145040512, |
| "step": 986 |
| }, |
| { |
| "epoch": 0.008134369565754881, |
| "grad_norm": 2.40625, |
| "grad_norm_var": 0.8218658447265625, |
| "learning_rate": 5e-05, |
| "loss": 0.1273, |
| "loss/crossentropy": 0.9518370032310486, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.10693359375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.020414654165506363, |
| "step": 987 |
| }, |
| { |
| "epoch": 0.008142611074940043, |
| "grad_norm": 1.9453125, |
| "grad_norm_var": 0.8288530985514323, |
| "learning_rate": 5e-05, |
| "loss": 0.1543, |
| "loss/crossentropy": 2.4604508876800537, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1220703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03225576505064964, |
| "step": 988 |
| }, |
| { |
| "epoch": 0.008150852584125205, |
| "grad_norm": 2.46875, |
| "grad_norm_var": 0.8185991923014323, |
| "learning_rate": 5e-05, |
| "loss": 0.1825, |
| "loss/crossentropy": 2.5151901245117188, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04188704490661621, |
| "step": 989 |
| }, |
| { |
| "epoch": 0.008159094093310367, |
| "grad_norm": 3.8125, |
| "grad_norm_var": 0.5173500061035157, |
| "learning_rate": 5e-05, |
| "loss": 0.1537, |
| "loss/crossentropy": 2.6265015602111816, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.119140625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03460276871919632, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.008167335602495529, |
| "grad_norm": 1.4921875, |
| "grad_norm_var": 0.5271881103515625, |
| "learning_rate": 5e-05, |
| "loss": 0.1395, |
| "loss/crossentropy": 2.9253175258636475, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.109375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.030113544315099716, |
| "step": 991 |
| }, |
| { |
| "epoch": 0.00817557711168069, |
| "grad_norm": 1.3828125, |
| "grad_norm_var": 0.5665484110514323, |
| "learning_rate": 5e-05, |
| "loss": 0.1456, |
| "loss/crossentropy": 2.487765073776245, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.11328125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.032366957515478134, |
| "step": 992 |
| }, |
| { |
| "epoch": 0.008183818620865852, |
| "grad_norm": 1.796875, |
| "grad_norm_var": 0.5669146219889323, |
| "learning_rate": 5e-05, |
| "loss": 0.1707, |
| "loss/crossentropy": 2.329315185546875, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.13671875, |
| "loss/idx": 0.0, |
| "loss/logits": 0.03394667059183121, |
| "step": 993 |
| }, |
| { |
| "epoch": 0.008192060130051014, |
| "grad_norm": 3.78125, |
| "grad_norm_var": 0.7332354227701823, |
| "learning_rate": 5e-05, |
| "loss": 0.1519, |
| "loss/crossentropy": 2.148042917251587, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1220703125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.029824528843164444, |
| "step": 994 |
| }, |
| { |
| "epoch": 0.008200301639236176, |
| "grad_norm": 25.625, |
| "grad_norm_var": 34.854078928629555, |
| "learning_rate": 5e-05, |
| "loss": 0.334, |
| "loss/crossentropy": 1.8628984689712524, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.27734375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.05670515447854996, |
| "step": 995 |
| }, |
| { |
| "epoch": 0.008208543148421338, |
| "grad_norm": 1.4375, |
| "grad_norm_var": 34.81780497233073, |
| "learning_rate": 5e-05, |
| "loss": 0.1181, |
| "loss/crossentropy": 2.3790695667266846, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.0947265625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.02332988940179348, |
| "step": 996 |
| }, |
| { |
| "epoch": 0.0082167846576065, |
| "grad_norm": 1.2734375, |
| "grad_norm_var": 34.854811350504555, |
| "learning_rate": 5e-05, |
| "loss": 0.1176, |
| "loss/crossentropy": 1.283698320388794, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1005859375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.016972240060567856, |
| "step": 997 |
| }, |
| { |
| "epoch": 0.008225026166791664, |
| "grad_norm": 2.640625, |
| "grad_norm_var": 34.82328465779622, |
| "learning_rate": 5e-05, |
| "loss": 0.1779, |
| "loss/crossentropy": 2.7694525718688965, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1337890625, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04413297772407532, |
| "step": 998 |
| }, |
| { |
| "epoch": 0.008233267675976826, |
| "grad_norm": 1.8828125, |
| "grad_norm_var": 34.77723388671875, |
| "learning_rate": 5e-05, |
| "loss": 0.1636, |
| "loss/crossentropy": 2.530651330947876, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.1298828125, |
| "loss/idx": 0.0, |
| "loss/logits": 0.0337049663066864, |
| "step": 999 |
| }, |
| { |
| "epoch": 0.008241509185161987, |
| "grad_norm": 5.5625, |
| "grad_norm_var": 34.94845784505208, |
| "learning_rate": 5e-05, |
| "loss": 0.278, |
| "loss/crossentropy": 1.1534559726715088, |
| "loss/dist_ce": 0.0, |
| "loss/hidden": 0.2333984375, |
| "loss/idx": 0.0, |
| "loss/logits": 0.04459930956363678, |
| "step": 1000 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 100000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 7.8956539674624e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|