testckpt1l3b / trainer_state.json
semran1's picture
Upload folder using huggingface_hub
9f4d943 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.008241509185161987,
"eval_steps": 2000,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 8.241509185161987e-06,
"grad_norm": 780.0,
"learning_rate": 5e-05,
"loss": 22.7489,
"loss/crossentropy": 8.68287467956543,
"loss/dist_ce": 0.0,
"loss/hidden": 16.25,
"loss/idx": 0.0,
"loss/logits": 6.498888969421387,
"step": 1
},
{
"epoch": 1.6483018370323974e-05,
"grad_norm": 824.0,
"learning_rate": 5e-05,
"loss": 18.5076,
"loss/crossentropy": 8.787271499633789,
"loss/dist_ce": 0.0,
"loss/hidden": 12.625,
"loss/idx": 0.0,
"loss/logits": 5.8825788497924805,
"step": 2
},
{
"epoch": 2.472452755548596e-05,
"grad_norm": 466.0,
"learning_rate": 5e-05,
"loss": 12.0241,
"loss/crossentropy": 7.810218334197998,
"loss/dist_ce": 0.0,
"loss/hidden": 7.21875,
"loss/idx": 0.0,
"loss/logits": 4.805373191833496,
"step": 3
},
{
"epoch": 3.296603674064795e-05,
"grad_norm": 215.0,
"learning_rate": 5e-05,
"loss": 9.4126,
"loss/crossentropy": 6.043552398681641,
"loss/dist_ce": 0.0,
"loss/hidden": 6.25,
"loss/idx": 0.0,
"loss/logits": 3.1625852584838867,
"step": 4
},
{
"epoch": 4.1207545925809937e-05,
"grad_norm": 468.0,
"learning_rate": 5e-05,
"loss": 6.2853,
"loss/crossentropy": 4.783352851867676,
"loss/dist_ce": 0.0,
"loss/hidden": 4.21875,
"loss/idx": 0.0,
"loss/logits": 2.066528558731079,
"step": 5
},
{
"epoch": 4.944905511097192e-05,
"grad_norm": 306.0,
"learning_rate": 5e-05,
"loss": 5.4625,
"loss/crossentropy": 1.6133296489715576,
"loss/dist_ce": 0.0,
"loss/hidden": 4.1875,
"loss/idx": 0.0,
"loss/logits": 1.2749497890472412,
"step": 6
},
{
"epoch": 5.769056429613391e-05,
"grad_norm": 217.0,
"learning_rate": 5e-05,
"loss": 8.1947,
"loss/crossentropy": 4.63270378112793,
"loss/dist_ce": 0.0,
"loss/hidden": 5.78125,
"loss/idx": 0.0,
"loss/logits": 2.4134607315063477,
"step": 7
},
{
"epoch": 6.59320734812959e-05,
"grad_norm": 404.0,
"learning_rate": 5e-05,
"loss": 5.0477,
"loss/crossentropy": 4.424153804779053,
"loss/dist_ce": 0.0,
"loss/hidden": 3.53125,
"loss/idx": 0.0,
"loss/logits": 1.5164613723754883,
"step": 8
},
{
"epoch": 7.417358266645788e-05,
"grad_norm": 83.5,
"learning_rate": 5e-05,
"loss": 3.1549,
"loss/crossentropy": 3.354282855987549,
"loss/dist_ce": 0.0,
"loss/hidden": 2.296875,
"loss/idx": 0.0,
"loss/logits": 0.8579829931259155,
"step": 9
},
{
"epoch": 8.241509185161987e-05,
"grad_norm": 115.5,
"learning_rate": 5e-05,
"loss": 3.1588,
"loss/crossentropy": 3.1871225833892822,
"loss/dist_ce": 0.0,
"loss/hidden": 2.375,
"loss/idx": 0.0,
"loss/logits": 0.7837648391723633,
"step": 10
},
{
"epoch": 9.065660103678186e-05,
"grad_norm": 252.0,
"learning_rate": 5e-05,
"loss": 7.2603,
"loss/crossentropy": 4.682134628295898,
"loss/dist_ce": 0.0,
"loss/hidden": 5.0,
"loss/idx": 0.0,
"loss/logits": 2.2602720260620117,
"step": 11
},
{
"epoch": 9.889811022194384e-05,
"grad_norm": 109.0,
"learning_rate": 5e-05,
"loss": 3.1302,
"loss/crossentropy": 2.417746067047119,
"loss/dist_ce": 0.0,
"loss/hidden": 2.46875,
"loss/idx": 0.0,
"loss/logits": 0.661416232585907,
"step": 12
},
{
"epoch": 0.00010713961940710583,
"grad_norm": 68.5,
"learning_rate": 5e-05,
"loss": 2.4003,
"loss/crossentropy": 1.6968345642089844,
"loss/dist_ce": 0.0,
"loss/hidden": 1.984375,
"loss/idx": 0.0,
"loss/logits": 0.4158973693847656,
"step": 13
},
{
"epoch": 0.00011538112859226781,
"grad_norm": 454.0,
"learning_rate": 5e-05,
"loss": 7.3347,
"loss/crossentropy": 4.652151584625244,
"loss/dist_ce": 0.0,
"loss/hidden": 4.25,
"loss/idx": 0.0,
"loss/logits": 3.084686756134033,
"step": 14
},
{
"epoch": 0.00012362263777742982,
"grad_norm": 126.5,
"learning_rate": 5e-05,
"loss": 2.4695,
"loss/crossentropy": 3.0716333389282227,
"loss/dist_ce": 0.0,
"loss/hidden": 1.875,
"loss/idx": 0.0,
"loss/logits": 0.594476580619812,
"step": 15
},
{
"epoch": 0.0001318641469625918,
"grad_norm": 306.0,
"grad_norm_var": 53540.9625,
"learning_rate": 5e-05,
"loss": 3.9132,
"loss/crossentropy": 2.430070638656616,
"loss/dist_ce": 0.0,
"loss/hidden": 2.9375,
"loss/idx": 0.0,
"loss/logits": 0.9756777882575989,
"step": 16
},
{
"epoch": 0.00014010565614775377,
"grad_norm": 68.5,
"grad_norm_var": 41986.49895833333,
"learning_rate": 5e-05,
"loss": 2.3829,
"loss/crossentropy": 1.8029091358184814,
"loss/dist_ce": 0.0,
"loss/hidden": 1.8828125,
"loss/idx": 0.0,
"loss/logits": 0.5000446438789368,
"step": 17
},
{
"epoch": 0.00014834716533291577,
"grad_norm": 139.0,
"grad_norm_var": 21647.707291666666,
"learning_rate": 5e-05,
"loss": 2.5801,
"loss/crossentropy": 1.5956979990005493,
"loss/dist_ce": 0.0,
"loss/hidden": 2.15625,
"loss/idx": 0.0,
"loss/logits": 0.4238685965538025,
"step": 18
},
{
"epoch": 0.00015658867451807774,
"grad_norm": 28.625,
"grad_norm_var": 20272.937434895834,
"learning_rate": 5e-05,
"loss": 1.6796,
"loss/crossentropy": 2.664867401123047,
"loss/dist_ce": 0.0,
"loss/hidden": 1.296875,
"loss/idx": 0.0,
"loss/logits": 0.3827553689479828,
"step": 19
},
{
"epoch": 0.00016483018370323975,
"grad_norm": 81.5,
"grad_norm_var": 21299.079622395835,
"learning_rate": 5e-05,
"loss": 1.8862,
"loss/crossentropy": 3.0564301013946533,
"loss/dist_ce": 0.0,
"loss/hidden": 1.4609375,
"loss/idx": 0.0,
"loss/logits": 0.42524370551109314,
"step": 20
},
{
"epoch": 0.00017307169288840172,
"grad_norm": 71.5,
"grad_norm_var": 17047.856184895834,
"learning_rate": 5e-05,
"loss": 1.8175,
"loss/crossentropy": 1.5220972299575806,
"loss/dist_ce": 0.0,
"loss/hidden": 1.578125,
"loss/idx": 0.0,
"loss/logits": 0.23937611281871796,
"step": 21
},
{
"epoch": 0.00018131320207356372,
"grad_norm": 32.25,
"grad_norm_var": 17021.051497395834,
"learning_rate": 5e-05,
"loss": 1.636,
"loss/crossentropy": 1.8798402547836304,
"loss/dist_ce": 0.0,
"loss/hidden": 1.3125,
"loss/idx": 0.0,
"loss/logits": 0.32345157861709595,
"step": 22
},
{
"epoch": 0.0001895547112587257,
"grad_norm": 31.25,
"grad_norm_var": 17761.729622395833,
"learning_rate": 5e-05,
"loss": 1.5306,
"loss/crossentropy": 3.0712087154388428,
"loss/dist_ce": 0.0,
"loss/hidden": 1.171875,
"loss/idx": 0.0,
"loss/logits": 0.3587738275527954,
"step": 23
},
{
"epoch": 0.00019779622044388767,
"grad_norm": 13.375,
"grad_norm_var": 13976.939583333333,
"learning_rate": 5e-05,
"loss": 1.0326,
"loss/crossentropy": 2.2200183868408203,
"loss/dist_ce": 0.0,
"loss/hidden": 0.8125,
"loss/idx": 0.0,
"loss/logits": 0.2200760841369629,
"step": 24
},
{
"epoch": 0.00020603772962904968,
"grad_norm": 28.125,
"grad_norm_var": 14466.229622395833,
"learning_rate": 5e-05,
"loss": 1.509,
"loss/crossentropy": 3.206345319747925,
"loss/dist_ce": 0.0,
"loss/hidden": 1.1484375,
"loss/idx": 0.0,
"loss/logits": 0.3605613112449646,
"step": 25
},
{
"epoch": 0.00021427923881421165,
"grad_norm": 153.0,
"grad_norm_var": 14529.862434895833,
"learning_rate": 5e-05,
"loss": 2.1474,
"loss/crossentropy": 1.5313490629196167,
"loss/dist_ce": 0.0,
"loss/hidden": 1.765625,
"loss/idx": 0.0,
"loss/logits": 0.38178950548171997,
"step": 26
},
{
"epoch": 0.00022252074799937365,
"grad_norm": 94.0,
"grad_norm_var": 13366.093684895834,
"learning_rate": 5e-05,
"loss": 3.417,
"loss/crossentropy": 1.551514744758606,
"loss/dist_ce": 0.0,
"loss/hidden": 2.75,
"loss/idx": 0.0,
"loss/logits": 0.6670438051223755,
"step": 27
},
{
"epoch": 0.00023076225718453563,
"grad_norm": 268.0,
"grad_norm_var": 14865.165559895833,
"learning_rate": 5e-05,
"loss": 1.9003,
"loss/crossentropy": 3.108414649963379,
"loss/dist_ce": 0.0,
"loss/hidden": 1.421875,
"loss/idx": 0.0,
"loss/logits": 0.4784301221370697,
"step": 28
},
{
"epoch": 0.00023900376636969763,
"grad_norm": 34.25,
"grad_norm_var": 15186.259309895833,
"learning_rate": 5e-05,
"loss": 1.0626,
"loss/crossentropy": 3.3259568214416504,
"loss/dist_ce": 0.0,
"loss/hidden": 0.81640625,
"loss/idx": 0.0,
"loss/logits": 0.24623815715312958,
"step": 29
},
{
"epoch": 0.00024724527555485963,
"grad_norm": 167.0,
"grad_norm_var": 7576.8728515625,
"learning_rate": 5e-05,
"loss": 1.4178,
"loss/crossentropy": 1.5920592546463013,
"loss/dist_ce": 0.0,
"loss/hidden": 1.28125,
"loss/idx": 0.0,
"loss/logits": 0.13655498623847961,
"step": 30
},
{
"epoch": 0.0002554867847400216,
"grad_norm": 12.6875,
"grad_norm_var": 8024.979931640625,
"learning_rate": 5e-05,
"loss": 0.9625,
"loss/crossentropy": 2.868499517440796,
"loss/dist_ce": 0.0,
"loss/hidden": 0.7421875,
"loss/idx": 0.0,
"loss/logits": 0.2203603982925415,
"step": 31
},
{
"epoch": 0.0002637282939251836,
"grad_norm": 49.5,
"grad_norm_var": 4940.166650390625,
"learning_rate": 5e-05,
"loss": 1.0668,
"loss/crossentropy": 2.660956859588623,
"loss/dist_ce": 0.0,
"loss/hidden": 0.8359375,
"loss/idx": 0.0,
"loss/logits": 0.23084667325019836,
"step": 32
},
{
"epoch": 0.0002719698031103456,
"grad_norm": 18.875,
"grad_norm_var": 5167.097639973958,
"learning_rate": 5e-05,
"loss": 0.9463,
"loss/crossentropy": 1.6037225723266602,
"loss/dist_ce": 0.0,
"loss/hidden": 0.8125,
"loss/idx": 0.0,
"loss/logits": 0.13378173112869263,
"step": 33
},
{
"epoch": 0.00028021131229550753,
"grad_norm": 18.75,
"grad_norm_var": 5067.703499348959,
"learning_rate": 5e-05,
"loss": 1.0141,
"loss/crossentropy": 1.0409276485443115,
"loss/dist_ce": 0.0,
"loss/hidden": 0.84375,
"loss/idx": 0.0,
"loss/logits": 0.1703900545835495,
"step": 34
},
{
"epoch": 0.00028845282148066954,
"grad_norm": 16.5,
"grad_norm_var": 5142.032275390625,
"learning_rate": 5e-05,
"loss": 0.9889,
"loss/crossentropy": 1.5536582469940186,
"loss/dist_ce": 0.0,
"loss/hidden": 0.8359375,
"loss/idx": 0.0,
"loss/logits": 0.1529390513896942,
"step": 35
},
{
"epoch": 0.00029669433066583154,
"grad_norm": 9.8125,
"grad_norm_var": 5335.719205729167,
"learning_rate": 5e-05,
"loss": 0.8687,
"loss/crossentropy": 2.7224836349487305,
"loss/dist_ce": 0.0,
"loss/hidden": 0.671875,
"loss/idx": 0.0,
"loss/logits": 0.19684143364429474,
"step": 36
},
{
"epoch": 0.00030493583985099354,
"grad_norm": 7.125,
"grad_norm_var": 5527.603645833334,
"learning_rate": 5e-05,
"loss": 0.5503,
"loss/crossentropy": 2.5596024990081787,
"loss/dist_ce": 0.0,
"loss/hidden": 0.4296875,
"loss/idx": 0.0,
"loss/logits": 0.12058012187480927,
"step": 37
},
{
"epoch": 0.0003131773490361555,
"grad_norm": 13.1875,
"grad_norm_var": 5619.972379557292,
"learning_rate": 5e-05,
"loss": 0.8248,
"loss/crossentropy": 2.8074352741241455,
"loss/dist_ce": 0.0,
"loss/hidden": 0.62109375,
"loss/idx": 0.0,
"loss/logits": 0.2037278115749359,
"step": 38
},
{
"epoch": 0.0003214188582213175,
"grad_norm": 37.75,
"grad_norm_var": 5599.026806640625,
"learning_rate": 5e-05,
"loss": 1.3462,
"loss/crossentropy": 1.5375018119812012,
"loss/dist_ce": 0.0,
"loss/hidden": 1.15625,
"loss/idx": 0.0,
"loss/logits": 0.18997883796691895,
"step": 39
},
{
"epoch": 0.0003296603674064795,
"grad_norm": 7.28125,
"grad_norm_var": 5638.313244628906,
"learning_rate": 5e-05,
"loss": 0.6618,
"loss/crossentropy": 2.4615395069122314,
"loss/dist_ce": 0.0,
"loss/hidden": 0.51171875,
"loss/idx": 0.0,
"loss/logits": 0.15010175108909607,
"step": 40
},
{
"epoch": 0.00033790187659164144,
"grad_norm": 68.5,
"grad_norm_var": 5576.730855305989,
"learning_rate": 5e-05,
"loss": 0.7963,
"loss/crossentropy": 1.1309521198272705,
"loss/dist_ce": 0.0,
"loss/hidden": 0.6796875,
"loss/idx": 0.0,
"loss/logits": 0.11660157144069672,
"step": 41
},
{
"epoch": 0.00034614338577680344,
"grad_norm": 10.0625,
"grad_norm_var": 5100.57030843099,
"learning_rate": 5e-05,
"loss": 0.6657,
"loss/crossentropy": 2.225135326385498,
"loss/dist_ce": 0.0,
"loss/hidden": 0.54296875,
"loss/idx": 0.0,
"loss/logits": 0.12272368371486664,
"step": 42
},
{
"epoch": 0.00035438489496196544,
"grad_norm": 15.125,
"grad_norm_var": 5048.541564941406,
"learning_rate": 5e-05,
"loss": 0.7119,
"loss/crossentropy": 0.907244861125946,
"loss/dist_ce": 0.0,
"loss/hidden": 0.6328125,
"loss/idx": 0.0,
"loss/logits": 0.07912808656692505,
"step": 43
},
{
"epoch": 0.00036262640414712745,
"grad_norm": 11.25,
"grad_norm_var": 1608.158426920573,
"learning_rate": 5e-05,
"loss": 0.757,
"loss/crossentropy": 1.7073473930358887,
"loss/dist_ce": 0.0,
"loss/hidden": 0.62109375,
"loss/idx": 0.0,
"loss/logits": 0.1358700692653656,
"step": 44
},
{
"epoch": 0.0003708679133322894,
"grad_norm": 8.3125,
"grad_norm_var": 1639.323954264323,
"learning_rate": 5e-05,
"loss": 0.7863,
"loss/crossentropy": 2.7458887100219727,
"loss/dist_ce": 0.0,
"loss/hidden": 0.62890625,
"loss/idx": 0.0,
"loss/logits": 0.15743763744831085,
"step": 45
},
{
"epoch": 0.0003791094225174514,
"grad_norm": 7.15625,
"grad_norm_var": 305.3570963541667,
"learning_rate": 5e-05,
"loss": 0.7888,
"loss/crossentropy": 3.2708899974823,
"loss/dist_ce": 0.0,
"loss/hidden": 0.58984375,
"loss/idx": 0.0,
"loss/logits": 0.19896197319030762,
"step": 46
},
{
"epoch": 0.0003873509317026134,
"grad_norm": 95.5,
"grad_norm_var": 658.8413899739584,
"learning_rate": 5e-05,
"loss": 1.2412,
"loss/crossentropy": 2.0868113040924072,
"loss/dist_ce": 0.0,
"loss/hidden": 1.0,
"loss/idx": 0.0,
"loss/logits": 0.24115484952926636,
"step": 47
},
{
"epoch": 0.00039559244088777535,
"grad_norm": 7.53125,
"grad_norm_var": 629.9714803059895,
"learning_rate": 5e-05,
"loss": 0.7909,
"loss/crossentropy": 2.5569632053375244,
"loss/dist_ce": 0.0,
"loss/hidden": 0.6171875,
"loss/idx": 0.0,
"loss/logits": 0.17373064160346985,
"step": 48
},
{
"epoch": 0.00040383395007293735,
"grad_norm": 7.125,
"grad_norm_var": 643.5665974934896,
"learning_rate": 5e-05,
"loss": 0.5926,
"loss/crossentropy": 1.3575685024261475,
"loss/dist_ce": 0.0,
"loss/hidden": 0.486328125,
"loss/idx": 0.0,
"loss/logits": 0.10628513246774673,
"step": 49
},
{
"epoch": 0.00041207545925809935,
"grad_norm": 14.0625,
"grad_norm_var": 646.5402303059896,
"learning_rate": 5e-05,
"loss": 0.5641,
"loss/crossentropy": 1.111220359802246,
"loss/dist_ce": 0.0,
"loss/hidden": 0.486328125,
"loss/idx": 0.0,
"loss/logits": 0.07779324799776077,
"step": 50
},
{
"epoch": 0.00042031696844326135,
"grad_norm": 5.375,
"grad_norm_var": 660.9766560872396,
"learning_rate": 5e-05,
"loss": 0.622,
"loss/crossentropy": 2.907522678375244,
"loss/dist_ce": 0.0,
"loss/hidden": 0.498046875,
"loss/idx": 0.0,
"loss/logits": 0.12399697303771973,
"step": 51
},
{
"epoch": 0.0004285584776284233,
"grad_norm": 9.75,
"grad_norm_var": 661.0644816080729,
"learning_rate": 5e-05,
"loss": 0.5746,
"loss/crossentropy": 2.72662353515625,
"loss/dist_ce": 0.0,
"loss/hidden": 0.4453125,
"loss/idx": 0.0,
"loss/logits": 0.12924730777740479,
"step": 52
},
{
"epoch": 0.0004367999868135853,
"grad_norm": 4.9375,
"grad_norm_var": 665.2116170247396,
"learning_rate": 5e-05,
"loss": 0.6226,
"loss/crossentropy": 2.3917365074157715,
"loss/dist_ce": 0.0,
"loss/hidden": 0.498046875,
"loss/idx": 0.0,
"loss/logits": 0.1245586946606636,
"step": 53
},
{
"epoch": 0.0004450414959987473,
"grad_norm": 28.0,
"grad_norm_var": 665.1113240559896,
"learning_rate": 5e-05,
"loss": 0.8407,
"loss/crossentropy": 2.7228264808654785,
"loss/dist_ce": 0.0,
"loss/hidden": 0.65234375,
"loss/idx": 0.0,
"loss/logits": 0.18836885690689087,
"step": 54
},
{
"epoch": 0.00045328300518390925,
"grad_norm": 32.5,
"grad_norm_var": 655.1841756184896,
"learning_rate": 5e-05,
"loss": 0.6963,
"loss/crossentropy": 2.543640375137329,
"loss/dist_ce": 0.0,
"loss/hidden": 0.56640625,
"loss/idx": 0.0,
"loss/logits": 0.12986385822296143,
"step": 55
},
{
"epoch": 0.00046152451436907126,
"grad_norm": 16.875,
"grad_norm_var": 643.6704264322917,
"learning_rate": 5e-05,
"loss": 0.7344,
"loss/crossentropy": 1.649795413017273,
"loss/dist_ce": 0.0,
"loss/hidden": 0.609375,
"loss/idx": 0.0,
"loss/logits": 0.12504054605960846,
"step": 56
},
{
"epoch": 0.00046976602355423326,
"grad_norm": 28.0,
"grad_norm_var": 491.7321451822917,
"learning_rate": 5e-05,
"loss": 0.7995,
"loss/crossentropy": 1.5515227317810059,
"loss/dist_ce": 0.0,
"loss/hidden": 0.6796875,
"loss/idx": 0.0,
"loss/logits": 0.11983367055654526,
"step": 57
},
{
"epoch": 0.00047800753273939526,
"grad_norm": 4.15625,
"grad_norm_var": 500.8306925455729,
"learning_rate": 5e-05,
"loss": 0.4793,
"loss/crossentropy": 1.7439237833023071,
"loss/dist_ce": 0.0,
"loss/hidden": 0.388671875,
"loss/idx": 0.0,
"loss/logits": 0.09059557318687439,
"step": 58
},
{
"epoch": 0.0004862490419245572,
"grad_norm": 14.5,
"grad_norm_var": 501.1345662434896,
"learning_rate": 5e-05,
"loss": 1.0179,
"loss/crossentropy": 1.387863039970398,
"loss/dist_ce": 0.0,
"loss/hidden": 0.8828125,
"loss/idx": 0.0,
"loss/logits": 0.1351165473461151,
"step": 59
},
{
"epoch": 0.0004944905511097193,
"grad_norm": 8.0625,
"grad_norm_var": 504.82509358723956,
"learning_rate": 5e-05,
"loss": 0.5969,
"loss/crossentropy": 1.6710844039916992,
"loss/dist_ce": 0.0,
"loss/hidden": 0.490234375,
"loss/idx": 0.0,
"loss/logits": 0.10665580630302429,
"step": 60
},
{
"epoch": 0.0005027320602948812,
"grad_norm": 18.0,
"grad_norm_var": 497.86724853515625,
"learning_rate": 5e-05,
"loss": 0.5408,
"loss/crossentropy": 1.0266728401184082,
"loss/dist_ce": 0.0,
"loss/hidden": 0.47265625,
"loss/idx": 0.0,
"loss/logits": 0.068178191781044,
"step": 61
},
{
"epoch": 0.0005109735694800432,
"grad_norm": 7.46875,
"grad_norm_var": 497.38629150390625,
"learning_rate": 5e-05,
"loss": 0.8584,
"loss/crossentropy": 2.9015908241271973,
"loss/dist_ce": 0.0,
"loss/hidden": 0.6796875,
"loss/idx": 0.0,
"loss/logits": 0.1787503957748413,
"step": 62
},
{
"epoch": 0.0005192150786652052,
"grad_norm": 7.84375,
"grad_norm_var": 81.943603515625,
"learning_rate": 5e-05,
"loss": 0.4871,
"loss/crossentropy": 1.605463981628418,
"loss/dist_ce": 0.0,
"loss/hidden": 0.40234375,
"loss/idx": 0.0,
"loss/logits": 0.08476820588111877,
"step": 63
},
{
"epoch": 0.0005274565878503672,
"grad_norm": 6.3125,
"grad_norm_var": 82.98795166015626,
"learning_rate": 5e-05,
"loss": 0.4591,
"loss/crossentropy": 1.7012584209442139,
"loss/dist_ce": 0.0,
"loss/hidden": 0.373046875,
"loss/idx": 0.0,
"loss/logits": 0.0860566645860672,
"step": 64
},
{
"epoch": 0.0005356980970355292,
"grad_norm": 8.625,
"grad_norm_var": 81.89146728515625,
"learning_rate": 5e-05,
"loss": 0.513,
"loss/crossentropy": 1.6209317445755005,
"loss/dist_ce": 0.0,
"loss/hidden": 0.416015625,
"loss/idx": 0.0,
"loss/logits": 0.09696009755134583,
"step": 65
},
{
"epoch": 0.0005439396062206912,
"grad_norm": 5.0,
"grad_norm_var": 86.22919514973958,
"learning_rate": 5e-05,
"loss": 0.4936,
"loss/crossentropy": 2.992037773132324,
"loss/dist_ce": 0.0,
"loss/hidden": 0.37890625,
"loss/idx": 0.0,
"loss/logits": 0.114667147397995,
"step": 66
},
{
"epoch": 0.0005521811154058532,
"grad_norm": 10.5625,
"grad_norm_var": 82.74924723307292,
"learning_rate": 5e-05,
"loss": 0.9616,
"loss/crossentropy": 2.2757253646850586,
"loss/dist_ce": 0.0,
"loss/hidden": 0.75,
"loss/idx": 0.0,
"loss/logits": 0.21162353456020355,
"step": 67
},
{
"epoch": 0.0005604226245910151,
"grad_norm": 6.0625,
"grad_norm_var": 85.27672119140625,
"learning_rate": 5e-05,
"loss": 0.3899,
"loss/crossentropy": 0.7420970797538757,
"loss/dist_ce": 0.0,
"loss/hidden": 0.34765625,
"loss/idx": 0.0,
"loss/logits": 0.04225603863596916,
"step": 68
},
{
"epoch": 0.0005686641337761771,
"grad_norm": 4.78125,
"grad_norm_var": 85.44479166666666,
"learning_rate": 5e-05,
"loss": 0.3729,
"loss/crossentropy": 1.957132339477539,
"loss/dist_ce": 0.0,
"loss/hidden": 0.296875,
"loss/idx": 0.0,
"loss/logits": 0.07599128782749176,
"step": 69
},
{
"epoch": 0.0005769056429613391,
"grad_norm": 8.875,
"grad_norm_var": 69.85592447916666,
"learning_rate": 5e-05,
"loss": 0.5605,
"loss/crossentropy": 2.908198356628418,
"loss/dist_ce": 0.0,
"loss/hidden": 0.4453125,
"loss/idx": 0.0,
"loss/logits": 0.1151949018239975,
"step": 70
},
{
"epoch": 0.0005851471521465011,
"grad_norm": 6.125,
"grad_norm_var": 40.280208333333334,
"learning_rate": 5e-05,
"loss": 0.4273,
"loss/crossentropy": 1.8814876079559326,
"loss/dist_ce": 0.0,
"loss/hidden": 0.33984375,
"loss/idx": 0.0,
"loss/logits": 0.08749841153621674,
"step": 71
},
{
"epoch": 0.0005933886613316631,
"grad_norm": 3.625,
"grad_norm_var": 39.245052083333334,
"learning_rate": 5e-05,
"loss": 0.3375,
"loss/crossentropy": 1.5110681056976318,
"loss/dist_ce": 0.0,
"loss/hidden": 0.275390625,
"loss/idx": 0.0,
"loss/logits": 0.062085069715976715,
"step": 72
},
{
"epoch": 0.0006016301705168251,
"grad_norm": 4.09375,
"grad_norm_var": 15.198726399739583,
"learning_rate": 5e-05,
"loss": 0.3922,
"loss/crossentropy": 2.652179479598999,
"loss/dist_ce": 0.0,
"loss/hidden": 0.306640625,
"loss/idx": 0.0,
"loss/logits": 0.085569366812706,
"step": 73
},
{
"epoch": 0.0006098716797019871,
"grad_norm": 12.0,
"grad_norm_var": 15.279410807291667,
"learning_rate": 5e-05,
"loss": 0.7232,
"loss/crossentropy": 1.3632968664169312,
"loss/dist_ce": 0.0,
"loss/hidden": 0.625,
"loss/idx": 0.0,
"loss/logits": 0.09823663532733917,
"step": 74
},
{
"epoch": 0.000618113188887149,
"grad_norm": 3.15625,
"grad_norm_var": 13.862919108072917,
"learning_rate": 5e-05,
"loss": 0.3533,
"loss/crossentropy": 1.3768854141235352,
"loss/dist_ce": 0.0,
"loss/hidden": 0.30078125,
"loss/idx": 0.0,
"loss/logits": 0.05253131687641144,
"step": 75
},
{
"epoch": 0.000626354698072311,
"grad_norm": 4.5,
"grad_norm_var": 14.406571451822916,
"learning_rate": 5e-05,
"loss": 0.5053,
"loss/crossentropy": 2.9618265628814697,
"loss/dist_ce": 0.0,
"loss/hidden": 0.373046875,
"loss/idx": 0.0,
"loss/logits": 0.13229554891586304,
"step": 76
},
{
"epoch": 0.000634596207257473,
"grad_norm": 3.96875,
"grad_norm_var": 6.720442708333334,
"learning_rate": 5e-05,
"loss": 0.4483,
"loss/crossentropy": 1.52455472946167,
"loss/dist_ce": 0.0,
"loss/hidden": 0.3828125,
"loss/idx": 0.0,
"loss/logits": 0.065483957529068,
"step": 77
},
{
"epoch": 0.000642837716442635,
"grad_norm": 3.40625,
"grad_norm_var": 7.193343098958334,
"learning_rate": 5e-05,
"loss": 0.3286,
"loss/crossentropy": 2.478041648864746,
"loss/dist_ce": 0.0,
"loss/hidden": 0.255859375,
"loss/idx": 0.0,
"loss/logits": 0.0727752074599266,
"step": 78
},
{
"epoch": 0.000651079225627797,
"grad_norm": 3.765625,
"grad_norm_var": 7.330077107747396,
"learning_rate": 5e-05,
"loss": 0.382,
"loss/crossentropy": 1.4515001773834229,
"loss/dist_ce": 0.0,
"loss/hidden": 0.31640625,
"loss/idx": 0.0,
"loss/logits": 0.06562215089797974,
"step": 79
},
{
"epoch": 0.000659320734812959,
"grad_norm": 3.265625,
"grad_norm_var": 7.754378255208334,
"learning_rate": 5e-05,
"loss": 0.4323,
"loss/crossentropy": 1.748592495918274,
"loss/dist_ce": 0.0,
"loss/hidden": 0.34375,
"loss/idx": 0.0,
"loss/logits": 0.08855777978897095,
"step": 80
},
{
"epoch": 0.000667562243998121,
"grad_norm": 11.9375,
"grad_norm_var": 9.71513671875,
"learning_rate": 5e-05,
"loss": 0.465,
"loss/crossentropy": 2.6704611778259277,
"loss/dist_ce": 0.0,
"loss/hidden": 0.369140625,
"loss/idx": 0.0,
"loss/logits": 0.09580960124731064,
"step": 81
},
{
"epoch": 0.0006758037531832829,
"grad_norm": 23.0,
"grad_norm_var": 27.69638671875,
"learning_rate": 5e-05,
"loss": 0.8228,
"loss/crossentropy": 0.5576035976409912,
"loss/dist_ce": 0.0,
"loss/hidden": 0.66796875,
"loss/idx": 0.0,
"loss/logits": 0.1548667550086975,
"step": 82
},
{
"epoch": 0.0006840452623684449,
"grad_norm": 5.59375,
"grad_norm_var": 26.92584228515625,
"learning_rate": 5e-05,
"loss": 0.4884,
"loss/crossentropy": 0.8048841953277588,
"loss/dist_ce": 0.0,
"loss/hidden": 0.43359375,
"loss/idx": 0.0,
"loss/logits": 0.05478814244270325,
"step": 83
},
{
"epoch": 0.0006922867715536069,
"grad_norm": 4.96875,
"grad_norm_var": 27.102294921875,
"learning_rate": 5e-05,
"loss": 0.5603,
"loss/crossentropy": 2.0498743057250977,
"loss/dist_ce": 0.0,
"loss/hidden": 0.46875,
"loss/idx": 0.0,
"loss/logits": 0.0915648490190506,
"step": 84
},
{
"epoch": 0.0007005282807387689,
"grad_norm": 7.8125,
"grad_norm_var": 26.90455322265625,
"learning_rate": 5e-05,
"loss": 0.5343,
"loss/crossentropy": 1.643184781074524,
"loss/dist_ce": 0.0,
"loss/hidden": 0.44140625,
"loss/idx": 0.0,
"loss/logits": 0.09292187541723251,
"step": 85
},
{
"epoch": 0.0007087697899239309,
"grad_norm": 5.34375,
"grad_norm_var": 26.745003255208335,
"learning_rate": 5e-05,
"loss": 0.537,
"loss/crossentropy": 2.662973165512085,
"loss/dist_ce": 0.0,
"loss/hidden": 0.439453125,
"loss/idx": 0.0,
"loss/logits": 0.09757896512746811,
"step": 86
},
{
"epoch": 0.0007170112991090929,
"grad_norm": 2.4375,
"grad_norm_var": 27.857975260416666,
"learning_rate": 5e-05,
"loss": 0.2826,
"loss/crossentropy": 1.4633480310440063,
"loss/dist_ce": 0.0,
"loss/hidden": 0.236328125,
"loss/idx": 0.0,
"loss/logits": 0.0462251678109169,
"step": 87
},
{
"epoch": 0.0007252528082942549,
"grad_norm": 7.21875,
"grad_norm_var": 27.32125244140625,
"learning_rate": 5e-05,
"loss": 0.7004,
"loss/crossentropy": 2.25626277923584,
"loss/dist_ce": 0.0,
"loss/hidden": 0.55859375,
"loss/idx": 0.0,
"loss/logits": 0.14179712533950806,
"step": 88
},
{
"epoch": 0.0007334943174794168,
"grad_norm": 10.0625,
"grad_norm_var": 27.510107421875,
"learning_rate": 5e-05,
"loss": 0.341,
"loss/crossentropy": 2.072801113128662,
"loss/dist_ce": 0.0,
"loss/hidden": 0.275390625,
"loss/idx": 0.0,
"loss/logits": 0.0656304880976677,
"step": 89
},
{
"epoch": 0.0007417358266645788,
"grad_norm": 3.46875,
"grad_norm_var": 26.40260009765625,
"learning_rate": 5e-05,
"loss": 0.3528,
"loss/crossentropy": 1.6771039962768555,
"loss/dist_ce": 0.0,
"loss/hidden": 0.29296875,
"loss/idx": 0.0,
"loss/logits": 0.059821829199790955,
"step": 90
},
{
"epoch": 0.0007499773358497408,
"grad_norm": 2.640625,
"grad_norm_var": 26.648696899414062,
"learning_rate": 5e-05,
"loss": 0.2959,
"loss/crossentropy": 1.3347995281219482,
"loss/dist_ce": 0.0,
"loss/hidden": 0.24609375,
"loss/idx": 0.0,
"loss/logits": 0.049777351319789886,
"step": 91
},
{
"epoch": 0.0007582188450349028,
"grad_norm": 5.21875,
"grad_norm_var": 26.492967732747395,
"learning_rate": 5e-05,
"loss": 0.4277,
"loss/crossentropy": 2.2866933345794678,
"loss/dist_ce": 0.0,
"loss/hidden": 0.34765625,
"loss/idx": 0.0,
"loss/logits": 0.07999749481678009,
"step": 92
},
{
"epoch": 0.0007664603542200648,
"grad_norm": 20.375,
"grad_norm_var": 37.763719685872395,
"learning_rate": 5e-05,
"loss": 0.592,
"loss/crossentropy": 0.5346123576164246,
"loss/dist_ce": 0.0,
"loss/hidden": 0.52734375,
"loss/idx": 0.0,
"loss/logits": 0.06467436254024506,
"step": 93
},
{
"epoch": 0.0007747018634052268,
"grad_norm": 5.28125,
"grad_norm_var": 36.951952107747395,
"learning_rate": 5e-05,
"loss": 0.5485,
"loss/crossentropy": 1.4174734354019165,
"loss/dist_ce": 0.0,
"loss/hidden": 0.46484375,
"loss/idx": 0.0,
"loss/logits": 0.08362259715795517,
"step": 94
},
{
"epoch": 0.0007829433725903888,
"grad_norm": 3.5,
"grad_norm_var": 37.093912760416664,
"learning_rate": 5e-05,
"loss": 0.3081,
"loss/crossentropy": 1.5907094478607178,
"loss/dist_ce": 0.0,
"loss/hidden": 0.25390625,
"loss/idx": 0.0,
"loss/logits": 0.05422712862491608,
"step": 95
},
{
"epoch": 0.0007911848817755507,
"grad_norm": 3.546875,
"grad_norm_var": 36.93508707682292,
"learning_rate": 5e-05,
"loss": 0.3014,
"loss/crossentropy": 1.4840497970581055,
"loss/dist_ce": 0.0,
"loss/hidden": 0.25,
"loss/idx": 0.0,
"loss/logits": 0.05143030732870102,
"step": 96
},
{
"epoch": 0.0007994263909607127,
"grad_norm": 6.03125,
"grad_norm_var": 35.73922526041667,
"learning_rate": 5e-05,
"loss": 0.5235,
"loss/crossentropy": 1.9094655513763428,
"loss/dist_ce": 0.0,
"loss/hidden": 0.416015625,
"loss/idx": 0.0,
"loss/logits": 0.10748349130153656,
"step": 97
},
{
"epoch": 0.0008076679001458747,
"grad_norm": 5.53125,
"grad_norm_var": 18.19996337890625,
"learning_rate": 5e-05,
"loss": 0.3909,
"loss/crossentropy": 1.4463390111923218,
"loss/dist_ce": 0.0,
"loss/hidden": 0.328125,
"loss/idx": 0.0,
"loss/logits": 0.06276652216911316,
"step": 98
},
{
"epoch": 0.0008159094093310367,
"grad_norm": 7.03125,
"grad_norm_var": 18.214937337239583,
"learning_rate": 5e-05,
"loss": 0.5097,
"loss/crossentropy": 2.1369168758392334,
"loss/dist_ce": 0.0,
"loss/hidden": 0.3984375,
"loss/idx": 0.0,
"loss/logits": 0.11129927635192871,
"step": 99
},
{
"epoch": 0.0008241509185161987,
"grad_norm": 2.28125,
"grad_norm_var": 19.135965983072918,
"learning_rate": 5e-05,
"loss": 0.2675,
"loss/crossentropy": 1.7142083644866943,
"loss/dist_ce": 0.0,
"loss/hidden": 0.22265625,
"loss/idx": 0.0,
"loss/logits": 0.044855352491140366,
"step": 100
},
{
"epoch": 0.0008323924277013607,
"grad_norm": 5.25,
"grad_norm_var": 18.965132649739584,
"learning_rate": 5e-05,
"loss": 0.3094,
"loss/crossentropy": 0.9302163124084473,
"loss/dist_ce": 0.0,
"loss/hidden": 0.267578125,
"loss/idx": 0.0,
"loss/logits": 0.041776590049266815,
"step": 101
},
{
"epoch": 0.0008406339368865227,
"grad_norm": 9.9375,
"grad_norm_var": 19.911995442708335,
"learning_rate": 5e-05,
"loss": 0.4483,
"loss/crossentropy": 2.0367283821105957,
"loss/dist_ce": 0.0,
"loss/hidden": 0.36328125,
"loss/idx": 0.0,
"loss/logits": 0.08506779372692108,
"step": 102
},
{
"epoch": 0.0008488754460716846,
"grad_norm": 2.984375,
"grad_norm_var": 19.65354715983073,
"learning_rate": 5e-05,
"loss": 0.3074,
"loss/crossentropy": 2.5578417778015137,
"loss/dist_ce": 0.0,
"loss/hidden": 0.25,
"loss/idx": 0.0,
"loss/logits": 0.05742755904793739,
"step": 103
},
{
"epoch": 0.0008571169552568466,
"grad_norm": 3.96875,
"grad_norm_var": 19.903644816080728,
"learning_rate": 5e-05,
"loss": 0.3362,
"loss/crossentropy": 1.235908031463623,
"loss/dist_ce": 0.0,
"loss/hidden": 0.28515625,
"loss/idx": 0.0,
"loss/logits": 0.05100230872631073,
"step": 104
},
{
"epoch": 0.0008653584644420086,
"grad_norm": 13.1875,
"grad_norm_var": 22.177814737955728,
"learning_rate": 5e-05,
"loss": 0.5365,
"loss/crossentropy": 1.3280326128005981,
"loss/dist_ce": 0.0,
"loss/hidden": 0.4765625,
"loss/idx": 0.0,
"loss/logits": 0.05990615114569664,
"step": 105
},
{
"epoch": 0.0008735999736271706,
"grad_norm": 3.640625,
"grad_norm_var": 22.11558837890625,
"learning_rate": 5e-05,
"loss": 0.3851,
"loss/crossentropy": 2.764561414718628,
"loss/dist_ce": 0.0,
"loss/hidden": 0.296875,
"loss/idx": 0.0,
"loss/logits": 0.08821941912174225,
"step": 106
},
{
"epoch": 0.0008818414828123326,
"grad_norm": 5.875,
"grad_norm_var": 21.201919555664062,
"learning_rate": 5e-05,
"loss": 0.3966,
"loss/crossentropy": 2.7057063579559326,
"loss/dist_ce": 0.0,
"loss/hidden": 0.31640625,
"loss/idx": 0.0,
"loss/logits": 0.08024018257856369,
"step": 107
},
{
"epoch": 0.0008900829919974946,
"grad_norm": 3.375,
"grad_norm_var": 21.723835245768228,
"learning_rate": 5e-05,
"loss": 0.3745,
"loss/crossentropy": 1.9366930723190308,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2890625,
"loss/idx": 0.0,
"loss/logits": 0.08542559295892715,
"step": 108
},
{
"epoch": 0.0008983245011826566,
"grad_norm": 3.796875,
"grad_norm_var": 7.927079264322916,
"learning_rate": 5e-05,
"loss": 0.3125,
"loss/crossentropy": 2.499528408050537,
"loss/dist_ce": 0.0,
"loss/hidden": 0.25,
"loss/idx": 0.0,
"loss/logits": 0.062451932579278946,
"step": 109
},
{
"epoch": 0.0009065660103678185,
"grad_norm": 4.59375,
"grad_norm_var": 7.960738118489584,
"learning_rate": 5e-05,
"loss": 0.3585,
"loss/crossentropy": 2.2361199855804443,
"loss/dist_ce": 0.0,
"loss/hidden": 0.28515625,
"loss/idx": 0.0,
"loss/logits": 0.07333969324827194,
"step": 110
},
{
"epoch": 0.0009148075195529805,
"grad_norm": 2.671875,
"grad_norm_var": 8.200495402018229,
"learning_rate": 5e-05,
"loss": 0.2519,
"loss/crossentropy": 1.3813279867172241,
"loss/dist_ce": 0.0,
"loss/hidden": 0.21484375,
"loss/idx": 0.0,
"loss/logits": 0.03708701953291893,
"step": 111
},
{
"epoch": 0.0009230490287381425,
"grad_norm": 6.875,
"grad_norm_var": 8.145243326822916,
"learning_rate": 5e-05,
"loss": 0.6369,
"loss/crossentropy": 0.7904279232025146,
"loss/dist_ce": 0.0,
"loss/hidden": 0.55859375,
"loss/idx": 0.0,
"loss/logits": 0.07834139466285706,
"step": 112
},
{
"epoch": 0.0009312905379233045,
"grad_norm": 26.625,
"grad_norm_var": 36.27662760416667,
"learning_rate": 5e-05,
"loss": 0.811,
"loss/crossentropy": 1.7555081844329834,
"loss/dist_ce": 0.0,
"loss/hidden": 0.6171875,
"loss/idx": 0.0,
"loss/logits": 0.19379198551177979,
"step": 113
},
{
"epoch": 0.0009395320471084665,
"grad_norm": 20.875,
"grad_norm_var": 48.545633951822914,
"learning_rate": 5e-05,
"loss": 0.5722,
"loss/crossentropy": 2.7940163612365723,
"loss/dist_ce": 0.0,
"loss/hidden": 0.455078125,
"loss/idx": 0.0,
"loss/logits": 0.11716997623443604,
"step": 114
},
{
"epoch": 0.0009477735562936285,
"grad_norm": 4.1875,
"grad_norm_var": 49.299153645833336,
"learning_rate": 5e-05,
"loss": 0.3537,
"loss/crossentropy": 2.1036393642425537,
"loss/dist_ce": 0.0,
"loss/hidden": 0.291015625,
"loss/idx": 0.0,
"loss/logits": 0.06271170824766159,
"step": 115
},
{
"epoch": 0.0009560150654787905,
"grad_norm": 2.84375,
"grad_norm_var": 48.926936848958334,
"learning_rate": 5e-05,
"loss": 0.2983,
"loss/crossentropy": 1.6935715675354004,
"loss/dist_ce": 0.0,
"loss/hidden": 0.25390625,
"loss/idx": 0.0,
"loss/logits": 0.04436497390270233,
"step": 116
},
{
"epoch": 0.0009642565746639524,
"grad_norm": 2.1875,
"grad_norm_var": 50.4494140625,
"learning_rate": 5e-05,
"loss": 0.2826,
"loss/crossentropy": 0.9073331356048584,
"loss/dist_ce": 0.0,
"loss/hidden": 0.25,
"loss/idx": 0.0,
"loss/logits": 0.032610610127449036,
"step": 117
},
{
"epoch": 0.0009724980838491144,
"grad_norm": 2.203125,
"grad_norm_var": 51.52145080566406,
"learning_rate": 5e-05,
"loss": 0.1972,
"loss/crossentropy": 1.5172605514526367,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1669921875,
"loss/idx": 0.0,
"loss/logits": 0.030240532010793686,
"step": 118
},
{
"epoch": 0.0009807395930342764,
"grad_norm": 3.609375,
"grad_norm_var": 51.222215779622395,
"learning_rate": 5e-05,
"loss": 0.3043,
"loss/crossentropy": 1.3851293325424194,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2578125,
"loss/idx": 0.0,
"loss/logits": 0.04644084721803665,
"step": 119
},
{
"epoch": 0.0009889811022194385,
"grad_norm": 11.5,
"grad_norm_var": 51.8164784749349,
"learning_rate": 5e-05,
"loss": 0.3607,
"loss/crossentropy": 1.786331057548523,
"loss/dist_ce": 0.0,
"loss/hidden": 0.30078125,
"loss/idx": 0.0,
"loss/logits": 0.059897445142269135,
"step": 120
},
{
"epoch": 0.0009972226114046004,
"grad_norm": 3.421875,
"grad_norm_var": 50.212398274739584,
"learning_rate": 5e-05,
"loss": 0.3368,
"loss/crossentropy": 1.3703113794326782,
"loss/dist_ce": 0.0,
"loss/hidden": 0.283203125,
"loss/idx": 0.0,
"loss/logits": 0.05364468693733215,
"step": 121
},
{
"epoch": 0.0010054641205897623,
"grad_norm": 5.6875,
"grad_norm_var": 49.62085673014323,
"learning_rate": 5e-05,
"loss": 0.3796,
"loss/crossentropy": 1.5540196895599365,
"loss/dist_ce": 0.0,
"loss/hidden": 0.31640625,
"loss/idx": 0.0,
"loss/logits": 0.06318466365337372,
"step": 122
},
{
"epoch": 0.0010137056297749244,
"grad_norm": 3.15625,
"grad_norm_var": 50.45276590983073,
"learning_rate": 5e-05,
"loss": 0.2844,
"loss/crossentropy": 2.5752596855163574,
"loss/dist_ce": 0.0,
"loss/hidden": 0.224609375,
"loss/idx": 0.0,
"loss/logits": 0.05982211232185364,
"step": 123
},
{
"epoch": 0.0010219471389600863,
"grad_norm": 7.9375,
"grad_norm_var": 49.715518188476565,
"learning_rate": 5e-05,
"loss": 0.6157,
"loss/crossentropy": 2.424745798110962,
"loss/dist_ce": 0.0,
"loss/hidden": 0.47265625,
"loss/idx": 0.0,
"loss/logits": 0.14306676387786865,
"step": 124
},
{
"epoch": 0.0010301886481452484,
"grad_norm": 13.0625,
"grad_norm_var": 51.110791015625,
"learning_rate": 5e-05,
"loss": 0.3906,
"loss/crossentropy": 1.6216858625411987,
"loss/dist_ce": 0.0,
"loss/hidden": 0.32421875,
"loss/idx": 0.0,
"loss/logits": 0.06637328118085861,
"step": 125
},
{
"epoch": 0.0010384301573304103,
"grad_norm": 10.3125,
"grad_norm_var": 50.87027587890625,
"learning_rate": 5e-05,
"loss": 0.4044,
"loss/crossentropy": 2.009226083755493,
"loss/dist_ce": 0.0,
"loss/hidden": 0.32421875,
"loss/idx": 0.0,
"loss/logits": 0.08017371594905853,
"step": 126
},
{
"epoch": 0.0010466716665155724,
"grad_norm": 2.734375,
"grad_norm_var": 50.826558430989586,
"learning_rate": 5e-05,
"loss": 0.2593,
"loss/crossentropy": 0.31721383333206177,
"loss/dist_ce": 0.0,
"loss/hidden": 0.240234375,
"loss/idx": 0.0,
"loss/logits": 0.019071679562330246,
"step": 127
},
{
"epoch": 0.0010549131757007343,
"grad_norm": 9.0625,
"grad_norm_var": 50.811747233072914,
"learning_rate": 5e-05,
"loss": 0.4044,
"loss/crossentropy": 1.5959731340408325,
"loss/dist_ce": 0.0,
"loss/hidden": 0.3359375,
"loss/idx": 0.0,
"loss/logits": 0.06845791637897491,
"step": 128
},
{
"epoch": 0.0010631546848858962,
"grad_norm": 6.53125,
"grad_norm_var": 26.382666015625,
"learning_rate": 5e-05,
"loss": 0.404,
"loss/crossentropy": 1.000110387802124,
"loss/dist_ce": 0.0,
"loss/hidden": 0.345703125,
"loss/idx": 0.0,
"loss/logits": 0.0582566112279892,
"step": 129
},
{
"epoch": 0.0010713961940710583,
"grad_norm": 13.375,
"grad_norm_var": 15.855322265625,
"learning_rate": 5e-05,
"loss": 0.4836,
"loss/crossentropy": 1.4723174571990967,
"loss/dist_ce": 0.0,
"loss/hidden": 0.41015625,
"loss/idx": 0.0,
"loss/logits": 0.07343296706676483,
"step": 130
},
{
"epoch": 0.0010796377032562202,
"grad_norm": 1.8359375,
"grad_norm_var": 16.883135732014974,
"learning_rate": 5e-05,
"loss": 0.2167,
"loss/crossentropy": 1.884359359741211,
"loss/dist_ce": 0.0,
"loss/hidden": 0.17578125,
"loss/idx": 0.0,
"loss/logits": 0.04091912880539894,
"step": 131
},
{
"epoch": 0.0010878792124413823,
"grad_norm": 9.0625,
"grad_norm_var": 16.503775787353515,
"learning_rate": 5e-05,
"loss": 0.335,
"loss/crossentropy": 2.47468638420105,
"loss/dist_ce": 0.0,
"loss/hidden": 0.265625,
"loss/idx": 0.0,
"loss/logits": 0.06939947605133057,
"step": 132
},
{
"epoch": 0.0010961207216265442,
"grad_norm": 2.28125,
"grad_norm_var": 16.44910659790039,
"learning_rate": 5e-05,
"loss": 0.2479,
"loss/crossentropy": 1.4188505411148071,
"loss/dist_ce": 0.0,
"loss/hidden": 0.208984375,
"loss/idx": 0.0,
"loss/logits": 0.03889765217900276,
"step": 133
},
{
"epoch": 0.0011043622308117063,
"grad_norm": 3.453125,
"grad_norm_var": 15.812143707275391,
"learning_rate": 5e-05,
"loss": 0.3702,
"loss/crossentropy": 2.618537425994873,
"loss/dist_ce": 0.0,
"loss/hidden": 0.27734375,
"loss/idx": 0.0,
"loss/logits": 0.09288428723812103,
"step": 134
},
{
"epoch": 0.0011126037399968682,
"grad_norm": 14.5,
"grad_norm_var": 18.753179677327473,
"learning_rate": 5e-05,
"loss": 0.6568,
"loss/crossentropy": 3.4983789920806885,
"loss/dist_ce": 0.0,
"loss/hidden": 0.5078125,
"loss/idx": 0.0,
"loss/logits": 0.14897578954696655,
"step": 135
},
{
"epoch": 0.0011208452491820301,
"grad_norm": 3.078125,
"grad_norm_var": 18.548115793863932,
"learning_rate": 5e-05,
"loss": 0.2164,
"loss/crossentropy": 0.7671207189559937,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1875,
"loss/idx": 0.0,
"loss/logits": 0.028927473351359367,
"step": 136
},
{
"epoch": 0.0011290867583671922,
"grad_norm": 3.015625,
"grad_norm_var": 18.743755849202476,
"learning_rate": 5e-05,
"loss": 0.2917,
"loss/crossentropy": 1.4987382888793945,
"loss/dist_ce": 0.0,
"loss/hidden": 0.240234375,
"loss/idx": 0.0,
"loss/logits": 0.051444459706544876,
"step": 137
},
{
"epoch": 0.0011373282675523541,
"grad_norm": 2.515625,
"grad_norm_var": 19.85060806274414,
"learning_rate": 5e-05,
"loss": 0.2295,
"loss/crossentropy": 1.1994467973709106,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1982421875,
"loss/idx": 0.0,
"loss/logits": 0.03128223866224289,
"step": 138
},
{
"epoch": 0.0011455697767375162,
"grad_norm": 11.625,
"grad_norm_var": 20.42235895792643,
"learning_rate": 5e-05,
"loss": 0.3444,
"loss/crossentropy": 1.7283226251602173,
"loss/dist_ce": 0.0,
"loss/hidden": 0.287109375,
"loss/idx": 0.0,
"loss/logits": 0.05732431262731552,
"step": 139
},
{
"epoch": 0.0011538112859226781,
"grad_norm": 4.09375,
"grad_norm_var": 20.941615549723306,
"learning_rate": 5e-05,
"loss": 0.3394,
"loss/crossentropy": 2.3807051181793213,
"loss/dist_ce": 0.0,
"loss/hidden": 0.26171875,
"loss/idx": 0.0,
"loss/logits": 0.07771667838096619,
"step": 140
},
{
"epoch": 0.0011620527951078403,
"grad_norm": 4.4375,
"grad_norm_var": 18.514149729410807,
"learning_rate": 5e-05,
"loss": 0.4546,
"loss/crossentropy": 3.182520866394043,
"loss/dist_ce": 0.0,
"loss/hidden": 0.34765625,
"loss/idx": 0.0,
"loss/logits": 0.10692334175109863,
"step": 141
},
{
"epoch": 0.0011702943042930021,
"grad_norm": 4.34375,
"grad_norm_var": 17.60290501912435,
"learning_rate": 5e-05,
"loss": 0.2908,
"loss/crossentropy": 1.5368177890777588,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2431640625,
"loss/idx": 0.0,
"loss/logits": 0.04762953519821167,
"step": 142
},
{
"epoch": 0.001178535813478164,
"grad_norm": 4.5,
"grad_norm_var": 17.029766591389976,
"learning_rate": 5e-05,
"loss": 0.407,
"loss/crossentropy": 2.778043270111084,
"loss/dist_ce": 0.0,
"loss/hidden": 0.302734375,
"loss/idx": 0.0,
"loss/logits": 0.1042385995388031,
"step": 143
},
{
"epoch": 0.0011867773226633262,
"grad_norm": 4.15625,
"grad_norm_var": 16.600789133707682,
"learning_rate": 5e-05,
"loss": 0.3435,
"loss/crossentropy": 2.792048692703247,
"loss/dist_ce": 0.0,
"loss/hidden": 0.267578125,
"loss/idx": 0.0,
"loss/logits": 0.07588605582714081,
"step": 144
},
{
"epoch": 0.001195018831848488,
"grad_norm": 3.8125,
"grad_norm_var": 16.797792307535808,
"learning_rate": 5e-05,
"loss": 0.2385,
"loss/crossentropy": 1.401113510131836,
"loss/dist_ce": 0.0,
"loss/hidden": 0.205078125,
"loss/idx": 0.0,
"loss/logits": 0.03345421701669693,
"step": 145
},
{
"epoch": 0.0012032603410336502,
"grad_norm": 6.875,
"grad_norm_var": 12.726405588785807,
"learning_rate": 5e-05,
"loss": 0.3725,
"loss/crossentropy": 2.2165474891662598,
"loss/dist_ce": 0.0,
"loss/hidden": 0.296875,
"loss/idx": 0.0,
"loss/logits": 0.07561925053596497,
"step": 146
},
{
"epoch": 0.001211501850218812,
"grad_norm": 3.171875,
"grad_norm_var": 12.234430948893229,
"learning_rate": 5e-05,
"loss": 0.2506,
"loss/crossentropy": 2.589618444442749,
"loss/dist_ce": 0.0,
"loss/hidden": 0.19921875,
"loss/idx": 0.0,
"loss/logits": 0.05137525126338005,
"step": 147
},
{
"epoch": 0.0012197433594039742,
"grad_norm": 22.25,
"grad_norm_var": 29.706151326497395,
"learning_rate": 5e-05,
"loss": 0.542,
"loss/crossentropy": 1.4461145401000977,
"loss/dist_ce": 0.0,
"loss/hidden": 0.47265625,
"loss/idx": 0.0,
"loss/logits": 0.06938936561346054,
"step": 148
},
{
"epoch": 0.001227984868589136,
"grad_norm": 4.75,
"grad_norm_var": 28.819587198893228,
"learning_rate": 5e-05,
"loss": 0.3594,
"loss/crossentropy": 1.5630475282669067,
"loss/dist_ce": 0.0,
"loss/hidden": 0.28125,
"loss/idx": 0.0,
"loss/logits": 0.07812213897705078,
"step": 149
},
{
"epoch": 0.001236226377774298,
"grad_norm": 3.09375,
"grad_norm_var": 28.963407389322917,
"learning_rate": 5e-05,
"loss": 0.2805,
"loss/crossentropy": 1.3344874382019043,
"loss/dist_ce": 0.0,
"loss/hidden": 0.234375,
"loss/idx": 0.0,
"loss/logits": 0.04612912982702255,
"step": 150
},
{
"epoch": 0.00124446788695946,
"grad_norm": 6.34375,
"grad_norm_var": 24.164176432291665,
"learning_rate": 5e-05,
"loss": 0.3487,
"loss/crossentropy": 2.1057682037353516,
"loss/dist_ce": 0.0,
"loss/hidden": 0.28125,
"loss/idx": 0.0,
"loss/logits": 0.06741908937692642,
"step": 151
},
{
"epoch": 0.001252709396144622,
"grad_norm": 3.078125,
"grad_norm_var": 24.164176432291665,
"learning_rate": 5e-05,
"loss": 0.3282,
"loss/crossentropy": 2.7360680103302,
"loss/dist_ce": 0.0,
"loss/hidden": 0.248046875,
"loss/idx": 0.0,
"loss/logits": 0.0801510438323021,
"step": 152
},
{
"epoch": 0.001260950905329784,
"grad_norm": 3.265625,
"grad_norm_var": 24.076806640625,
"learning_rate": 5e-05,
"loss": 0.2896,
"loss/crossentropy": 2.9115164279937744,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2265625,
"loss/idx": 0.0,
"loss/logits": 0.06300797313451767,
"step": 153
},
{
"epoch": 0.001269192414514946,
"grad_norm": 3.109375,
"grad_norm_var": 23.841239420572915,
"learning_rate": 5e-05,
"loss": 0.2801,
"loss/crossentropy": 2.20858097076416,
"loss/dist_ce": 0.0,
"loss/hidden": 0.22265625,
"loss/idx": 0.0,
"loss/logits": 0.05741541087627411,
"step": 154
},
{
"epoch": 0.001277433923700108,
"grad_norm": 3.4375,
"grad_norm_var": 21.679227701822917,
"learning_rate": 5e-05,
"loss": 0.2616,
"loss/crossentropy": 2.4968795776367188,
"loss/dist_ce": 0.0,
"loss/hidden": 0.205078125,
"loss/idx": 0.0,
"loss/logits": 0.05649275332689285,
"step": 155
},
{
"epoch": 0.00128567543288527,
"grad_norm": 1.90625,
"grad_norm_var": 22.328641764322917,
"learning_rate": 5e-05,
"loss": 0.2142,
"loss/crossentropy": 1.6698881387710571,
"loss/dist_ce": 0.0,
"loss/hidden": 0.177734375,
"loss/idx": 0.0,
"loss/logits": 0.03645133972167969,
"step": 156
},
{
"epoch": 0.0012939169420704319,
"grad_norm": 2.84375,
"grad_norm_var": 22.640543619791668,
"learning_rate": 5e-05,
"loss": 0.2762,
"loss/crossentropy": 2.5742998123168945,
"loss/dist_ce": 0.0,
"loss/hidden": 0.208984375,
"loss/idx": 0.0,
"loss/logits": 0.06718096137046814,
"step": 157
},
{
"epoch": 0.001302158451255594,
"grad_norm": 13.0,
"grad_norm_var": 26.498661295572916,
"learning_rate": 5e-05,
"loss": 0.3361,
"loss/crossentropy": 2.4912259578704834,
"loss/dist_ce": 0.0,
"loss/hidden": 0.26953125,
"loss/idx": 0.0,
"loss/logits": 0.06653441488742828,
"step": 158
},
{
"epoch": 0.0013103999604407559,
"grad_norm": 2.28125,
"grad_norm_var": 27.131640625,
"learning_rate": 5e-05,
"loss": 0.244,
"loss/crossentropy": 1.5568723678588867,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1923828125,
"loss/idx": 0.0,
"loss/logits": 0.05163004621863365,
"step": 159
},
{
"epoch": 0.001318641469625918,
"grad_norm": 2.90625,
"grad_norm_var": 27.446744791666667,
"learning_rate": 5e-05,
"loss": 0.2633,
"loss/crossentropy": 2.150268793106079,
"loss/dist_ce": 0.0,
"loss/hidden": 0.212890625,
"loss/idx": 0.0,
"loss/logits": 0.050429798662662506,
"step": 160
},
{
"epoch": 0.0013268829788110799,
"grad_norm": 3.6875,
"grad_norm_var": 27.473893229166666,
"learning_rate": 5e-05,
"loss": 0.2778,
"loss/crossentropy": 1.7107495069503784,
"loss/dist_ce": 0.0,
"loss/hidden": 0.21875,
"loss/idx": 0.0,
"loss/logits": 0.059024274349212646,
"step": 161
},
{
"epoch": 0.001335124487996242,
"grad_norm": 5.46875,
"grad_norm_var": 27.316239420572916,
"learning_rate": 5e-05,
"loss": 0.3054,
"loss/crossentropy": 2.609410285949707,
"loss/dist_ce": 0.0,
"loss/hidden": 0.236328125,
"loss/idx": 0.0,
"loss/logits": 0.06908264756202698,
"step": 162
},
{
"epoch": 0.0013433659971814039,
"grad_norm": 2.1875,
"grad_norm_var": 27.654426066080728,
"learning_rate": 5e-05,
"loss": 0.1832,
"loss/crossentropy": 1.302159309387207,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1552734375,
"loss/idx": 0.0,
"loss/logits": 0.027962597087025642,
"step": 163
},
{
"epoch": 0.0013516075063665658,
"grad_norm": 4.96875,
"grad_norm_var": 7.092438761393229,
"learning_rate": 5e-05,
"loss": 0.275,
"loss/crossentropy": 1.3545722961425781,
"loss/dist_ce": 0.0,
"loss/hidden": 0.23828125,
"loss/idx": 0.0,
"loss/logits": 0.03673800453543663,
"step": 164
},
{
"epoch": 0.0013598490155517279,
"grad_norm": 5.625,
"grad_norm_var": 7.210814412434896,
"learning_rate": 5e-05,
"loss": 0.3255,
"loss/crossentropy": 2.3857431411743164,
"loss/dist_ce": 0.0,
"loss/hidden": 0.267578125,
"loss/idx": 0.0,
"loss/logits": 0.05789117142558098,
"step": 165
},
{
"epoch": 0.0013680905247368898,
"grad_norm": 2.3125,
"grad_norm_var": 7.364216105143229,
"learning_rate": 5e-05,
"loss": 0.224,
"loss/crossentropy": 1.6262630224227905,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1865234375,
"loss/idx": 0.0,
"loss/logits": 0.03748723864555359,
"step": 166
},
{
"epoch": 0.0013763320339220519,
"grad_norm": 3.515625,
"grad_norm_var": 7.037398274739584,
"learning_rate": 5e-05,
"loss": 0.3005,
"loss/crossentropy": 2.802839756011963,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2421875,
"loss/idx": 0.0,
"loss/logits": 0.05828278884291649,
"step": 167
},
{
"epoch": 0.0013845735431072138,
"grad_norm": 5.9375,
"grad_norm_var": 7.206615193684896,
"learning_rate": 5e-05,
"loss": 0.3168,
"loss/crossentropy": 2.6197855472564697,
"loss/dist_ce": 0.0,
"loss/hidden": 0.24609375,
"loss/idx": 0.0,
"loss/logits": 0.07069416344165802,
"step": 168
},
{
"epoch": 0.0013928150522923759,
"grad_norm": 2.609375,
"grad_norm_var": 7.311205037434896,
"learning_rate": 5e-05,
"loss": 0.2188,
"loss/crossentropy": 2.0093469619750977,
"loss/dist_ce": 0.0,
"loss/hidden": 0.177734375,
"loss/idx": 0.0,
"loss/logits": 0.04102495685219765,
"step": 169
},
{
"epoch": 0.0014010565614775378,
"grad_norm": 9.0625,
"grad_norm_var": 8.730110677083333,
"learning_rate": 5e-05,
"loss": 0.4723,
"loss/crossentropy": 0.45764070749282837,
"loss/dist_ce": 0.0,
"loss/hidden": 0.421875,
"loss/idx": 0.0,
"loss/logits": 0.05042431876063347,
"step": 170
},
{
"epoch": 0.0014092980706626997,
"grad_norm": 10.6875,
"grad_norm_var": 11.003287760416667,
"learning_rate": 5e-05,
"loss": 0.2967,
"loss/crossentropy": 1.4506618976593018,
"loss/dist_ce": 0.0,
"loss/hidden": 0.25,
"loss/idx": 0.0,
"loss/logits": 0.04666414484381676,
"step": 171
},
{
"epoch": 0.0014175395798478618,
"grad_norm": 2.734375,
"grad_norm_var": 10.711449178059896,
"learning_rate": 5e-05,
"loss": 0.1833,
"loss/crossentropy": 1.2239363193511963,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1494140625,
"loss/idx": 0.0,
"loss/logits": 0.03392494469881058,
"step": 172
},
{
"epoch": 0.0014257810890330237,
"grad_norm": 9.25,
"grad_norm_var": 11.44383036295573,
"learning_rate": 5e-05,
"loss": 0.4527,
"loss/crossentropy": 2.3572747707366943,
"loss/dist_ce": 0.0,
"loss/hidden": 0.365234375,
"loss/idx": 0.0,
"loss/logits": 0.08745455741882324,
"step": 173
},
{
"epoch": 0.0014340225982181858,
"grad_norm": 3.328125,
"grad_norm_var": 7.476220703125,
"learning_rate": 5e-05,
"loss": 0.3071,
"loss/crossentropy": 2.6207613945007324,
"loss/dist_ce": 0.0,
"loss/hidden": 0.23828125,
"loss/idx": 0.0,
"loss/logits": 0.06882129609584808,
"step": 174
},
{
"epoch": 0.0014422641074033477,
"grad_norm": 12.8125,
"grad_norm_var": 10.892020670572917,
"learning_rate": 5e-05,
"loss": 0.3909,
"loss/crossentropy": 2.3904902935028076,
"loss/dist_ce": 0.0,
"loss/hidden": 0.3046875,
"loss/idx": 0.0,
"loss/logits": 0.08622868359088898,
"step": 175
},
{
"epoch": 0.0014505056165885098,
"grad_norm": 6.78125,
"grad_norm_var": 10.519657389322917,
"learning_rate": 5e-05,
"loss": 0.4847,
"loss/crossentropy": 2.4582583904266357,
"loss/dist_ce": 0.0,
"loss/hidden": 0.40625,
"loss/idx": 0.0,
"loss/logits": 0.07843705266714096,
"step": 176
},
{
"epoch": 0.0014587471257736717,
"grad_norm": 6.125,
"grad_norm_var": 10.241630045572917,
"learning_rate": 5e-05,
"loss": 0.3267,
"loss/crossentropy": 2.8331282138824463,
"loss/dist_ce": 0.0,
"loss/hidden": 0.248046875,
"loss/idx": 0.0,
"loss/logits": 0.07867051661014557,
"step": 177
},
{
"epoch": 0.0014669886349588336,
"grad_norm": 4.21875,
"grad_norm_var": 10.400809733072917,
"learning_rate": 5e-05,
"loss": 0.4616,
"loss/crossentropy": 2.921239137649536,
"loss/dist_ce": 0.0,
"loss/hidden": 0.35546875,
"loss/idx": 0.0,
"loss/logits": 0.10613523423671722,
"step": 178
},
{
"epoch": 0.0014752301441439957,
"grad_norm": 2.96875,
"grad_norm_var": 10.066845703125,
"learning_rate": 5e-05,
"loss": 0.2874,
"loss/crossentropy": 2.7095119953155518,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2333984375,
"loss/idx": 0.0,
"loss/logits": 0.05400337651371956,
"step": 179
},
{
"epoch": 0.0014834716533291576,
"grad_norm": 2.71875,
"grad_norm_var": 10.635205078125,
"learning_rate": 5e-05,
"loss": 0.3012,
"loss/crossentropy": 2.5020012855529785,
"loss/dist_ce": 0.0,
"loss/hidden": 0.23046875,
"loss/idx": 0.0,
"loss/logits": 0.07070466130971909,
"step": 180
},
{
"epoch": 0.0014917131625143197,
"grad_norm": 2.375,
"grad_norm_var": 11.313981119791666,
"learning_rate": 5e-05,
"loss": 0.2055,
"loss/crossentropy": 1.2848535776138306,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1767578125,
"loss/idx": 0.0,
"loss/logits": 0.02870912477374077,
"step": 181
},
{
"epoch": 0.0014999546716994816,
"grad_norm": 4.4375,
"grad_norm_var": 10.703043619791666,
"learning_rate": 5e-05,
"loss": 0.2174,
"loss/crossentropy": 0.40366628766059875,
"loss/dist_ce": 0.0,
"loss/hidden": 0.201171875,
"loss/idx": 0.0,
"loss/logits": 0.016210440546274185,
"step": 182
},
{
"epoch": 0.0015081961808846437,
"grad_norm": 6.28125,
"grad_norm_var": 10.41333719889323,
"learning_rate": 5e-05,
"loss": 0.3403,
"loss/crossentropy": 2.8000519275665283,
"loss/dist_ce": 0.0,
"loss/hidden": 0.263671875,
"loss/idx": 0.0,
"loss/logits": 0.07665810734033585,
"step": 183
},
{
"epoch": 0.0015164376900698056,
"grad_norm": 2.15625,
"grad_norm_var": 11.22276102701823,
"learning_rate": 5e-05,
"loss": 0.2103,
"loss/crossentropy": 1.5652306079864502,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1708984375,
"loss/idx": 0.0,
"loss/logits": 0.039365194737911224,
"step": 184
},
{
"epoch": 0.0015246791992549675,
"grad_norm": 50.5,
"grad_norm_var": 135.891162109375,
"learning_rate": 5e-05,
"loss": 0.5337,
"loss/crossentropy": 1.4967753887176514,
"loss/dist_ce": 0.0,
"loss/hidden": 0.4453125,
"loss/idx": 0.0,
"loss/logits": 0.08833958208560944,
"step": 185
},
{
"epoch": 0.0015329207084401296,
"grad_norm": 6.53125,
"grad_norm_var": 136.11099853515626,
"learning_rate": 5e-05,
"loss": 0.3785,
"loss/crossentropy": 2.4065871238708496,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2890625,
"loss/idx": 0.0,
"loss/logits": 0.08948490023612976,
"step": 186
},
{
"epoch": 0.0015411622176252915,
"grad_norm": 6.125,
"grad_norm_var": 136.0016886393229,
"learning_rate": 5e-05,
"loss": 0.315,
"loss/crossentropy": 2.2277615070343018,
"loss/dist_ce": 0.0,
"loss/hidden": 0.244140625,
"loss/idx": 0.0,
"loss/logits": 0.07082026451826096,
"step": 187
},
{
"epoch": 0.0015494037268104536,
"grad_norm": 2.578125,
"grad_norm_var": 136.11466471354166,
"learning_rate": 5e-05,
"loss": 0.2275,
"loss/crossentropy": 2.0500736236572266,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1806640625,
"loss/idx": 0.0,
"loss/logits": 0.04683098569512367,
"step": 188
},
{
"epoch": 0.0015576452359956155,
"grad_norm": 2.203125,
"grad_norm_var": 138.1135732014974,
"learning_rate": 5e-05,
"loss": 0.1763,
"loss/crossentropy": 0.851466953754425,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1552734375,
"loss/idx": 0.0,
"loss/logits": 0.021058566868305206,
"step": 189
},
{
"epoch": 0.0015658867451807776,
"grad_norm": 10.9375,
"grad_norm_var": 137.36402994791666,
"learning_rate": 5e-05,
"loss": 0.4211,
"loss/crossentropy": 1.5812166929244995,
"loss/dist_ce": 0.0,
"loss/hidden": 0.33984375,
"loss/idx": 0.0,
"loss/logits": 0.08130454272031784,
"step": 190
},
{
"epoch": 0.0015741282543659395,
"grad_norm": 5.75,
"grad_norm_var": 136.052685546875,
"learning_rate": 5e-05,
"loss": 0.2656,
"loss/crossentropy": 1.3986968994140625,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2216796875,
"loss/idx": 0.0,
"loss/logits": 0.04394121095538139,
"step": 191
},
{
"epoch": 0.0015823697635511014,
"grad_norm": 2.203125,
"grad_norm_var": 137.9039052327474,
"learning_rate": 5e-05,
"loss": 0.2049,
"loss/crossentropy": 2.949207067489624,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1591796875,
"loss/idx": 0.0,
"loss/logits": 0.04576685652136803,
"step": 192
},
{
"epoch": 0.0015906112727362635,
"grad_norm": 206.0,
"grad_norm_var": 2601.2852040608723,
"learning_rate": 5e-05,
"loss": 1.1479,
"loss/crossentropy": 1.637980580329895,
"loss/dist_ce": 0.0,
"loss/hidden": 1.015625,
"loss/idx": 0.0,
"loss/logits": 0.13231301307678223,
"step": 193
},
{
"epoch": 0.0015988527819214254,
"grad_norm": 4.28125,
"grad_norm_var": 2601.1549875895184,
"learning_rate": 5e-05,
"loss": 0.294,
"loss/crossentropy": 2.0168232917785645,
"loss/dist_ce": 0.0,
"loss/hidden": 0.23828125,
"loss/idx": 0.0,
"loss/logits": 0.055670544505119324,
"step": 194
},
{
"epoch": 0.0016070942911065875,
"grad_norm": 5.4375,
"grad_norm_var": 2595.9699696858725,
"learning_rate": 5e-05,
"loss": 0.3063,
"loss/crossentropy": 2.8578052520751953,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2275390625,
"loss/idx": 0.0,
"loss/logits": 0.07876630872488022,
"step": 195
},
{
"epoch": 0.0016153358002917494,
"grad_norm": 2.5,
"grad_norm_var": 2596.477936808268,
"learning_rate": 5e-05,
"loss": 0.1944,
"loss/crossentropy": 1.4366533756256104,
"loss/dist_ce": 0.0,
"loss/hidden": 0.166015625,
"loss/idx": 0.0,
"loss/logits": 0.028374146670103073,
"step": 196
},
{
"epoch": 0.0016235773094769115,
"grad_norm": 2.09375,
"grad_norm_var": 2597.144513956706,
"learning_rate": 5e-05,
"loss": 0.2272,
"loss/crossentropy": 1.4764188528060913,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1845703125,
"loss/idx": 0.0,
"loss/logits": 0.04259010776877403,
"step": 197
},
{
"epoch": 0.0016318188186620734,
"grad_norm": 8.25,
"grad_norm_var": 2590.14152730306,
"learning_rate": 5e-05,
"loss": 0.2137,
"loss/crossentropy": 0.44381940364837646,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1884765625,
"loss/idx": 0.0,
"loss/logits": 0.025243356823921204,
"step": 198
},
{
"epoch": 0.0016400603278472353,
"grad_norm": 3.703125,
"grad_norm_var": 2595.355013020833,
"learning_rate": 5e-05,
"loss": 0.2982,
"loss/crossentropy": 2.862804889678955,
"loss/dist_ce": 0.0,
"loss/hidden": 0.23046875,
"loss/idx": 0.0,
"loss/logits": 0.06769528239965439,
"step": 199
},
{
"epoch": 0.0016483018370323974,
"grad_norm": 30.625,
"grad_norm_var": 2577.980920410156,
"learning_rate": 5e-05,
"loss": 0.5157,
"loss/crossentropy": 2.726966142654419,
"loss/dist_ce": 0.0,
"loss/hidden": 0.4296875,
"loss/idx": 0.0,
"loss/logits": 0.08603046834468842,
"step": 200
},
{
"epoch": 0.0016565433462175593,
"grad_norm": 8.4375,
"grad_norm_var": 2527.9221638997396,
"learning_rate": 5e-05,
"loss": 0.4416,
"loss/crossentropy": 1.4357587099075317,
"loss/dist_ce": 0.0,
"loss/hidden": 0.37109375,
"loss/idx": 0.0,
"loss/logits": 0.07054366171360016,
"step": 201
},
{
"epoch": 0.0016647848554027214,
"grad_norm": 3.515625,
"grad_norm_var": 2533.595897420247,
"learning_rate": 5e-05,
"loss": 0.2845,
"loss/crossentropy": 2.2265231609344482,
"loss/dist_ce": 0.0,
"loss/hidden": 0.21875,
"loss/idx": 0.0,
"loss/logits": 0.06571735441684723,
"step": 202
},
{
"epoch": 0.0016730263645878833,
"grad_norm": 8.1875,
"grad_norm_var": 2530.3101308186847,
"learning_rate": 5e-05,
"loss": 0.3526,
"loss/crossentropy": 2.462019681930542,
"loss/dist_ce": 0.0,
"loss/hidden": 0.28125,
"loss/idx": 0.0,
"loss/logits": 0.0713062509894371,
"step": 203
},
{
"epoch": 0.0016812678737730454,
"grad_norm": 2.046875,
"grad_norm_var": 2531.50295308431,
"learning_rate": 5e-05,
"loss": 0.2505,
"loss/crossentropy": 2.9555587768554688,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1923828125,
"loss/idx": 0.0,
"loss/logits": 0.05816446244716644,
"step": 204
},
{
"epoch": 0.0016895093829582073,
"grad_norm": 4.3125,
"grad_norm_var": 2527.0187459309896,
"learning_rate": 5e-05,
"loss": 0.3139,
"loss/crossentropy": 1.3578487634658813,
"loss/dist_ce": 0.0,
"loss/hidden": 0.267578125,
"loss/idx": 0.0,
"loss/logits": 0.04633466899394989,
"step": 205
},
{
"epoch": 0.0016977508921433692,
"grad_norm": 5.03125,
"grad_norm_var": 2535.7589192708333,
"learning_rate": 5e-05,
"loss": 0.2753,
"loss/crossentropy": 2.804027557373047,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2109375,
"loss/idx": 0.0,
"loss/logits": 0.06441053748130798,
"step": 206
},
{
"epoch": 0.0017059924013285313,
"grad_norm": 2.859375,
"grad_norm_var": 2541.3487782796224,
"learning_rate": 5e-05,
"loss": 0.2116,
"loss/crossentropy": 1.8921546936035156,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1767578125,
"loss/idx": 0.0,
"loss/logits": 0.034831516444683075,
"step": 207
},
{
"epoch": 0.0017142339105136932,
"grad_norm": 1.765625,
"grad_norm_var": 2542.324095662435,
"learning_rate": 5e-05,
"loss": 0.2004,
"loss/crossentropy": 2.6469180583953857,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1572265625,
"loss/idx": 0.0,
"loss/logits": 0.04312657564878464,
"step": 208
},
{
"epoch": 0.0017224754196988553,
"grad_norm": 5.65625,
"grad_norm_var": 47.41833394368489,
"learning_rate": 5e-05,
"loss": 0.3427,
"loss/crossentropy": 1.994149088859558,
"loss/dist_ce": 0.0,
"loss/hidden": 0.27734375,
"loss/idx": 0.0,
"loss/logits": 0.06540031731128693,
"step": 209
},
{
"epoch": 0.0017307169288840172,
"grad_norm": 3.671875,
"grad_norm_var": 47.59491780598958,
"learning_rate": 5e-05,
"loss": 0.2391,
"loss/crossentropy": 1.5324124097824097,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1953125,
"loss/idx": 0.0,
"loss/logits": 0.04377663880586624,
"step": 210
},
{
"epoch": 0.0017389584380691793,
"grad_norm": 7.125,
"grad_norm_var": 47.61689046223958,
"learning_rate": 5e-05,
"loss": 0.5363,
"loss/crossentropy": 2.5077903270721436,
"loss/dist_ce": 0.0,
"loss/hidden": 0.4453125,
"loss/idx": 0.0,
"loss/logits": 0.09098894894123077,
"step": 211
},
{
"epoch": 0.0017471999472543412,
"grad_norm": 2.5,
"grad_norm_var": 47.61689046223958,
"learning_rate": 5e-05,
"loss": 0.2245,
"loss/crossentropy": 1.6434502601623535,
"loss/dist_ce": 0.0,
"loss/hidden": 0.181640625,
"loss/idx": 0.0,
"loss/logits": 0.042850345373153687,
"step": 212
},
{
"epoch": 0.0017554414564395031,
"grad_norm": 3.0625,
"grad_norm_var": 47.140462239583336,
"learning_rate": 5e-05,
"loss": 0.2987,
"loss/crossentropy": 2.1260766983032227,
"loss/dist_ce": 0.0,
"loss/hidden": 0.228515625,
"loss/idx": 0.0,
"loss/logits": 0.07022828608751297,
"step": 213
},
{
"epoch": 0.0017636829656246652,
"grad_norm": 2.703125,
"grad_norm_var": 47.61895243326823,
"learning_rate": 5e-05,
"loss": 0.3036,
"loss/crossentropy": 2.342567205429077,
"loss/dist_ce": 0.0,
"loss/hidden": 0.24609375,
"loss/idx": 0.0,
"loss/logits": 0.057552557438611984,
"step": 214
},
{
"epoch": 0.0017719244748098271,
"grad_norm": 3.5625,
"grad_norm_var": 47.66232096354167,
"learning_rate": 5e-05,
"loss": 0.2831,
"loss/crossentropy": 2.4342286586761475,
"loss/dist_ce": 0.0,
"loss/hidden": 0.21875,
"loss/idx": 0.0,
"loss/logits": 0.06433624029159546,
"step": 215
},
{
"epoch": 0.0017801659839949892,
"grad_norm": 4.59375,
"grad_norm_var": 4.341304524739583,
"learning_rate": 5e-05,
"loss": 0.2732,
"loss/crossentropy": 1.6944836378097534,
"loss/dist_ce": 0.0,
"loss/hidden": 0.220703125,
"loss/idx": 0.0,
"loss/logits": 0.05249807611107826,
"step": 216
},
{
"epoch": 0.0017884074931801511,
"grad_norm": 3.015625,
"grad_norm_var": 3.197980753580729,
"learning_rate": 5e-05,
"loss": 0.2028,
"loss/crossentropy": 1.4322035312652588,
"loss/dist_ce": 0.0,
"loss/hidden": 0.173828125,
"loss/idx": 0.0,
"loss/logits": 0.028966199606657028,
"step": 217
},
{
"epoch": 0.0017966490023653132,
"grad_norm": 8.1875,
"grad_norm_var": 4.275614420572917,
"learning_rate": 5e-05,
"loss": 0.4244,
"loss/crossentropy": 2.7989346981048584,
"loss/dist_ce": 0.0,
"loss/hidden": 0.3359375,
"loss/idx": 0.0,
"loss/logits": 0.08842961490154266,
"step": 218
},
{
"epoch": 0.0018048905115504751,
"grad_norm": 2.359375,
"grad_norm_var": 3.3524485270182294,
"learning_rate": 5e-05,
"loss": 0.1763,
"loss/crossentropy": 0.49160024523735046,
"loss/dist_ce": 0.0,
"loss/hidden": 0.15625,
"loss/idx": 0.0,
"loss/logits": 0.020072361454367638,
"step": 219
},
{
"epoch": 0.001813132020735637,
"grad_norm": 2.03125,
"grad_norm_var": 3.356331380208333,
"learning_rate": 5e-05,
"loss": 0.1975,
"loss/crossentropy": 0.9580312967300415,
"loss/dist_ce": 0.0,
"loss/hidden": 0.169921875,
"loss/idx": 0.0,
"loss/logits": 0.027562592178583145,
"step": 220
},
{
"epoch": 0.0018213735299207991,
"grad_norm": 4.46875,
"grad_norm_var": 3.366402180989583,
"learning_rate": 5e-05,
"loss": 0.2313,
"loss/crossentropy": 2.2378625869750977,
"loss/dist_ce": 0.0,
"loss/hidden": 0.181640625,
"loss/idx": 0.0,
"loss/logits": 0.04968283697962761,
"step": 221
},
{
"epoch": 0.001829615039105961,
"grad_norm": 2.609375,
"grad_norm_var": 3.3716054280598957,
"learning_rate": 5e-05,
"loss": 0.2638,
"loss/crossentropy": 1.2911431789398193,
"loss/dist_ce": 0.0,
"loss/hidden": 0.21484375,
"loss/idx": 0.0,
"loss/logits": 0.04894676432013512,
"step": 222
},
{
"epoch": 0.0018378565482911231,
"grad_norm": 4.125,
"grad_norm_var": 3.3196126302083333,
"learning_rate": 5e-05,
"loss": 0.2351,
"loss/crossentropy": 2.6005423069000244,
"loss/dist_ce": 0.0,
"loss/hidden": 0.185546875,
"loss/idx": 0.0,
"loss/logits": 0.04953521490097046,
"step": 223
},
{
"epoch": 0.001846098057476285,
"grad_norm": 2.109375,
"grad_norm_var": 3.2319295247395834,
"learning_rate": 5e-05,
"loss": 0.1785,
"loss/crossentropy": 1.635225772857666,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1455078125,
"loss/idx": 0.0,
"loss/logits": 0.03304152935743332,
"step": 224
},
{
"epoch": 0.0018543395666614471,
"grad_norm": 6.4375,
"grad_norm_var": 3.457047526041667,
"learning_rate": 5e-05,
"loss": 0.5431,
"loss/crossentropy": 2.507209062576294,
"loss/dist_ce": 0.0,
"loss/hidden": 0.404296875,
"loss/idx": 0.0,
"loss/logits": 0.13885299861431122,
"step": 225
},
{
"epoch": 0.001862581075846609,
"grad_norm": 2.984375,
"grad_norm_var": 3.508430989583333,
"learning_rate": 5e-05,
"loss": 0.181,
"loss/crossentropy": 0.42544418573379517,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1611328125,
"loss/idx": 0.0,
"loss/logits": 0.019888322800397873,
"step": 226
},
{
"epoch": 0.001870822585031771,
"grad_norm": 3.140625,
"grad_norm_var": 2.7699208577473957,
"learning_rate": 5e-05,
"loss": 0.2789,
"loss/crossentropy": 2.700981378555298,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2138671875,
"loss/idx": 0.0,
"loss/logits": 0.06499424576759338,
"step": 227
},
{
"epoch": 0.001879064094216933,
"grad_norm": 5.3125,
"grad_norm_var": 2.8449940999348957,
"learning_rate": 5e-05,
"loss": 0.2954,
"loss/crossentropy": 1.6264232397079468,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2451171875,
"loss/idx": 0.0,
"loss/logits": 0.050260186195373535,
"step": 228
},
{
"epoch": 0.001887305603402095,
"grad_norm": 5.96875,
"grad_norm_var": 3.089452107747396,
"learning_rate": 5e-05,
"loss": 0.1884,
"loss/crossentropy": 1.3441599607467651,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1669921875,
"loss/idx": 0.0,
"loss/logits": 0.021441757678985596,
"step": 229
},
{
"epoch": 0.001895547112587257,
"grad_norm": 2.171875,
"grad_norm_var": 3.197223917643229,
"learning_rate": 5e-05,
"loss": 0.1824,
"loss/crossentropy": 0.4492271840572357,
"loss/dist_ce": 0.0,
"loss/hidden": 0.162109375,
"loss/idx": 0.0,
"loss/logits": 0.02026466839015484,
"step": 230
},
{
"epoch": 0.001903788621772419,
"grad_norm": 3.21875,
"grad_norm_var": 3.222020467122396,
"learning_rate": 5e-05,
"loss": 0.2551,
"loss/crossentropy": 2.23905873298645,
"loss/dist_ce": 0.0,
"loss/hidden": 0.20703125,
"loss/idx": 0.0,
"loss/logits": 0.04803081601858139,
"step": 231
},
{
"epoch": 0.001912030130957581,
"grad_norm": 2.1875,
"grad_norm_var": 3.368024698893229,
"learning_rate": 5e-05,
"loss": 0.216,
"loss/crossentropy": 1.9740031957626343,
"loss/dist_ce": 0.0,
"loss/hidden": 0.169921875,
"loss/idx": 0.0,
"loss/logits": 0.04604914411902428,
"step": 232
},
{
"epoch": 0.001920271640142743,
"grad_norm": 2.125,
"grad_norm_var": 3.5072428385416665,
"learning_rate": 5e-05,
"loss": 0.2327,
"loss/crossentropy": 2.738755226135254,
"loss/dist_ce": 0.0,
"loss/hidden": 0.18359375,
"loss/idx": 0.0,
"loss/logits": 0.04907117411494255,
"step": 233
},
{
"epoch": 0.0019285131493279048,
"grad_norm": 4.21875,
"grad_norm_var": 2.1248982747395835,
"learning_rate": 5e-05,
"loss": 0.4989,
"loss/crossentropy": 2.8038580417633057,
"loss/dist_ce": 0.0,
"loss/hidden": 0.390625,
"loss/idx": 0.0,
"loss/logits": 0.10827778279781342,
"step": 234
},
{
"epoch": 0.001936754658513067,
"grad_norm": 2.84375,
"grad_norm_var": 2.068040974934896,
"learning_rate": 5e-05,
"loss": 0.3216,
"loss/crossentropy": 2.015542984008789,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2734375,
"loss/idx": 0.0,
"loss/logits": 0.04818400368094444,
"step": 235
},
{
"epoch": 0.0019449961676982288,
"grad_norm": 3.203125,
"grad_norm_var": 1.9248372395833333,
"learning_rate": 5e-05,
"loss": 0.2473,
"loss/crossentropy": 1.5457327365875244,
"loss/dist_ce": 0.0,
"loss/hidden": 0.205078125,
"loss/idx": 0.0,
"loss/logits": 0.04223756492137909,
"step": 236
},
{
"epoch": 0.001953237676883391,
"grad_norm": 3.15625,
"grad_norm_var": 1.8752766927083333,
"learning_rate": 5e-05,
"loss": 0.3016,
"loss/crossentropy": 2.3469016551971436,
"loss/dist_ce": 0.0,
"loss/hidden": 0.24609375,
"loss/idx": 0.0,
"loss/logits": 0.05547412484884262,
"step": 237
},
{
"epoch": 0.001961479186068553,
"grad_norm": 3.578125,
"grad_norm_var": 1.8204060872395833,
"learning_rate": 5e-05,
"loss": 0.2486,
"loss/crossentropy": 2.7459280490875244,
"loss/dist_ce": 0.0,
"loss/hidden": 0.197265625,
"loss/idx": 0.0,
"loss/logits": 0.051300592720508575,
"step": 238
},
{
"epoch": 0.0019697206952537147,
"grad_norm": 1.9765625,
"grad_norm_var": 1.9438433329264322,
"learning_rate": 5e-05,
"loss": 0.2187,
"loss/crossentropy": 2.024442434310913,
"loss/dist_ce": 0.0,
"loss/hidden": 0.171875,
"loss/idx": 0.0,
"loss/logits": 0.04683014005422592,
"step": 239
},
{
"epoch": 0.001977962204438877,
"grad_norm": 3.140625,
"grad_norm_var": 1.8308489481608072,
"learning_rate": 5e-05,
"loss": 0.2406,
"loss/crossentropy": 1.520363211631775,
"loss/dist_ce": 0.0,
"loss/hidden": 0.201171875,
"loss/idx": 0.0,
"loss/logits": 0.039388738572597504,
"step": 240
},
{
"epoch": 0.001986203713624039,
"grad_norm": 2.609375,
"grad_norm_var": 1.2366920471191407,
"learning_rate": 5e-05,
"loss": 0.1882,
"loss/crossentropy": 2.6319429874420166,
"loss/dist_ce": 0.0,
"loss/hidden": 0.15234375,
"loss/idx": 0.0,
"loss/logits": 0.035828400403261185,
"step": 241
},
{
"epoch": 0.001994445222809201,
"grad_norm": 9.3125,
"grad_norm_var": 3.5240455627441407,
"learning_rate": 5e-05,
"loss": 0.2554,
"loss/crossentropy": 1.4544413089752197,
"loss/dist_ce": 0.0,
"loss/hidden": 0.21875,
"loss/idx": 0.0,
"loss/logits": 0.036612022668123245,
"step": 242
},
{
"epoch": 0.0020026867319943627,
"grad_norm": 2.96875,
"grad_norm_var": 3.5372271219889324,
"learning_rate": 5e-05,
"loss": 0.2419,
"loss/crossentropy": 2.548147201538086,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1884765625,
"loss/idx": 0.0,
"loss/logits": 0.053470924496650696,
"step": 243
},
{
"epoch": 0.0020109282411795246,
"grad_norm": 11.8125,
"grad_norm_var": 7.640775299072265,
"learning_rate": 5e-05,
"loss": 0.3306,
"loss/crossentropy": 0.6415009498596191,
"loss/dist_ce": 0.0,
"loss/hidden": 0.294921875,
"loss/idx": 0.0,
"loss/logits": 0.035646334290504456,
"step": 244
},
{
"epoch": 0.002019169750364687,
"grad_norm": 3.203125,
"grad_norm_var": 7.404184722900391,
"learning_rate": 5e-05,
"loss": 0.2558,
"loss/crossentropy": 2.516376495361328,
"loss/dist_ce": 0.0,
"loss/hidden": 0.203125,
"loss/idx": 0.0,
"loss/logits": 0.052696891129016876,
"step": 245
},
{
"epoch": 0.002027411259549849,
"grad_norm": 2.53125,
"grad_norm_var": 7.3314674377441404,
"learning_rate": 5e-05,
"loss": 0.1816,
"loss/crossentropy": 0.31695130467414856,
"loss/dist_ce": 0.0,
"loss/hidden": 0.166015625,
"loss/idx": 0.0,
"loss/logits": 0.015581747516989708,
"step": 246
},
{
"epoch": 0.0020356527687350108,
"grad_norm": 2.515625,
"grad_norm_var": 7.4243934631347654,
"learning_rate": 5e-05,
"loss": 0.2397,
"loss/crossentropy": 1.937793493270874,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1875,
"loss/idx": 0.0,
"loss/logits": 0.05216747149825096,
"step": 247
},
{
"epoch": 0.0020438942779201726,
"grad_norm": 2.46875,
"grad_norm_var": 7.367502593994141,
"learning_rate": 5e-05,
"loss": 0.2293,
"loss/crossentropy": 2.4479126930236816,
"loss/dist_ce": 0.0,
"loss/hidden": 0.181640625,
"loss/idx": 0.0,
"loss/logits": 0.04766194522380829,
"step": 248
},
{
"epoch": 0.0020521357871053345,
"grad_norm": 2.25,
"grad_norm_var": 7.339662424723307,
"learning_rate": 5e-05,
"loss": 0.1789,
"loss/crossentropy": 1.441886067390442,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1484375,
"loss/idx": 0.0,
"loss/logits": 0.030450304970145226,
"step": 249
},
{
"epoch": 0.002060377296290497,
"grad_norm": 6.25,
"grad_norm_var": 7.694205474853516,
"learning_rate": 5e-05,
"loss": 0.2986,
"loss/crossentropy": 2.495968818664551,
"loss/dist_ce": 0.0,
"loss/hidden": 0.234375,
"loss/idx": 0.0,
"loss/logits": 0.06425687670707703,
"step": 250
},
{
"epoch": 0.0020686188054756588,
"grad_norm": 1.765625,
"grad_norm_var": 7.931449127197266,
"learning_rate": 5e-05,
"loss": 0.1676,
"loss/crossentropy": 1.5723477602005005,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1416015625,
"loss/idx": 0.0,
"loss/logits": 0.025984089821577072,
"step": 251
},
{
"epoch": 0.0020768603146608207,
"grad_norm": 5.3125,
"grad_norm_var": 8.00752944946289,
"learning_rate": 5e-05,
"loss": 0.4023,
"loss/crossentropy": 1.5475258827209473,
"loss/dist_ce": 0.0,
"loss/hidden": 0.34375,
"loss/idx": 0.0,
"loss/logits": 0.05857189744710922,
"step": 252
},
{
"epoch": 0.0020851018238459825,
"grad_norm": 4.96875,
"grad_norm_var": 7.996083323160807,
"learning_rate": 5e-05,
"loss": 0.3194,
"loss/crossentropy": 2.286716938018799,
"loss/dist_ce": 0.0,
"loss/hidden": 0.244140625,
"loss/idx": 0.0,
"loss/logits": 0.07523184269666672,
"step": 253
},
{
"epoch": 0.002093343333031145,
"grad_norm": 5.5,
"grad_norm_var": 8.076161448160807,
"learning_rate": 5e-05,
"loss": 0.2753,
"loss/crossentropy": 1.5914890766143799,
"loss/dist_ce": 0.0,
"loss/hidden": 0.23046875,
"loss/idx": 0.0,
"loss/logits": 0.044802576303482056,
"step": 254
},
{
"epoch": 0.0021015848422163068,
"grad_norm": 3.34375,
"grad_norm_var": 7.771882120768229,
"learning_rate": 5e-05,
"loss": 0.2477,
"loss/crossentropy": 2.1449875831604004,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1962890625,
"loss/idx": 0.0,
"loss/logits": 0.0514422208070755,
"step": 255
},
{
"epoch": 0.0021098263514014687,
"grad_norm": 9.3125,
"grad_norm_var": 9.1392578125,
"learning_rate": 5e-05,
"loss": 0.5091,
"loss/crossentropy": 2.605140447616577,
"loss/dist_ce": 0.0,
"loss/hidden": 0.408203125,
"loss/idx": 0.0,
"loss/logits": 0.10090796649456024,
"step": 256
},
{
"epoch": 0.0021180678605866306,
"grad_norm": 13.75,
"grad_norm_var": 13.705028279622395,
"learning_rate": 5e-05,
"loss": 0.3277,
"loss/crossentropy": 2.162487745285034,
"loss/dist_ce": 0.0,
"loss/hidden": 0.26953125,
"loss/idx": 0.0,
"loss/logits": 0.058200109750032425,
"step": 257
},
{
"epoch": 0.0021263093697717925,
"grad_norm": 3.140625,
"grad_norm_var": 12.910640462239583,
"learning_rate": 5e-05,
"loss": 0.2509,
"loss/crossentropy": 2.1336512565612793,
"loss/dist_ce": 0.0,
"loss/hidden": 0.197265625,
"loss/idx": 0.0,
"loss/logits": 0.053586918860673904,
"step": 258
},
{
"epoch": 0.0021345508789569548,
"grad_norm": 5.71875,
"grad_norm_var": 12.61343994140625,
"learning_rate": 5e-05,
"loss": 0.2007,
"loss/crossentropy": 0.37939441204071045,
"loss/dist_ce": 0.0,
"loss/hidden": 0.181640625,
"loss/idx": 0.0,
"loss/logits": 0.019016824662685394,
"step": 259
},
{
"epoch": 0.0021427923881421167,
"grad_norm": 2.59375,
"grad_norm_var": 9.846614583333333,
"learning_rate": 5e-05,
"loss": 0.2159,
"loss/crossentropy": 1.1070560216903687,
"loss/dist_ce": 0.0,
"loss/hidden": 0.185546875,
"loss/idx": 0.0,
"loss/logits": 0.03035794384777546,
"step": 260
},
{
"epoch": 0.0021510338973272786,
"grad_norm": 4.28125,
"grad_norm_var": 9.709251912434896,
"learning_rate": 5e-05,
"loss": 0.2899,
"loss/crossentropy": 1.604844331741333,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2294921875,
"loss/idx": 0.0,
"loss/logits": 0.06044600158929825,
"step": 261
},
{
"epoch": 0.0021592754065124405,
"grad_norm": 2.84375,
"grad_norm_var": 9.623680623372396,
"learning_rate": 5e-05,
"loss": 0.2858,
"loss/crossentropy": 2.6856131553649902,
"loss/dist_ce": 0.0,
"loss/hidden": 0.220703125,
"loss/idx": 0.0,
"loss/logits": 0.06507028639316559,
"step": 262
},
{
"epoch": 0.0021675169156976024,
"grad_norm": 3.734375,
"grad_norm_var": 9.353270467122396,
"learning_rate": 5e-05,
"loss": 0.3012,
"loss/crossentropy": 2.49045991897583,
"loss/dist_ce": 0.0,
"loss/hidden": 0.24609375,
"loss/idx": 0.0,
"loss/logits": 0.055099453777074814,
"step": 263
},
{
"epoch": 0.0021757584248827647,
"grad_norm": 4.65625,
"grad_norm_var": 8.964476521809896,
"learning_rate": 5e-05,
"loss": 0.3876,
"loss/crossentropy": 1.3125131130218506,
"loss/dist_ce": 0.0,
"loss/hidden": 0.330078125,
"loss/idx": 0.0,
"loss/logits": 0.057526711374521255,
"step": 264
},
{
"epoch": 0.0021839999340679266,
"grad_norm": 11.625,
"grad_norm_var": 11.065306599934896,
"learning_rate": 5e-05,
"loss": 0.5553,
"loss/crossentropy": 2.620126485824585,
"loss/dist_ce": 0.0,
"loss/hidden": 0.43359375,
"loss/idx": 0.0,
"loss/logits": 0.12169644981622696,
"step": 265
},
{
"epoch": 0.0021922414432530885,
"grad_norm": 2.09375,
"grad_norm_var": 11.756932576497396,
"learning_rate": 5e-05,
"loss": 0.2092,
"loss/crossentropy": 1.3833715915679932,
"loss/dist_ce": 0.0,
"loss/hidden": 0.173828125,
"loss/idx": 0.0,
"loss/logits": 0.035347893834114075,
"step": 266
},
{
"epoch": 0.0022004829524382504,
"grad_norm": 2.5,
"grad_norm_var": 11.445540364583334,
"learning_rate": 5e-05,
"loss": 0.1932,
"loss/crossentropy": 0.3456151485443115,
"loss/dist_ce": 0.0,
"loss/hidden": 0.177734375,
"loss/idx": 0.0,
"loss/logits": 0.015461962670087814,
"step": 267
},
{
"epoch": 0.0022087244616234127,
"grad_norm": 2.546875,
"grad_norm_var": 11.932225545247396,
"learning_rate": 5e-05,
"loss": 0.1921,
"loss/crossentropy": 2.6017909049987793,
"loss/dist_ce": 0.0,
"loss/hidden": 0.15234375,
"loss/idx": 0.0,
"loss/logits": 0.03972596302628517,
"step": 268
},
{
"epoch": 0.0022169659708085746,
"grad_norm": 14.4375,
"grad_norm_var": 17.290453084309895,
"learning_rate": 5e-05,
"loss": 0.368,
"loss/crossentropy": 2.35398006439209,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2890625,
"loss/idx": 0.0,
"loss/logits": 0.0789838507771492,
"step": 269
},
{
"epoch": 0.0022252074799937365,
"grad_norm": 3.53125,
"grad_norm_var": 17.599608357747396,
"learning_rate": 5e-05,
"loss": 0.2167,
"loss/crossentropy": 2.4693641662597656,
"loss/dist_ce": 0.0,
"loss/hidden": 0.166015625,
"loss/idx": 0.0,
"loss/logits": 0.05065637826919556,
"step": 270
},
{
"epoch": 0.0022334489891788984,
"grad_norm": 2.515625,
"grad_norm_var": 17.895113118489583,
"learning_rate": 5e-05,
"loss": 0.1815,
"loss/crossentropy": 1.3819841146469116,
"loss/dist_ce": 0.0,
"loss/hidden": 0.154296875,
"loss/idx": 0.0,
"loss/logits": 0.027211952954530716,
"step": 271
},
{
"epoch": 0.0022416904983640603,
"grad_norm": 7.0,
"grad_norm_var": 17.078511555989582,
"learning_rate": 5e-05,
"loss": 0.1861,
"loss/crossentropy": 1.4722107648849487,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1572265625,
"loss/idx": 0.0,
"loss/logits": 0.028832225129008293,
"step": 272
},
{
"epoch": 0.0022499320075492226,
"grad_norm": 4.21875,
"grad_norm_var": 12.190022786458334,
"learning_rate": 5e-05,
"loss": 0.3099,
"loss/crossentropy": 1.6392327547073364,
"loss/dist_ce": 0.0,
"loss/hidden": 0.251953125,
"loss/idx": 0.0,
"loss/logits": 0.05795075744390488,
"step": 273
},
{
"epoch": 0.0022581735167343845,
"grad_norm": 5.21875,
"grad_norm_var": 11.989110310872396,
"learning_rate": 5e-05,
"loss": 0.4204,
"loss/crossentropy": 2.4640941619873047,
"loss/dist_ce": 0.0,
"loss/hidden": 0.34765625,
"loss/idx": 0.0,
"loss/logits": 0.0727241188287735,
"step": 274
},
{
"epoch": 0.0022664150259195464,
"grad_norm": 3.21875,
"grad_norm_var": 12.130060831705729,
"learning_rate": 5e-05,
"loss": 0.2795,
"loss/crossentropy": 1.452579140663147,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2314453125,
"loss/idx": 0.0,
"loss/logits": 0.048067688941955566,
"step": 275
},
{
"epoch": 0.0022746565351047083,
"grad_norm": 6.9375,
"grad_norm_var": 12.023729451497395,
"learning_rate": 5e-05,
"loss": 0.2885,
"loss/crossentropy": 1.5026805400848389,
"loss/dist_ce": 0.0,
"loss/hidden": 0.23828125,
"loss/idx": 0.0,
"loss/logits": 0.05024395138025284,
"step": 276
},
{
"epoch": 0.00228289804428987,
"grad_norm": 2.875,
"grad_norm_var": 12.298021443684895,
"learning_rate": 5e-05,
"loss": 0.1911,
"loss/crossentropy": 1.6457816362380981,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1552734375,
"loss/idx": 0.0,
"loss/logits": 0.03582204133272171,
"step": 277
},
{
"epoch": 0.0022911395534750325,
"grad_norm": 2.453125,
"grad_norm_var": 12.419710286458333,
"learning_rate": 5e-05,
"loss": 0.226,
"loss/crossentropy": 2.44157338142395,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1767578125,
"loss/idx": 0.0,
"loss/logits": 0.0492391511797905,
"step": 278
},
{
"epoch": 0.0022993810626601944,
"grad_norm": 3.09375,
"grad_norm_var": 12.55113016764323,
"learning_rate": 5e-05,
"loss": 0.285,
"loss/crossentropy": 2.398951292037964,
"loss/dist_ce": 0.0,
"loss/hidden": 0.21484375,
"loss/idx": 0.0,
"loss/logits": 0.07012955844402313,
"step": 279
},
{
"epoch": 0.0023076225718453563,
"grad_norm": 4.78125,
"grad_norm_var": 12.547500610351562,
"learning_rate": 5e-05,
"loss": 0.3511,
"loss/crossentropy": 2.1601598262786865,
"loss/dist_ce": 0.0,
"loss/hidden": 0.267578125,
"loss/idx": 0.0,
"loss/logits": 0.08354485034942627,
"step": 280
},
{
"epoch": 0.002315864081030518,
"grad_norm": 1.8046875,
"grad_norm_var": 9.82229995727539,
"learning_rate": 5e-05,
"loss": 0.2235,
"loss/crossentropy": 1.5090404748916626,
"loss/dist_ce": 0.0,
"loss/hidden": 0.18359375,
"loss/idx": 0.0,
"loss/logits": 0.03991977125406265,
"step": 281
},
{
"epoch": 0.0023241055902156805,
"grad_norm": 3.375,
"grad_norm_var": 9.5434445699056,
"learning_rate": 5e-05,
"loss": 0.3108,
"loss/crossentropy": 2.371715545654297,
"loss/dist_ce": 0.0,
"loss/hidden": 0.251953125,
"loss/idx": 0.0,
"loss/logits": 0.05880487337708473,
"step": 282
},
{
"epoch": 0.0023323470994008424,
"grad_norm": 4.78125,
"grad_norm_var": 9.288734690348308,
"learning_rate": 5e-05,
"loss": 0.3743,
"loss/crossentropy": 1.7449641227722168,
"loss/dist_ce": 0.0,
"loss/hidden": 0.296875,
"loss/idx": 0.0,
"loss/logits": 0.07747267186641693,
"step": 283
},
{
"epoch": 0.0023405886085860043,
"grad_norm": 2.34375,
"grad_norm_var": 9.345546213785807,
"learning_rate": 5e-05,
"loss": 0.2385,
"loss/crossentropy": 2.332099199295044,
"loss/dist_ce": 0.0,
"loss/hidden": 0.181640625,
"loss/idx": 0.0,
"loss/logits": 0.05684517323970795,
"step": 284
},
{
"epoch": 0.002348830117771166,
"grad_norm": 8.875,
"grad_norm_var": 3.936232248942057,
"learning_rate": 5e-05,
"loss": 0.2679,
"loss/crossentropy": 2.6605751514434814,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2197265625,
"loss/idx": 0.0,
"loss/logits": 0.04814404994249344,
"step": 285
},
{
"epoch": 0.002357071626956328,
"grad_norm": 2.515625,
"grad_norm_var": 4.089766184488933,
"learning_rate": 5e-05,
"loss": 0.2346,
"loss/crossentropy": 2.4595489501953125,
"loss/dist_ce": 0.0,
"loss/hidden": 0.189453125,
"loss/idx": 0.0,
"loss/logits": 0.0451187826693058,
"step": 286
},
{
"epoch": 0.0023653131361414904,
"grad_norm": 6.3125,
"grad_norm_var": 4.175789133707682,
"learning_rate": 5e-05,
"loss": 0.4335,
"loss/crossentropy": 3.0684797763824463,
"loss/dist_ce": 0.0,
"loss/hidden": 0.3125,
"loss/idx": 0.0,
"loss/logits": 0.12099675089120865,
"step": 287
},
{
"epoch": 0.0023735546453266523,
"grad_norm": 2.046875,
"grad_norm_var": 3.967474110921224,
"learning_rate": 5e-05,
"loss": 0.1653,
"loss/crossentropy": 2.7369492053985596,
"loss/dist_ce": 0.0,
"loss/hidden": 0.130859375,
"loss/idx": 0.0,
"loss/logits": 0.03439934179186821,
"step": 288
},
{
"epoch": 0.002381796154511814,
"grad_norm": 37.0,
"grad_norm_var": 71.8541135152181,
"learning_rate": 5e-05,
"loss": 0.2523,
"loss/crossentropy": 1.4466071128845215,
"loss/dist_ce": 0.0,
"loss/hidden": 0.21484375,
"loss/idx": 0.0,
"loss/logits": 0.037433233112096786,
"step": 289
},
{
"epoch": 0.002390037663696976,
"grad_norm": 2.71875,
"grad_norm_var": 72.5391721089681,
"learning_rate": 5e-05,
"loss": 0.183,
"loss/crossentropy": 0.8366924524307251,
"loss/dist_ce": 0.0,
"loss/hidden": 0.154296875,
"loss/idx": 0.0,
"loss/logits": 0.028669871389865875,
"step": 290
},
{
"epoch": 0.002398279172882138,
"grad_norm": 3.296875,
"grad_norm_var": 72.5111467997233,
"learning_rate": 5e-05,
"loss": 0.2282,
"loss/crossentropy": 2.2352423667907715,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1865234375,
"loss/idx": 0.0,
"loss/logits": 0.04168039560317993,
"step": 291
},
{
"epoch": 0.0024065206820673003,
"grad_norm": 7.25,
"grad_norm_var": 72.55836766560873,
"learning_rate": 5e-05,
"loss": 0.5589,
"loss/crossentropy": 3.05739426612854,
"loss/dist_ce": 0.0,
"loss/hidden": 0.45703125,
"loss/idx": 0.0,
"loss/logits": 0.10190241038799286,
"step": 292
},
{
"epoch": 0.002414762191252462,
"grad_norm": 8.4375,
"grad_norm_var": 72.19658788045247,
"learning_rate": 5e-05,
"loss": 0.337,
"loss/crossentropy": 1.8930912017822266,
"loss/dist_ce": 0.0,
"loss/hidden": 0.29296875,
"loss/idx": 0.0,
"loss/logits": 0.04408019781112671,
"step": 293
},
{
"epoch": 0.002423003700437624,
"grad_norm": 10.9375,
"grad_norm_var": 72.32363255818684,
"learning_rate": 5e-05,
"loss": 0.2491,
"loss/crossentropy": 1.5801359415054321,
"loss/dist_ce": 0.0,
"loss/hidden": 0.208984375,
"loss/idx": 0.0,
"loss/logits": 0.04009478539228439,
"step": 294
},
{
"epoch": 0.002431245209622786,
"grad_norm": 5.09375,
"grad_norm_var": 71.57246068318685,
"learning_rate": 5e-05,
"loss": 0.1903,
"loss/crossentropy": 0.9831718802452087,
"loss/dist_ce": 0.0,
"loss/hidden": 0.158203125,
"loss/idx": 0.0,
"loss/logits": 0.032129500061273575,
"step": 295
},
{
"epoch": 0.0024394867188079483,
"grad_norm": 2.765625,
"grad_norm_var": 72.41545384724935,
"learning_rate": 5e-05,
"loss": 0.3458,
"loss/crossentropy": 2.587148666381836,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2470703125,
"loss/idx": 0.0,
"loss/logits": 0.09877443313598633,
"step": 296
},
{
"epoch": 0.00244772822799311,
"grad_norm": 2.8125,
"grad_norm_var": 71.80135091145833,
"learning_rate": 5e-05,
"loss": 0.2355,
"loss/crossentropy": 2.281587839126587,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1865234375,
"loss/idx": 0.0,
"loss/logits": 0.04900825023651123,
"step": 297
},
{
"epoch": 0.002455969737178272,
"grad_norm": 2.171875,
"grad_norm_var": 72.45891825358073,
"learning_rate": 5e-05,
"loss": 0.1823,
"loss/crossentropy": 1.4672614336013794,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1552734375,
"loss/idx": 0.0,
"loss/logits": 0.02706265263259411,
"step": 298
},
{
"epoch": 0.002464211246363434,
"grad_norm": 2.953125,
"grad_norm_var": 73.16838785807292,
"learning_rate": 5e-05,
"loss": 0.2346,
"loss/crossentropy": 2.4047231674194336,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1865234375,
"loss/idx": 0.0,
"loss/logits": 0.048083603382110596,
"step": 299
},
{
"epoch": 0.002472452755548596,
"grad_norm": 11.1875,
"grad_norm_var": 72.89547526041666,
"learning_rate": 5e-05,
"loss": 0.3081,
"loss/crossentropy": 0.815432071685791,
"loss/dist_ce": 0.0,
"loss/hidden": 0.279296875,
"loss/idx": 0.0,
"loss/logits": 0.028755802661180496,
"step": 300
},
{
"epoch": 0.0024806942647337582,
"grad_norm": 5.34375,
"grad_norm_var": 72.92076416015625,
"learning_rate": 5e-05,
"loss": 0.3042,
"loss/crossentropy": 1.8723258972167969,
"loss/dist_ce": 0.0,
"loss/hidden": 0.248046875,
"loss/idx": 0.0,
"loss/logits": 0.05616258084774017,
"step": 301
},
{
"epoch": 0.00248893577391892,
"grad_norm": 18.0,
"grad_norm_var": 78.5388905843099,
"learning_rate": 5e-05,
"loss": 0.3228,
"loss/crossentropy": 1.394120693206787,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2890625,
"loss/idx": 0.0,
"loss/logits": 0.03374548256397247,
"step": 302
},
{
"epoch": 0.002497177283104082,
"grad_norm": 3.984375,
"grad_norm_var": 79.40784505208333,
"learning_rate": 5e-05,
"loss": 0.1944,
"loss/crossentropy": 1.3347328901290894,
"loss/dist_ce": 0.0,
"loss/hidden": 0.16796875,
"loss/idx": 0.0,
"loss/logits": 0.02643435075879097,
"step": 303
},
{
"epoch": 0.002505418792289244,
"grad_norm": 3.3125,
"grad_norm_var": 78.5244618733724,
"learning_rate": 5e-05,
"loss": 0.2445,
"loss/crossentropy": 1.3129806518554688,
"loss/dist_ce": 0.0,
"loss/hidden": 0.201171875,
"loss/idx": 0.0,
"loss/logits": 0.04336348548531532,
"step": 304
},
{
"epoch": 0.002513660301474406,
"grad_norm": 2.5,
"grad_norm_var": 19.303954060872396,
"learning_rate": 5e-05,
"loss": 0.2061,
"loss/crossentropy": 1.494554042816162,
"loss/dist_ce": 0.0,
"loss/hidden": 0.181640625,
"loss/idx": 0.0,
"loss/logits": 0.02449900656938553,
"step": 305
},
{
"epoch": 0.002521901810659568,
"grad_norm": 2.0625,
"grad_norm_var": 19.600291951497397,
"learning_rate": 5e-05,
"loss": 0.2326,
"loss/crossentropy": 2.698347806930542,
"loss/dist_ce": 0.0,
"loss/hidden": 0.19140625,
"loss/idx": 0.0,
"loss/logits": 0.04119253158569336,
"step": 306
},
{
"epoch": 0.00253014331984473,
"grad_norm": 3.890625,
"grad_norm_var": 19.427578735351563,
"learning_rate": 5e-05,
"loss": 0.4184,
"loss/crossentropy": 1.5166016817092896,
"loss/dist_ce": 0.0,
"loss/hidden": 0.3515625,
"loss/idx": 0.0,
"loss/logits": 0.06688607484102249,
"step": 307
},
{
"epoch": 0.002538384829029892,
"grad_norm": 7.28125,
"grad_norm_var": 19.43370666503906,
"learning_rate": 5e-05,
"loss": 0.2909,
"loss/crossentropy": 2.696192979812622,
"loss/dist_ce": 0.0,
"loss/hidden": 0.22265625,
"loss/idx": 0.0,
"loss/logits": 0.068264901638031,
"step": 308
},
{
"epoch": 0.002546626338215054,
"grad_norm": 4.875,
"grad_norm_var": 18.97215881347656,
"learning_rate": 5e-05,
"loss": 0.2964,
"loss/crossentropy": 2.594907522201538,
"loss/dist_ce": 0.0,
"loss/hidden": 0.23828125,
"loss/idx": 0.0,
"loss/logits": 0.05810549482703209,
"step": 309
},
{
"epoch": 0.002554867847400216,
"grad_norm": 2.59375,
"grad_norm_var": 17.355557250976563,
"learning_rate": 5e-05,
"loss": 0.2423,
"loss/crossentropy": 2.760004758834839,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1884765625,
"loss/idx": 0.0,
"loss/logits": 0.05380372703075409,
"step": 310
},
{
"epoch": 0.002563109356585378,
"grad_norm": 2.84375,
"grad_norm_var": 17.659365844726562,
"learning_rate": 5e-05,
"loss": 0.2252,
"loss/crossentropy": 1.6147583723068237,
"loss/dist_ce": 0.0,
"loss/hidden": 0.189453125,
"loss/idx": 0.0,
"loss/logits": 0.035766348242759705,
"step": 311
},
{
"epoch": 0.00257135086577054,
"grad_norm": 1.8671875,
"grad_norm_var": 17.966829172770183,
"learning_rate": 5e-05,
"loss": 0.1994,
"loss/crossentropy": 2.396746873855591,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1591796875,
"loss/idx": 0.0,
"loss/logits": 0.04022689908742905,
"step": 312
},
{
"epoch": 0.002579592374955702,
"grad_norm": 2.71875,
"grad_norm_var": 17.992909495035807,
"learning_rate": 5e-05,
"loss": 0.2099,
"loss/crossentropy": 1.5479542016983032,
"loss/dist_ce": 0.0,
"loss/hidden": 0.177734375,
"loss/idx": 0.0,
"loss/logits": 0.03212570399045944,
"step": 313
},
{
"epoch": 0.0025878338841408637,
"grad_norm": 1.34375,
"grad_norm_var": 18.331384023030598,
"learning_rate": 5e-05,
"loss": 0.1331,
"loss/crossentropy": 0.4866638779640198,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1171875,
"loss/idx": 0.0,
"loss/logits": 0.01593683287501335,
"step": 314
},
{
"epoch": 0.002596075393326026,
"grad_norm": 2.3125,
"grad_norm_var": 18.5145627339681,
"learning_rate": 5e-05,
"loss": 0.2208,
"loss/crossentropy": 2.5530097484588623,
"loss/dist_ce": 0.0,
"loss/hidden": 0.18359375,
"loss/idx": 0.0,
"loss/logits": 0.037243057042360306,
"step": 315
},
{
"epoch": 0.002604316902511188,
"grad_norm": 9.3125,
"grad_norm_var": 17.1267453511556,
"learning_rate": 5e-05,
"loss": 0.2906,
"loss/crossentropy": 1.721799373626709,
"loss/dist_ce": 0.0,
"loss/hidden": 0.251953125,
"loss/idx": 0.0,
"loss/logits": 0.03867912292480469,
"step": 316
},
{
"epoch": 0.00261255841169635,
"grad_norm": 4.21875,
"grad_norm_var": 17.100304921468098,
"learning_rate": 5e-05,
"loss": 0.2352,
"loss/crossentropy": 2.6998496055603027,
"loss/dist_ce": 0.0,
"loss/hidden": 0.189453125,
"loss/idx": 0.0,
"loss/logits": 0.04570027440786362,
"step": 317
},
{
"epoch": 0.0026207999208815117,
"grad_norm": 2.9375,
"grad_norm_var": 4.307966868082683,
"learning_rate": 5e-05,
"loss": 0.268,
"loss/crossentropy": 2.9610462188720703,
"loss/dist_ce": 0.0,
"loss/hidden": 0.203125,
"loss/idx": 0.0,
"loss/logits": 0.06485921144485474,
"step": 318
},
{
"epoch": 0.0026290414300666736,
"grad_norm": 14.9375,
"grad_norm_var": 12.325996653238933,
"learning_rate": 5e-05,
"loss": 0.4275,
"loss/crossentropy": 0.4579806327819824,
"loss/dist_ce": 0.0,
"loss/hidden": 0.369140625,
"loss/idx": 0.0,
"loss/logits": 0.058338165283203125,
"step": 319
},
{
"epoch": 0.002637282939251836,
"grad_norm": 2.6875,
"grad_norm_var": 12.433784739176433,
"learning_rate": 5e-05,
"loss": 0.2148,
"loss/crossentropy": 0.9076347947120667,
"loss/dist_ce": 0.0,
"loss/hidden": 0.181640625,
"loss/idx": 0.0,
"loss/logits": 0.033187899738550186,
"step": 320
},
{
"epoch": 0.002645524448436998,
"grad_norm": 7.5,
"grad_norm_var": 12.813667551676433,
"learning_rate": 5e-05,
"loss": 0.2431,
"loss/crossentropy": 1.4940351247787476,
"loss/dist_ce": 0.0,
"loss/hidden": 0.197265625,
"loss/idx": 0.0,
"loss/logits": 0.04587508738040924,
"step": 321
},
{
"epoch": 0.0026537659576221597,
"grad_norm": 3.09375,
"grad_norm_var": 12.533095041910807,
"learning_rate": 5e-05,
"loss": 0.2585,
"loss/crossentropy": 2.5319809913635254,
"loss/dist_ce": 0.0,
"loss/hidden": 0.193359375,
"loss/idx": 0.0,
"loss/logits": 0.06514355540275574,
"step": 322
},
{
"epoch": 0.0026620074668073216,
"grad_norm": 2.34375,
"grad_norm_var": 12.839448801676433,
"learning_rate": 5e-05,
"loss": 0.1839,
"loss/crossentropy": 1.5948070287704468,
"loss/dist_ce": 0.0,
"loss/hidden": 0.150390625,
"loss/idx": 0.0,
"loss/logits": 0.03354639932513237,
"step": 323
},
{
"epoch": 0.002670248975992484,
"grad_norm": 2.109375,
"grad_norm_var": 12.630688222249349,
"learning_rate": 5e-05,
"loss": 0.2014,
"loss/crossentropy": 0.86527419090271,
"loss/dist_ce": 0.0,
"loss/hidden": 0.173828125,
"loss/idx": 0.0,
"loss/logits": 0.027525369077920914,
"step": 324
},
{
"epoch": 0.002678490485177646,
"grad_norm": 5.4375,
"grad_norm_var": 12.698766835530598,
"learning_rate": 5e-05,
"loss": 0.3587,
"loss/crossentropy": 1.7473679780960083,
"loss/dist_ce": 0.0,
"loss/hidden": 0.27734375,
"loss/idx": 0.0,
"loss/logits": 0.081350177526474,
"step": 325
},
{
"epoch": 0.0026867319943628077,
"grad_norm": 3.8125,
"grad_norm_var": 12.519842274983723,
"learning_rate": 5e-05,
"loss": 0.2829,
"loss/crossentropy": 2.330885410308838,
"loss/dist_ce": 0.0,
"loss/hidden": 0.232421875,
"loss/idx": 0.0,
"loss/logits": 0.05051898583769798,
"step": 326
},
{
"epoch": 0.0026949735035479696,
"grad_norm": 3.078125,
"grad_norm_var": 12.476446278889973,
"learning_rate": 5e-05,
"loss": 0.2309,
"loss/crossentropy": 1.6335246562957764,
"loss/dist_ce": 0.0,
"loss/hidden": 0.19140625,
"loss/idx": 0.0,
"loss/logits": 0.0394761748611927,
"step": 327
},
{
"epoch": 0.0027032150127331315,
"grad_norm": 3.53125,
"grad_norm_var": 12.097102864583333,
"learning_rate": 5e-05,
"loss": 0.1913,
"loss/crossentropy": 1.0116859674453735,
"loss/dist_ce": 0.0,
"loss/hidden": 0.169921875,
"loss/idx": 0.0,
"loss/logits": 0.021392133086919785,
"step": 328
},
{
"epoch": 0.002711456521918294,
"grad_norm": 108.0,
"grad_norm_var": 680.3999959309896,
"learning_rate": 5e-05,
"loss": 0.7489,
"loss/crossentropy": 1.9841949939727783,
"loss/dist_ce": 0.0,
"loss/hidden": 0.609375,
"loss/idx": 0.0,
"loss/logits": 0.13949471712112427,
"step": 329
},
{
"epoch": 0.0027196980311034557,
"grad_norm": 1.8515625,
"grad_norm_var": 679.7595273335775,
"learning_rate": 5e-05,
"loss": 0.1969,
"loss/crossentropy": 2.633579730987549,
"loss/dist_ce": 0.0,
"loss/hidden": 0.154296875,
"loss/idx": 0.0,
"loss/logits": 0.042630117386579514,
"step": 330
},
{
"epoch": 0.0027279395402886176,
"grad_norm": 3.640625,
"grad_norm_var": 678.318477121989,
"learning_rate": 5e-05,
"loss": 0.3044,
"loss/crossentropy": 1.2868930101394653,
"loss/dist_ce": 0.0,
"loss/hidden": 0.25,
"loss/idx": 0.0,
"loss/logits": 0.054360825568437576,
"step": 331
},
{
"epoch": 0.0027361810494737795,
"grad_norm": 2.0,
"grad_norm_var": 683.4576983133952,
"learning_rate": 5e-05,
"loss": 0.1712,
"loss/crossentropy": 1.3776507377624512,
"loss/dist_ce": 0.0,
"loss/hidden": 0.14453125,
"loss/idx": 0.0,
"loss/logits": 0.02663344331085682,
"step": 332
},
{
"epoch": 0.0027444225586589414,
"grad_norm": 1.875,
"grad_norm_var": 685.8260149637858,
"learning_rate": 5e-05,
"loss": 0.2246,
"loss/crossentropy": 2.4259564876556396,
"loss/dist_ce": 0.0,
"loss/hidden": 0.177734375,
"loss/idx": 0.0,
"loss/logits": 0.04682992398738861,
"step": 333
},
{
"epoch": 0.0027526640678441038,
"grad_norm": 2.484375,
"grad_norm_var": 686.2989051818847,
"learning_rate": 5e-05,
"loss": 0.1464,
"loss/crossentropy": 0.36722007393836975,
"loss/dist_ce": 0.0,
"loss/hidden": 0.134765625,
"loss/idx": 0.0,
"loss/logits": 0.011639876291155815,
"step": 334
},
{
"epoch": 0.0027609055770292656,
"grad_norm": 2.484375,
"grad_norm_var": 688.6630531311035,
"learning_rate": 5e-05,
"loss": 0.182,
"loss/crossentropy": 1.3434722423553467,
"loss/dist_ce": 0.0,
"loss/hidden": 0.150390625,
"loss/idx": 0.0,
"loss/logits": 0.03163960948586464,
"step": 335
},
{
"epoch": 0.0027691470862144275,
"grad_norm": 9.4375,
"grad_norm_var": 685.1584144592285,
"learning_rate": 5e-05,
"loss": 0.5014,
"loss/crossentropy": 1.7821474075317383,
"loss/dist_ce": 0.0,
"loss/hidden": 0.380859375,
"loss/idx": 0.0,
"loss/logits": 0.12050823867321014,
"step": 336
},
{
"epoch": 0.0027773885953995894,
"grad_norm": 2.765625,
"grad_norm_var": 688.2431556701661,
"learning_rate": 5e-05,
"loss": 0.1513,
"loss/crossentropy": 2.026543378829956,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1259765625,
"loss/idx": 0.0,
"loss/logits": 0.025330830365419388,
"step": 337
},
{
"epoch": 0.0027856301045847518,
"grad_norm": 2.25,
"grad_norm_var": 689.0501564025878,
"learning_rate": 5e-05,
"loss": 0.1708,
"loss/crossentropy": 1.9071934223175049,
"loss/dist_ce": 0.0,
"loss/hidden": 0.13671875,
"loss/idx": 0.0,
"loss/logits": 0.03409082442522049,
"step": 338
},
{
"epoch": 0.0027938716137699137,
"grad_norm": 1.75,
"grad_norm_var": 689.6639686584473,
"learning_rate": 5e-05,
"loss": 0.1443,
"loss/crossentropy": 0.5046422481536865,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1298828125,
"loss/idx": 0.0,
"loss/logits": 0.014421624131500721,
"step": 339
},
{
"epoch": 0.0028021131229550755,
"grad_norm": 2.59375,
"grad_norm_var": 689.183125559489,
"learning_rate": 5e-05,
"loss": 0.2108,
"loss/crossentropy": 1.3842849731445312,
"loss/dist_ce": 0.0,
"loss/hidden": 0.177734375,
"loss/idx": 0.0,
"loss/logits": 0.03306809440255165,
"step": 340
},
{
"epoch": 0.0028103546321402374,
"grad_norm": 3.984375,
"grad_norm_var": 690.1626604715983,
"learning_rate": 5e-05,
"loss": 0.3321,
"loss/crossentropy": 2.6099562644958496,
"loss/dist_ce": 0.0,
"loss/hidden": 0.248046875,
"loss/idx": 0.0,
"loss/logits": 0.08409686386585236,
"step": 341
},
{
"epoch": 0.0028185961413253993,
"grad_norm": 1.9296875,
"grad_norm_var": 691.8675496419271,
"learning_rate": 5e-05,
"loss": 0.1953,
"loss/crossentropy": 1.5371732711791992,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1572265625,
"loss/idx": 0.0,
"loss/logits": 0.03804505988955498,
"step": 342
},
{
"epoch": 0.0028268376505105617,
"grad_norm": 2.828125,
"grad_norm_var": 692.0889689127604,
"learning_rate": 5e-05,
"loss": 0.2334,
"loss/crossentropy": 2.7439146041870117,
"loss/dist_ce": 0.0,
"loss/hidden": 0.177734375,
"loss/idx": 0.0,
"loss/logits": 0.055648088455200195,
"step": 343
},
{
"epoch": 0.0028350791596957236,
"grad_norm": 2.171875,
"grad_norm_var": 693.3022288004557,
"learning_rate": 5e-05,
"loss": 0.1965,
"loss/crossentropy": 1.5956981182098389,
"loss/dist_ce": 0.0,
"loss/hidden": 0.16015625,
"loss/idx": 0.0,
"loss/logits": 0.036296091973781586,
"step": 344
},
{
"epoch": 0.0028433206688808855,
"grad_norm": 2.59375,
"grad_norm_var": 3.4128326416015624,
"learning_rate": 5e-05,
"loss": 0.1868,
"loss/crossentropy": 1.582602858543396,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1533203125,
"loss/idx": 0.0,
"loss/logits": 0.033465512096881866,
"step": 345
},
{
"epoch": 0.0028515621780660473,
"grad_norm": 2.234375,
"grad_norm_var": 3.3677101135253906,
"learning_rate": 5e-05,
"loss": 0.1981,
"loss/crossentropy": 1.295432209968567,
"loss/dist_ce": 0.0,
"loss/hidden": 0.16796875,
"loss/idx": 0.0,
"loss/logits": 0.030169658362865448,
"step": 346
},
{
"epoch": 0.0028598036872512092,
"grad_norm": 2.4375,
"grad_norm_var": 3.3456214904785155,
"learning_rate": 5e-05,
"loss": 0.2163,
"loss/crossentropy": 1.2951956987380981,
"loss/dist_ce": 0.0,
"loss/hidden": 0.181640625,
"loss/idx": 0.0,
"loss/logits": 0.034686122089624405,
"step": 347
},
{
"epoch": 0.0028680451964363716,
"grad_norm": 3.015625,
"grad_norm_var": 3.293121083577474,
"learning_rate": 5e-05,
"loss": 0.2123,
"loss/crossentropy": 0.4816242456436157,
"loss/dist_ce": 0.0,
"loss/hidden": 0.193359375,
"loss/idx": 0.0,
"loss/logits": 0.018970437347888947,
"step": 348
},
{
"epoch": 0.0028762867056215335,
"grad_norm": 3.15625,
"grad_norm_var": 3.2159624735514325,
"learning_rate": 5e-05,
"loss": 0.2762,
"loss/crossentropy": 2.688483715057373,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2060546875,
"loss/idx": 0.0,
"loss/logits": 0.07011875510215759,
"step": 349
},
{
"epoch": 0.0028845282148066954,
"grad_norm": 3.5625,
"grad_norm_var": 3.2134356180826824,
"learning_rate": 5e-05,
"loss": 0.2319,
"loss/crossentropy": 2.7942285537719727,
"loss/dist_ce": 0.0,
"loss/hidden": 0.181640625,
"loss/idx": 0.0,
"loss/logits": 0.050256311893463135,
"step": 350
},
{
"epoch": 0.0028927697239918572,
"grad_norm": 2.890625,
"grad_norm_var": 3.1917742411295573,
"learning_rate": 5e-05,
"loss": 0.2219,
"loss/crossentropy": 0.3606606721878052,
"loss/dist_ce": 0.0,
"loss/hidden": 0.201171875,
"loss/idx": 0.0,
"loss/logits": 0.020754382014274597,
"step": 351
},
{
"epoch": 0.0029010112331770196,
"grad_norm": 2.640625,
"grad_norm_var": 0.33584772745768227,
"learning_rate": 5e-05,
"loss": 0.2024,
"loss/crossentropy": 1.9477823972702026,
"loss/dist_ce": 0.0,
"loss/hidden": 0.158203125,
"loss/idx": 0.0,
"loss/logits": 0.04415898397564888,
"step": 352
},
{
"epoch": 0.0029092527423621815,
"grad_norm": 2.90625,
"grad_norm_var": 0.33877741495768227,
"learning_rate": 5e-05,
"loss": 0.2767,
"loss/crossentropy": 1.250917673110962,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2314453125,
"loss/idx": 0.0,
"loss/logits": 0.04520602151751518,
"step": 353
},
{
"epoch": 0.0029174942515473434,
"grad_norm": 6.28125,
"grad_norm_var": 1.1211443583170573,
"learning_rate": 5e-05,
"loss": 0.3595,
"loss/crossentropy": 2.2711970806121826,
"loss/dist_ce": 0.0,
"loss/hidden": 0.28125,
"loss/idx": 0.0,
"loss/logits": 0.07829815149307251,
"step": 354
},
{
"epoch": 0.0029257357607325053,
"grad_norm": 3.1875,
"grad_norm_var": 1.0229713439941406,
"learning_rate": 5e-05,
"loss": 0.319,
"loss/crossentropy": 2.352555990219116,
"loss/dist_ce": 0.0,
"loss/hidden": 0.251953125,
"loss/idx": 0.0,
"loss/logits": 0.06703340262174606,
"step": 355
},
{
"epoch": 0.002933977269917667,
"grad_norm": 7.3125,
"grad_norm_var": 2.142752838134766,
"learning_rate": 5e-05,
"loss": 0.3474,
"loss/crossentropy": 2.538165807723999,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2890625,
"loss/idx": 0.0,
"loss/logits": 0.05830331891775131,
"step": 356
},
{
"epoch": 0.0029422187791028295,
"grad_norm": 6.53125,
"grad_norm_var": 2.7735023498535156,
"learning_rate": 5e-05,
"loss": 0.222,
"loss/crossentropy": 1.2392635345458984,
"loss/dist_ce": 0.0,
"loss/hidden": 0.19140625,
"loss/idx": 0.0,
"loss/logits": 0.030571604147553444,
"step": 357
},
{
"epoch": 0.0029504602882879914,
"grad_norm": 2.140625,
"grad_norm_var": 2.7326812744140625,
"learning_rate": 5e-05,
"loss": 0.1149,
"loss/crossentropy": 0.3575584590435028,
"loss/dist_ce": 0.0,
"loss/hidden": 0.107421875,
"loss/idx": 0.0,
"loss/logits": 0.007437488064169884,
"step": 358
},
{
"epoch": 0.0029587017974731533,
"grad_norm": 1.953125,
"grad_norm_var": 2.8581207275390623,
"learning_rate": 5e-05,
"loss": 0.1523,
"loss/crossentropy": 1.4491117000579834,
"loss/dist_ce": 0.0,
"loss/hidden": 0.126953125,
"loss/idx": 0.0,
"loss/logits": 0.025374623015522957,
"step": 359
},
{
"epoch": 0.002966943306658315,
"grad_norm": 6.875,
"grad_norm_var": 3.4463175455729167,
"learning_rate": 5e-05,
"loss": 0.3615,
"loss/crossentropy": 2.15775203704834,
"loss/dist_ce": 0.0,
"loss/hidden": 0.294921875,
"loss/idx": 0.0,
"loss/logits": 0.06661273539066315,
"step": 360
},
{
"epoch": 0.002975184815843477,
"grad_norm": 9.4375,
"grad_norm_var": 5.334586588541667,
"learning_rate": 5e-05,
"loss": 0.2964,
"loss/crossentropy": 1.3742177486419678,
"loss/dist_ce": 0.0,
"loss/hidden": 0.259765625,
"loss/idx": 0.0,
"loss/logits": 0.036625977605581284,
"step": 361
},
{
"epoch": 0.0029834263250286394,
"grad_norm": 1.75,
"grad_norm_var": 5.473623657226563,
"learning_rate": 5e-05,
"loss": 0.1747,
"loss/crossentropy": 1.4426945447921753,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1455078125,
"loss/idx": 0.0,
"loss/logits": 0.02921513468027115,
"step": 362
},
{
"epoch": 0.0029916678342138013,
"grad_norm": 1.6640625,
"grad_norm_var": 5.685538482666016,
"learning_rate": 5e-05,
"loss": 0.1638,
"loss/crossentropy": 1.559606909751892,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1357421875,
"loss/idx": 0.0,
"loss/logits": 0.028026653453707695,
"step": 363
},
{
"epoch": 0.002999909343398963,
"grad_norm": 2.109375,
"grad_norm_var": 5.865667470296224,
"learning_rate": 5e-05,
"loss": 0.1489,
"loss/crossentropy": 0.7043201923370361,
"loss/dist_ce": 0.0,
"loss/hidden": 0.130859375,
"loss/idx": 0.0,
"loss/logits": 0.018008584156632423,
"step": 364
},
{
"epoch": 0.003008150852584125,
"grad_norm": 4.1875,
"grad_norm_var": 5.812695058186849,
"learning_rate": 5e-05,
"loss": 0.2217,
"loss/crossentropy": 1.9148753881454468,
"loss/dist_ce": 0.0,
"loss/hidden": 0.177734375,
"loss/idx": 0.0,
"loss/logits": 0.0439751073718071,
"step": 365
},
{
"epoch": 0.0030163923617692874,
"grad_norm": 4.125,
"grad_norm_var": 5.792956288655599,
"learning_rate": 5e-05,
"loss": 0.3083,
"loss/crossentropy": 2.424750328063965,
"loss/dist_ce": 0.0,
"loss/hidden": 0.23046875,
"loss/idx": 0.0,
"loss/logits": 0.07787832617759705,
"step": 366
},
{
"epoch": 0.0030246338709544493,
"grad_norm": 2.09375,
"grad_norm_var": 5.963744862874349,
"learning_rate": 5e-05,
"loss": 0.1791,
"loss/crossentropy": 1.68095862865448,
"loss/dist_ce": 0.0,
"loss/hidden": 0.146484375,
"loss/idx": 0.0,
"loss/logits": 0.03266463428735733,
"step": 367
},
{
"epoch": 0.003032875380139611,
"grad_norm": 11.875,
"grad_norm_var": 9.52763646443685,
"learning_rate": 5e-05,
"loss": 0.5786,
"loss/crossentropy": 1.7796623706817627,
"loss/dist_ce": 0.0,
"loss/hidden": 0.4765625,
"loss/idx": 0.0,
"loss/logits": 0.10202518105506897,
"step": 368
},
{
"epoch": 0.003041116889324773,
"grad_norm": 2.953125,
"grad_norm_var": 9.516863759358724,
"learning_rate": 5e-05,
"loss": 0.2031,
"loss/crossentropy": 1.285621166229248,
"loss/dist_ce": 0.0,
"loss/hidden": 0.177734375,
"loss/idx": 0.0,
"loss/logits": 0.025359109044075012,
"step": 369
},
{
"epoch": 0.003049358398509935,
"grad_norm": 1.953125,
"grad_norm_var": 9.749049631754557,
"learning_rate": 5e-05,
"loss": 0.176,
"loss/crossentropy": 1.5697388648986816,
"loss/dist_ce": 0.0,
"loss/hidden": 0.14453125,
"loss/idx": 0.0,
"loss/logits": 0.03151218220591545,
"step": 370
},
{
"epoch": 0.0030575999076950973,
"grad_norm": 3.0,
"grad_norm_var": 9.781166330973308,
"learning_rate": 5e-05,
"loss": 0.2175,
"loss/crossentropy": 1.4509150981903076,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1845703125,
"loss/idx": 0.0,
"loss/logits": 0.03292187303304672,
"step": 371
},
{
"epoch": 0.003065841416880259,
"grad_norm": 1.75,
"grad_norm_var": 9.534547678629558,
"learning_rate": 5e-05,
"loss": 0.1677,
"loss/crossentropy": 2.5047202110290527,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1318359375,
"loss/idx": 0.0,
"loss/logits": 0.03586728125810623,
"step": 372
},
{
"epoch": 0.003074082926065421,
"grad_norm": 5.5,
"grad_norm_var": 9.256392161051432,
"learning_rate": 5e-05,
"loss": 0.2845,
"loss/crossentropy": 1.728163719177246,
"loss/dist_ce": 0.0,
"loss/hidden": 0.23828125,
"loss/idx": 0.0,
"loss/logits": 0.04620472714304924,
"step": 373
},
{
"epoch": 0.003082324435250583,
"grad_norm": 3.109375,
"grad_norm_var": 9.079986317952473,
"learning_rate": 5e-05,
"loss": 0.2467,
"loss/crossentropy": 2.323497772216797,
"loss/dist_ce": 0.0,
"loss/hidden": 0.201171875,
"loss/idx": 0.0,
"loss/logits": 0.04549071192741394,
"step": 374
},
{
"epoch": 0.003090565944435745,
"grad_norm": 2.734375,
"grad_norm_var": 8.902730051676432,
"learning_rate": 5e-05,
"loss": 0.1624,
"loss/crossentropy": 1.6974822282791138,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1357421875,
"loss/idx": 0.0,
"loss/logits": 0.026702899485826492,
"step": 375
},
{
"epoch": 0.003098807453620907,
"grad_norm": 2.046875,
"grad_norm_var": 8.553822580973307,
"learning_rate": 5e-05,
"loss": 0.1575,
"loss/crossentropy": 1.636716604232788,
"loss/dist_ce": 0.0,
"loss/hidden": 0.12890625,
"loss/idx": 0.0,
"loss/logits": 0.02862347848713398,
"step": 376
},
{
"epoch": 0.003107048962806069,
"grad_norm": 3.953125,
"grad_norm_var": 6.287947336832683,
"learning_rate": 5e-05,
"loss": 0.366,
"loss/crossentropy": 2.5682785511016846,
"loss/dist_ce": 0.0,
"loss/hidden": 0.28125,
"loss/idx": 0.0,
"loss/logits": 0.08478732407093048,
"step": 377
},
{
"epoch": 0.003115290471991231,
"grad_norm": 4.4375,
"grad_norm_var": 6.139050038655599,
"learning_rate": 5e-05,
"loss": 0.236,
"loss/crossentropy": 1.9300963878631592,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1884765625,
"loss/idx": 0.0,
"loss/logits": 0.04753483831882477,
"step": 378
},
{
"epoch": 0.003123531981176393,
"grad_norm": 8.0,
"grad_norm_var": 7.0182851155598955,
"learning_rate": 5e-05,
"loss": 0.3625,
"loss/crossentropy": 1.655861496925354,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2890625,
"loss/idx": 0.0,
"loss/logits": 0.0734243243932724,
"step": 379
},
{
"epoch": 0.003131773490361555,
"grad_norm": 4.71875,
"grad_norm_var": 6.789794921875,
"learning_rate": 5e-05,
"loss": 0.3235,
"loss/crossentropy": 2.3060801029205322,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2578125,
"loss/idx": 0.0,
"loss/logits": 0.06571874022483826,
"step": 380
},
{
"epoch": 0.003140014999546717,
"grad_norm": 3.640625,
"grad_norm_var": 6.8059234619140625,
"learning_rate": 5e-05,
"loss": 0.2763,
"loss/crossentropy": 2.67515230178833,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2236328125,
"loss/idx": 0.0,
"loss/logits": 0.05268421396613121,
"step": 381
},
{
"epoch": 0.003148256508731879,
"grad_norm": 8.6875,
"grad_norm_var": 8.111107381184896,
"learning_rate": 5e-05,
"loss": 0.2852,
"loss/crossentropy": 2.6763105392456055,
"loss/dist_ce": 0.0,
"loss/hidden": 0.220703125,
"loss/idx": 0.0,
"loss/logits": 0.06450507789850235,
"step": 382
},
{
"epoch": 0.003156498017917041,
"grad_norm": 3.78125,
"grad_norm_var": 7.769432576497396,
"learning_rate": 5e-05,
"loss": 0.2415,
"loss/crossentropy": 2.8496878147125244,
"loss/dist_ce": 0.0,
"loss/hidden": 0.189453125,
"loss/idx": 0.0,
"loss/logits": 0.052030615508556366,
"step": 383
},
{
"epoch": 0.0031647395271022028,
"grad_norm": 4.21875,
"grad_norm_var": 3.9133941650390627,
"learning_rate": 5e-05,
"loss": 0.1925,
"loss/crossentropy": 1.3341301679611206,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1630859375,
"loss/idx": 0.0,
"loss/logits": 0.02942117676138878,
"step": 384
},
{
"epoch": 0.003172981036287365,
"grad_norm": 1.984375,
"grad_norm_var": 4.1111806233723955,
"learning_rate": 5e-05,
"loss": 0.1985,
"loss/crossentropy": 2.383344888687134,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1533203125,
"loss/idx": 0.0,
"loss/logits": 0.04519602656364441,
"step": 385
},
{
"epoch": 0.003181222545472527,
"grad_norm": 2.046875,
"grad_norm_var": 4.0865224202473955,
"learning_rate": 5e-05,
"loss": 0.235,
"loss/crossentropy": 2.4512088298797607,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1806640625,
"loss/idx": 0.0,
"loss/logits": 0.054316744208335876,
"step": 386
},
{
"epoch": 0.003189464054657689,
"grad_norm": 2.03125,
"grad_norm_var": 4.271190388997396,
"learning_rate": 5e-05,
"loss": 0.187,
"loss/crossentropy": 1.7021174430847168,
"loss/dist_ce": 0.0,
"loss/hidden": 0.15234375,
"loss/idx": 0.0,
"loss/logits": 0.034697070717811584,
"step": 387
},
{
"epoch": 0.003197705563842851,
"grad_norm": 6.78125,
"grad_norm_var": 4.400902303059896,
"learning_rate": 5e-05,
"loss": 0.3263,
"loss/crossentropy": 2.0133919715881348,
"loss/dist_ce": 0.0,
"loss/hidden": 0.25,
"loss/idx": 0.0,
"loss/logits": 0.07632862031459808,
"step": 388
},
{
"epoch": 0.0032059470730280127,
"grad_norm": 2.484375,
"grad_norm_var": 4.458426920572917,
"learning_rate": 5e-05,
"loss": 0.2092,
"loss/crossentropy": 2.6211116313934326,
"loss/dist_ce": 0.0,
"loss/hidden": 0.162109375,
"loss/idx": 0.0,
"loss/logits": 0.04707195237278938,
"step": 389
},
{
"epoch": 0.003214188582213175,
"grad_norm": 1.671875,
"grad_norm_var": 4.766141764322916,
"learning_rate": 5e-05,
"loss": 0.1619,
"loss/crossentropy": 1.861954689025879,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1298828125,
"loss/idx": 0.0,
"loss/logits": 0.031997717916965485,
"step": 390
},
{
"epoch": 0.003222430091398337,
"grad_norm": 3.953125,
"grad_norm_var": 4.661246744791667,
"learning_rate": 5e-05,
"loss": 0.2297,
"loss/crossentropy": 1.4026638269424438,
"loss/dist_ce": 0.0,
"loss/hidden": 0.19140625,
"loss/idx": 0.0,
"loss/logits": 0.03833915665745735,
"step": 391
},
{
"epoch": 0.003230671600583499,
"grad_norm": 4.71875,
"grad_norm_var": 4.401887003580729,
"learning_rate": 5e-05,
"loss": 0.3195,
"loss/crossentropy": 2.687175750732422,
"loss/dist_ce": 0.0,
"loss/hidden": 0.251953125,
"loss/idx": 0.0,
"loss/logits": 0.06756812334060669,
"step": 392
},
{
"epoch": 0.0032389131097686607,
"grad_norm": 1.8828125,
"grad_norm_var": 4.736358388264974,
"learning_rate": 5e-05,
"loss": 0.1761,
"loss/crossentropy": 1.6555976867675781,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1435546875,
"loss/idx": 0.0,
"loss/logits": 0.032518401741981506,
"step": 393
},
{
"epoch": 0.003247154618953823,
"grad_norm": 3.640625,
"grad_norm_var": 4.736462148030599,
"learning_rate": 5e-05,
"loss": 0.3446,
"loss/crossentropy": 2.2599704265594482,
"loss/dist_ce": 0.0,
"loss/hidden": 0.279296875,
"loss/idx": 0.0,
"loss/logits": 0.06530951708555222,
"step": 394
},
{
"epoch": 0.003255396128138985,
"grad_norm": 1.8828125,
"grad_norm_var": 3.825056966145833,
"learning_rate": 5e-05,
"loss": 0.204,
"loss/crossentropy": 2.564816951751709,
"loss/dist_ce": 0.0,
"loss/hidden": 0.15625,
"loss/idx": 0.0,
"loss/logits": 0.04773015156388283,
"step": 395
},
{
"epoch": 0.003263637637324147,
"grad_norm": 2.390625,
"grad_norm_var": 3.8267242431640627,
"learning_rate": 5e-05,
"loss": 0.1961,
"loss/crossentropy": 2.249958038330078,
"loss/dist_ce": 0.0,
"loss/hidden": 0.15625,
"loss/idx": 0.0,
"loss/logits": 0.039828941226005554,
"step": 396
},
{
"epoch": 0.0032718791465093087,
"grad_norm": 1.3671875,
"grad_norm_var": 4.103281402587891,
"learning_rate": 5e-05,
"loss": 0.1342,
"loss/crossentropy": 1.058944821357727,
"loss/dist_ce": 0.0,
"loss/hidden": 0.11474609375,
"loss/idx": 0.0,
"loss/logits": 0.019409142434597015,
"step": 397
},
{
"epoch": 0.0032801206556944706,
"grad_norm": 13.375,
"grad_norm_var": 8.815500640869141,
"learning_rate": 5e-05,
"loss": 0.3495,
"loss/crossentropy": 2.6670608520507812,
"loss/dist_ce": 0.0,
"loss/hidden": 0.28125,
"loss/idx": 0.0,
"loss/logits": 0.06825672090053558,
"step": 398
},
{
"epoch": 0.003288362164879633,
"grad_norm": 2.140625,
"grad_norm_var": 8.952433013916016,
"learning_rate": 5e-05,
"loss": 0.2057,
"loss/crossentropy": 2.589582920074463,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1669921875,
"loss/idx": 0.0,
"loss/logits": 0.038732096552848816,
"step": 399
},
{
"epoch": 0.003296603674064795,
"grad_norm": 2.265625,
"grad_norm_var": 9.012959543863932,
"learning_rate": 5e-05,
"loss": 0.134,
"loss/crossentropy": 0.9774411916732788,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1171875,
"loss/idx": 0.0,
"loss/logits": 0.01683815009891987,
"step": 400
},
{
"epoch": 0.0033048451832499567,
"grad_norm": 5.3125,
"grad_norm_var": 9.071028391520182,
"learning_rate": 5e-05,
"loss": 0.3151,
"loss/crossentropy": 2.308528184890747,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2578125,
"loss/idx": 0.0,
"loss/logits": 0.057268112897872925,
"step": 401
},
{
"epoch": 0.0033130866924351186,
"grad_norm": 1.75,
"grad_norm_var": 9.138868967692057,
"learning_rate": 5e-05,
"loss": 0.1162,
"loss/crossentropy": 0.1983821541070938,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1083984375,
"loss/idx": 0.0,
"loss/logits": 0.007823487743735313,
"step": 402
},
{
"epoch": 0.0033213282016202805,
"grad_norm": 3.578125,
"grad_norm_var": 8.964241282145183,
"learning_rate": 5e-05,
"loss": 0.1463,
"loss/crossentropy": 0.282149076461792,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1337890625,
"loss/idx": 0.0,
"loss/logits": 0.012518523260951042,
"step": 403
},
{
"epoch": 0.003329569710805443,
"grad_norm": 5.4375,
"grad_norm_var": 8.52498550415039,
"learning_rate": 5e-05,
"loss": 0.3084,
"loss/crossentropy": 2.364654302597046,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2333984375,
"loss/idx": 0.0,
"loss/logits": 0.07504182308912277,
"step": 404
},
{
"epoch": 0.0033378112199906047,
"grad_norm": 6.40625,
"grad_norm_var": 8.894703928629557,
"learning_rate": 5e-05,
"loss": 0.2316,
"loss/crossentropy": 1.4973769187927246,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1953125,
"loss/idx": 0.0,
"loss/logits": 0.036246173083782196,
"step": 405
},
{
"epoch": 0.0033460527291757666,
"grad_norm": 14.9375,
"grad_norm_var": 16.021522776285806,
"learning_rate": 5e-05,
"loss": 0.3451,
"loss/crossentropy": 2.6580722332000732,
"loss/dist_ce": 0.0,
"loss/hidden": 0.28515625,
"loss/idx": 0.0,
"loss/logits": 0.05991474539041519,
"step": 406
},
{
"epoch": 0.0033542942383609285,
"grad_norm": 1.7109375,
"grad_norm_var": 16.55601298014323,
"learning_rate": 5e-05,
"loss": 0.1524,
"loss/crossentropy": 2.4848172664642334,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1201171875,
"loss/idx": 0.0,
"loss/logits": 0.0323210209608078,
"step": 407
},
{
"epoch": 0.003362535747546091,
"grad_norm": 25.625,
"grad_norm_var": 44.34390360514323,
"learning_rate": 5e-05,
"loss": 0.3535,
"loss/crossentropy": 2.135502338409424,
"loss/dist_ce": 0.0,
"loss/hidden": 0.28515625,
"loss/idx": 0.0,
"loss/logits": 0.06830734014511108,
"step": 408
},
{
"epoch": 0.0033707772567312527,
"grad_norm": 2.15625,
"grad_norm_var": 44.203704579671225,
"learning_rate": 5e-05,
"loss": 0.1807,
"loss/crossentropy": 1.5167546272277832,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1484375,
"loss/idx": 0.0,
"loss/logits": 0.032285355031490326,
"step": 409
},
{
"epoch": 0.0033790187659164146,
"grad_norm": 2.25,
"grad_norm_var": 44.73858820597331,
"learning_rate": 5e-05,
"loss": 0.1845,
"loss/crossentropy": 2.5554723739624023,
"loss/dist_ce": 0.0,
"loss/hidden": 0.146484375,
"loss/idx": 0.0,
"loss/logits": 0.03800758719444275,
"step": 410
},
{
"epoch": 0.0033872602751015765,
"grad_norm": 5.125,
"grad_norm_var": 43.70799051920573,
"learning_rate": 5e-05,
"loss": 0.1819,
"loss/crossentropy": 1.3965002298355103,
"loss/dist_ce": 0.0,
"loss/hidden": 0.162109375,
"loss/idx": 0.0,
"loss/logits": 0.019777944311499596,
"step": 411
},
{
"epoch": 0.0033955017842867384,
"grad_norm": 2.640625,
"grad_norm_var": 43.591942342122394,
"learning_rate": 5e-05,
"loss": 0.2145,
"loss/crossentropy": 1.4451302289962769,
"loss/dist_ce": 0.0,
"loss/hidden": 0.169921875,
"loss/idx": 0.0,
"loss/logits": 0.04454413428902626,
"step": 412
},
{
"epoch": 0.0034037432934719007,
"grad_norm": 2.125,
"grad_norm_var": 43.159234364827476,
"learning_rate": 5e-05,
"loss": 0.1472,
"loss/crossentropy": 0.820690929889679,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1259765625,
"loss/idx": 0.0,
"loss/logits": 0.02119414508342743,
"step": 413
},
{
"epoch": 0.0034119848026570626,
"grad_norm": 2.546875,
"grad_norm_var": 39.915026601155596,
"learning_rate": 5e-05,
"loss": 0.189,
"loss/crossentropy": 2.5425262451171875,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1484375,
"loss/idx": 0.0,
"loss/logits": 0.0405312180519104,
"step": 414
},
{
"epoch": 0.0034202263118422245,
"grad_norm": 1.96875,
"grad_norm_var": 39.991005198160806,
"learning_rate": 5e-05,
"loss": 0.161,
"loss/crossentropy": 1.5188648700714111,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1337890625,
"loss/idx": 0.0,
"loss/logits": 0.027209658175706863,
"step": 415
},
{
"epoch": 0.0034284678210273864,
"grad_norm": 3.40625,
"grad_norm_var": 39.60099461873372,
"learning_rate": 5e-05,
"loss": 0.2137,
"loss/crossentropy": 1.8609509468078613,
"loss/dist_ce": 0.0,
"loss/hidden": 0.173828125,
"loss/idx": 0.0,
"loss/logits": 0.03991951048374176,
"step": 416
},
{
"epoch": 0.0034367093302125483,
"grad_norm": 4.03125,
"grad_norm_var": 39.72469863891602,
"learning_rate": 5e-05,
"loss": 0.3144,
"loss/crossentropy": 2.865185260772705,
"loss/dist_ce": 0.0,
"loss/hidden": 0.244140625,
"loss/idx": 0.0,
"loss/logits": 0.07025311887264252,
"step": 417
},
{
"epoch": 0.0034449508393977106,
"grad_norm": 5.0,
"grad_norm_var": 38.82227350870768,
"learning_rate": 5e-05,
"loss": 0.2031,
"loss/crossentropy": 1.3800697326660156,
"loss/dist_ce": 0.0,
"loss/hidden": 0.17578125,
"loss/idx": 0.0,
"loss/logits": 0.027345672249794006,
"step": 418
},
{
"epoch": 0.0034531923485828725,
"grad_norm": 2.921875,
"grad_norm_var": 39.02252375284831,
"learning_rate": 5e-05,
"loss": 0.1523,
"loss/crossentropy": 0.4079228937625885,
"loss/dist_ce": 0.0,
"loss/hidden": 0.13671875,
"loss/idx": 0.0,
"loss/logits": 0.015567103400826454,
"step": 419
},
{
"epoch": 0.0034614338577680344,
"grad_norm": 4.96875,
"grad_norm_var": 39.04129206339518,
"learning_rate": 5e-05,
"loss": 0.1958,
"loss/crossentropy": 2.5757017135620117,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1640625,
"loss/idx": 0.0,
"loss/logits": 0.031719379127025604,
"step": 420
},
{
"epoch": 0.0034696753669531963,
"grad_norm": 2.15625,
"grad_norm_var": 39.65029271443685,
"learning_rate": 5e-05,
"loss": 0.2189,
"loss/crossentropy": 1.558744192123413,
"loss/dist_ce": 0.0,
"loss/hidden": 0.177734375,
"loss/idx": 0.0,
"loss/logits": 0.04112962260842323,
"step": 421
},
{
"epoch": 0.0034779168761383586,
"grad_norm": 2.4375,
"grad_norm_var": 33.22532526652018,
"learning_rate": 5e-05,
"loss": 0.2209,
"loss/crossentropy": 2.4556710720062256,
"loss/dist_ce": 0.0,
"loss/hidden": 0.171875,
"loss/idx": 0.0,
"loss/logits": 0.04905615746974945,
"step": 422
},
{
"epoch": 0.0034861583853235205,
"grad_norm": 4.03125,
"grad_norm_var": 32.71692606608073,
"learning_rate": 5e-05,
"loss": 0.2253,
"loss/crossentropy": 2.399423360824585,
"loss/dist_ce": 0.0,
"loss/hidden": 0.18359375,
"loss/idx": 0.0,
"loss/logits": 0.04174065962433815,
"step": 423
},
{
"epoch": 0.0034943998945086824,
"grad_norm": 6.53125,
"grad_norm_var": 1.9431711832682292,
"learning_rate": 5e-05,
"loss": 0.3418,
"loss/crossentropy": 1.4518251419067383,
"loss/dist_ce": 0.0,
"loss/hidden": 0.30859375,
"loss/idx": 0.0,
"loss/logits": 0.03323998302221298,
"step": 424
},
{
"epoch": 0.0035026414036938443,
"grad_norm": 2.703125,
"grad_norm_var": 1.87164306640625,
"learning_rate": 5e-05,
"loss": 0.2027,
"loss/crossentropy": 2.5503337383270264,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1611328125,
"loss/idx": 0.0,
"loss/logits": 0.04152850806713104,
"step": 425
},
{
"epoch": 0.0035108829128790062,
"grad_norm": 3.140625,
"grad_norm_var": 1.7813629150390624,
"learning_rate": 5e-05,
"loss": 0.21,
"loss/crossentropy": 2.405348539352417,
"loss/dist_ce": 0.0,
"loss/hidden": 0.169921875,
"loss/idx": 0.0,
"loss/logits": 0.0400310643017292,
"step": 426
},
{
"epoch": 0.0035191244220641685,
"grad_norm": 2.078125,
"grad_norm_var": 1.694677734375,
"learning_rate": 5e-05,
"loss": 0.2251,
"loss/crossentropy": 2.5670955181121826,
"loss/dist_ce": 0.0,
"loss/hidden": 0.173828125,
"loss/idx": 0.0,
"loss/logits": 0.05131090059876442,
"step": 427
},
{
"epoch": 0.0035273659312493304,
"grad_norm": 5.46875,
"grad_norm_var": 1.9485829671223958,
"learning_rate": 5e-05,
"loss": 0.2417,
"loss/crossentropy": 1.348537564277649,
"loss/dist_ce": 0.0,
"loss/hidden": 0.208984375,
"loss/idx": 0.0,
"loss/logits": 0.032746195793151855,
"step": 428
},
{
"epoch": 0.0035356074404344923,
"grad_norm": 2.796875,
"grad_norm_var": 1.8563313802083334,
"learning_rate": 5e-05,
"loss": 0.2383,
"loss/crossentropy": 2.7552454471588135,
"loss/dist_ce": 0.0,
"loss/hidden": 0.189453125,
"loss/idx": 0.0,
"loss/logits": 0.04886690154671669,
"step": 429
},
{
"epoch": 0.0035438489496196542,
"grad_norm": 2.703125,
"grad_norm_var": 1.83775634765625,
"learning_rate": 5e-05,
"loss": 0.3084,
"loss/crossentropy": 2.6097259521484375,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2578125,
"loss/idx": 0.0,
"loss/logits": 0.05057002976536751,
"step": 430
},
{
"epoch": 0.003552090458804816,
"grad_norm": 4.3125,
"grad_norm_var": 1.695849609375,
"learning_rate": 5e-05,
"loss": 0.2062,
"loss/crossentropy": 1.8095245361328125,
"loss/dist_ce": 0.0,
"loss/hidden": 0.169921875,
"loss/idx": 0.0,
"loss/logits": 0.036256495863199234,
"step": 431
},
{
"epoch": 0.0035603319679899785,
"grad_norm": 6.3125,
"grad_norm_var": 2.12232666015625,
"learning_rate": 5e-05,
"loss": 0.2564,
"loss/crossentropy": 2.088921546936035,
"loss/dist_ce": 0.0,
"loss/hidden": 0.201171875,
"loss/idx": 0.0,
"loss/logits": 0.0551944300532341,
"step": 432
},
{
"epoch": 0.0035685734771751403,
"grad_norm": 42.0,
"grad_norm_var": 93.143505859375,
"learning_rate": 5e-05,
"loss": 0.7318,
"loss/crossentropy": 2.4523119926452637,
"loss/dist_ce": 0.0,
"loss/hidden": 0.5859375,
"loss/idx": 0.0,
"loss/logits": 0.14583945274353027,
"step": 433
},
{
"epoch": 0.0035768149863603022,
"grad_norm": 3.09375,
"grad_norm_var": 93.68137613932292,
"learning_rate": 5e-05,
"loss": 0.2037,
"loss/crossentropy": 2.0388007164001465,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1689453125,
"loss/idx": 0.0,
"loss/logits": 0.03475724905729294,
"step": 434
},
{
"epoch": 0.003585056495545464,
"grad_norm": 4.1875,
"grad_norm_var": 93.24458719889323,
"learning_rate": 5e-05,
"loss": 0.2542,
"loss/crossentropy": 2.6730620861053467,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1953125,
"loss/idx": 0.0,
"loss/logits": 0.05885430425405502,
"step": 435
},
{
"epoch": 0.0035932980047306265,
"grad_norm": 3.34375,
"grad_norm_var": 93.6726308186849,
"learning_rate": 5e-05,
"loss": 0.1555,
"loss/crossentropy": 0.4195747375488281,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1416015625,
"loss/idx": 0.0,
"loss/logits": 0.013927996158599854,
"step": 436
},
{
"epoch": 0.0036015395139157884,
"grad_norm": 3.421875,
"grad_norm_var": 93.11043294270833,
"learning_rate": 5e-05,
"loss": 0.254,
"loss/crossentropy": 1.7030478715896606,
"loss/dist_ce": 0.0,
"loss/hidden": 0.208984375,
"loss/idx": 0.0,
"loss/logits": 0.04505797103047371,
"step": 437
},
{
"epoch": 0.0036097810231009502,
"grad_norm": 1.71875,
"grad_norm_var": 93.49947509765624,
"learning_rate": 5e-05,
"loss": 0.1532,
"loss/crossentropy": 0.4963390529155731,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1357421875,
"loss/idx": 0.0,
"loss/logits": 0.017449375241994858,
"step": 438
},
{
"epoch": 0.003618022532286112,
"grad_norm": 3.125,
"grad_norm_var": 93.80262044270833,
"learning_rate": 5e-05,
"loss": 0.2339,
"loss/crossentropy": 2.744438648223877,
"loss/dist_ce": 0.0,
"loss/hidden": 0.181640625,
"loss/idx": 0.0,
"loss/logits": 0.05223686248064041,
"step": 439
},
{
"epoch": 0.003626264041471274,
"grad_norm": 1.1875,
"grad_norm_var": 95.25058186848959,
"learning_rate": 5e-05,
"loss": 0.1237,
"loss/crossentropy": 0.5037131905555725,
"loss/dist_ce": 0.0,
"loss/hidden": 0.109375,
"loss/idx": 0.0,
"loss/logits": 0.014304354786872864,
"step": 440
},
{
"epoch": 0.0036345055506564364,
"grad_norm": 4.5,
"grad_norm_var": 94.72848205566406,
"learning_rate": 5e-05,
"loss": 0.1741,
"loss/crossentropy": 1.277227759361267,
"loss/dist_ce": 0.0,
"loss/hidden": 0.15234375,
"loss/idx": 0.0,
"loss/logits": 0.021763307973742485,
"step": 441
},
{
"epoch": 0.0036427470598415983,
"grad_norm": 8.5625,
"grad_norm_var": 94.61658528645833,
"learning_rate": 5e-05,
"loss": 0.3688,
"loss/crossentropy": 2.51446533203125,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2890625,
"loss/idx": 0.0,
"loss/logits": 0.07975561916828156,
"step": 442
},
{
"epoch": 0.00365098856902676,
"grad_norm": 2.515625,
"grad_norm_var": 94.38951822916667,
"learning_rate": 5e-05,
"loss": 0.2143,
"loss/crossentropy": 2.614020347595215,
"loss/dist_ce": 0.0,
"loss/hidden": 0.169921875,
"loss/idx": 0.0,
"loss/logits": 0.044419120997190475,
"step": 443
},
{
"epoch": 0.003659230078211922,
"grad_norm": 2.203125,
"grad_norm_var": 95.37579650878907,
"learning_rate": 5e-05,
"loss": 0.1516,
"loss/crossentropy": 0.43124791979789734,
"loss/dist_ce": 0.0,
"loss/hidden": 0.134765625,
"loss/idx": 0.0,
"loss/logits": 0.01679244264960289,
"step": 444
},
{
"epoch": 0.0036674715873970844,
"grad_norm": 1.4609375,
"grad_norm_var": 96.05772476196289,
"learning_rate": 5e-05,
"loss": 0.1449,
"loss/crossentropy": 1.3757065534591675,
"loss/dist_ce": 0.0,
"loss/hidden": 0.119140625,
"loss/idx": 0.0,
"loss/logits": 0.02579795941710472,
"step": 445
},
{
"epoch": 0.0036757130965822463,
"grad_norm": 2.859375,
"grad_norm_var": 95.99232559204101,
"learning_rate": 5e-05,
"loss": 0.2612,
"loss/crossentropy": 1.5693522691726685,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2265625,
"loss/idx": 0.0,
"loss/logits": 0.034596264362335205,
"step": 446
},
{
"epoch": 0.003683954605767408,
"grad_norm": 2.25,
"grad_norm_var": 96.70171279907227,
"learning_rate": 5e-05,
"loss": 0.1481,
"loss/crossentropy": 1.425809621810913,
"loss/dist_ce": 0.0,
"loss/hidden": 0.12060546875,
"loss/idx": 0.0,
"loss/logits": 0.02751866541802883,
"step": 447
},
{
"epoch": 0.00369219611495257,
"grad_norm": 4.53125,
"grad_norm_var": 96.77743911743164,
"learning_rate": 5e-05,
"loss": 0.2282,
"loss/crossentropy": 1.8082743883132935,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1875,
"loss/idx": 0.0,
"loss/logits": 0.04072072356939316,
"step": 448
},
{
"epoch": 0.003700437624137732,
"grad_norm": 4.5,
"grad_norm_var": 3.0933570861816406,
"learning_rate": 5e-05,
"loss": 0.284,
"loss/crossentropy": 0.5547680854797363,
"loss/dist_ce": 0.0,
"loss/hidden": 0.25,
"loss/idx": 0.0,
"loss/logits": 0.0339670293033123,
"step": 449
},
{
"epoch": 0.0037086791333228943,
"grad_norm": 4.25,
"grad_norm_var": 3.1387489318847654,
"learning_rate": 5e-05,
"loss": 0.2519,
"loss/crossentropy": 1.6840208768844604,
"loss/dist_ce": 0.0,
"loss/hidden": 0.205078125,
"loss/idx": 0.0,
"loss/logits": 0.04682979732751846,
"step": 450
},
{
"epoch": 0.003716920642508056,
"grad_norm": 3.765625,
"grad_norm_var": 3.1063392639160154,
"learning_rate": 5e-05,
"loss": 0.2045,
"loss/crossentropy": 2.407160520553589,
"loss/dist_ce": 0.0,
"loss/hidden": 0.162109375,
"loss/idx": 0.0,
"loss/logits": 0.04241305589675903,
"step": 451
},
{
"epoch": 0.003725162151693218,
"grad_norm": 5.6875,
"grad_norm_var": 3.4360816955566404,
"learning_rate": 5e-05,
"loss": 0.3723,
"loss/crossentropy": 2.6594908237457275,
"loss/dist_ce": 0.0,
"loss/hidden": 0.279296875,
"loss/idx": 0.0,
"loss/logits": 0.09303879737854004,
"step": 452
},
{
"epoch": 0.00373340366087838,
"grad_norm": 1.6328125,
"grad_norm_var": 3.6628011067708335,
"learning_rate": 5e-05,
"loss": 0.118,
"loss/crossentropy": 0.4520578682422638,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1044921875,
"loss/idx": 0.0,
"loss/logits": 0.01347460225224495,
"step": 453
},
{
"epoch": 0.003741645170063542,
"grad_norm": 2.15625,
"grad_norm_var": 3.5754150390625,
"learning_rate": 5e-05,
"loss": 0.2743,
"loss/crossentropy": 2.884896755218506,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2265625,
"loss/idx": 0.0,
"loss/logits": 0.04774241894483566,
"step": 454
},
{
"epoch": 0.003749886679248704,
"grad_norm": 1.8359375,
"grad_norm_var": 3.7349952697753905,
"learning_rate": 5e-05,
"loss": 0.1811,
"loss/crossentropy": 2.609929323196411,
"loss/dist_ce": 0.0,
"loss/hidden": 0.146484375,
"loss/idx": 0.0,
"loss/logits": 0.034633196890354156,
"step": 455
},
{
"epoch": 0.003758128188433866,
"grad_norm": 10.6875,
"grad_norm_var": 6.612827301025391,
"learning_rate": 5e-05,
"loss": 0.7979,
"loss/crossentropy": 2.925989866256714,
"loss/dist_ce": 0.0,
"loss/hidden": 0.5390625,
"loss/idx": 0.0,
"loss/logits": 0.2587950825691223,
"step": 456
},
{
"epoch": 0.003766369697619028,
"grad_norm": 3.109375,
"grad_norm_var": 6.634012603759766,
"learning_rate": 5e-05,
"loss": 0.2532,
"loss/crossentropy": 2.1211068630218506,
"loss/dist_ce": 0.0,
"loss/hidden": 0.193359375,
"loss/idx": 0.0,
"loss/logits": 0.059833116829395294,
"step": 457
},
{
"epoch": 0.00377461120680419,
"grad_norm": 1.8359375,
"grad_norm_var": 5.25826416015625,
"learning_rate": 5e-05,
"loss": 0.1663,
"loss/crossentropy": 2.170849323272705,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1337890625,
"loss/idx": 0.0,
"loss/logits": 0.03252778202295303,
"step": 458
},
{
"epoch": 0.003782852715989352,
"grad_norm": 1.1640625,
"grad_norm_var": 5.541731516520183,
"learning_rate": 5e-05,
"loss": 0.1449,
"loss/crossentropy": 1.5572426319122314,
"loss/dist_ce": 0.0,
"loss/hidden": 0.123046875,
"loss/idx": 0.0,
"loss/logits": 0.021882327273488045,
"step": 459
},
{
"epoch": 0.003791094225174514,
"grad_norm": 1.96875,
"grad_norm_var": 5.581648508707683,
"learning_rate": 5e-05,
"loss": 0.1801,
"loss/crossentropy": 2.4034504890441895,
"loss/dist_ce": 0.0,
"loss/hidden": 0.142578125,
"loss/idx": 0.0,
"loss/logits": 0.03750108927488327,
"step": 460
},
{
"epoch": 0.003799335734359676,
"grad_norm": 1.46875,
"grad_norm_var": 5.579678344726562,
"learning_rate": 5e-05,
"loss": 0.1353,
"loss/crossentropy": 2.3504481315612793,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1103515625,
"loss/idx": 0.0,
"loss/logits": 0.024939250200986862,
"step": 461
},
{
"epoch": 0.003807577243544838,
"grad_norm": 4.53125,
"grad_norm_var": 5.643570963541666,
"learning_rate": 5e-05,
"loss": 0.1909,
"loss/crossentropy": 0.8316883444786072,
"loss/dist_ce": 0.0,
"loss/hidden": 0.16796875,
"loss/idx": 0.0,
"loss/logits": 0.022978752851486206,
"step": 462
},
{
"epoch": 0.0038158187527299998,
"grad_norm": 3.46875,
"grad_norm_var": 5.539628092447916,
"learning_rate": 5e-05,
"loss": 0.2104,
"loss/crossentropy": 2.7462053298950195,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1640625,
"loss/idx": 0.0,
"loss/logits": 0.046341672539711,
"step": 463
},
{
"epoch": 0.003824060261915162,
"grad_norm": 2.4375,
"grad_norm_var": 5.536083984375,
"learning_rate": 5e-05,
"loss": 0.1901,
"loss/crossentropy": 1.570056676864624,
"loss/dist_ce": 0.0,
"loss/hidden": 0.16015625,
"loss/idx": 0.0,
"loss/logits": 0.02994626574218273,
"step": 464
},
{
"epoch": 0.003832301771100324,
"grad_norm": 5.125,
"grad_norm_var": 5.651643880208334,
"learning_rate": 5e-05,
"loss": 0.2886,
"loss/crossentropy": 3.014599084854126,
"loss/dist_ce": 0.0,
"loss/hidden": 0.234375,
"loss/idx": 0.0,
"loss/logits": 0.05423382669687271,
"step": 465
},
{
"epoch": 0.003840543280285486,
"grad_norm": 4.65625,
"grad_norm_var": 5.705546061197917,
"learning_rate": 5e-05,
"loss": 0.2568,
"loss/crossentropy": 2.5307607650756836,
"loss/dist_ce": 0.0,
"loss/hidden": 0.212890625,
"loss/idx": 0.0,
"loss/logits": 0.043907828629016876,
"step": 466
},
{
"epoch": 0.0038487847894706478,
"grad_norm": 2.65625,
"grad_norm_var": 5.738841756184896,
"learning_rate": 5e-05,
"loss": 0.1794,
"loss/crossentropy": 2.301478624343872,
"loss/dist_ce": 0.0,
"loss/hidden": 0.14453125,
"loss/idx": 0.0,
"loss/logits": 0.034871190786361694,
"step": 467
},
{
"epoch": 0.0038570262986558097,
"grad_norm": 2.3125,
"grad_norm_var": 5.421996053059896,
"learning_rate": 5e-05,
"loss": 0.1572,
"loss/crossentropy": 2.81413197517395,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1220703125,
"loss/idx": 0.0,
"loss/logits": 0.03511942923069,
"step": 468
},
{
"epoch": 0.003865267807840972,
"grad_norm": 3.234375,
"grad_norm_var": 5.24969253540039,
"learning_rate": 5e-05,
"loss": 0.2422,
"loss/crossentropy": 2.552509069442749,
"loss/dist_ce": 0.0,
"loss/hidden": 0.193359375,
"loss/idx": 0.0,
"loss/logits": 0.048864759504795074,
"step": 469
},
{
"epoch": 0.003873509317026134,
"grad_norm": 2.65625,
"grad_norm_var": 5.189699045817057,
"learning_rate": 5e-05,
"loss": 0.2269,
"loss/crossentropy": 1.746779441833496,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1787109375,
"loss/idx": 0.0,
"loss/logits": 0.048161737620830536,
"step": 470
},
{
"epoch": 0.0038817508262112958,
"grad_norm": 2.40625,
"grad_norm_var": 5.097041829427083,
"learning_rate": 5e-05,
"loss": 0.2224,
"loss/crossentropy": 1.6436595916748047,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1806640625,
"loss/idx": 0.0,
"loss/logits": 0.04173795133829117,
"step": 471
},
{
"epoch": 0.0038899923353964577,
"grad_norm": 1.8828125,
"grad_norm_var": 1.336993153889974,
"learning_rate": 5e-05,
"loss": 0.1768,
"loss/crossentropy": 2.6210246086120605,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1357421875,
"loss/idx": 0.0,
"loss/logits": 0.041082605719566345,
"step": 472
},
{
"epoch": 0.00389823384458162,
"grad_norm": 7.125,
"grad_norm_var": 2.5066485087076824,
"learning_rate": 5e-05,
"loss": 0.2539,
"loss/crossentropy": 2.7276291847229004,
"loss/dist_ce": 0.0,
"loss/hidden": 0.203125,
"loss/idx": 0.0,
"loss/logits": 0.050726860761642456,
"step": 473
},
{
"epoch": 0.003906475353766782,
"grad_norm": 2.0,
"grad_norm_var": 2.4815958658854167,
"learning_rate": 5e-05,
"loss": 0.1564,
"loss/crossentropy": 2.6117374897003174,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1220703125,
"loss/idx": 0.0,
"loss/logits": 0.034293532371520996,
"step": 474
},
{
"epoch": 0.003914716862951944,
"grad_norm": 10.3125,
"grad_norm_var": 5.389619700113932,
"learning_rate": 5e-05,
"loss": 0.3135,
"loss/crossentropy": 2.8067679405212402,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2451171875,
"loss/idx": 0.0,
"loss/logits": 0.06842821836471558,
"step": 475
},
{
"epoch": 0.003922958372137106,
"grad_norm": 3.1875,
"grad_norm_var": 5.210853830973307,
"learning_rate": 5e-05,
"loss": 0.237,
"loss/crossentropy": 2.265045642852783,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1875,
"loss/idx": 0.0,
"loss/logits": 0.049473538994789124,
"step": 476
},
{
"epoch": 0.003931199881322268,
"grad_norm": 2.5,
"grad_norm_var": 4.968281809488932,
"learning_rate": 5e-05,
"loss": 0.22,
"loss/crossentropy": 2.7731900215148926,
"loss/dist_ce": 0.0,
"loss/hidden": 0.169921875,
"loss/idx": 0.0,
"loss/logits": 0.05008155107498169,
"step": 477
},
{
"epoch": 0.0039394413905074295,
"grad_norm": 7.75,
"grad_norm_var": 5.937888336181641,
"learning_rate": 5e-05,
"loss": 0.4192,
"loss/crossentropy": 2.859137535095215,
"loss/dist_ce": 0.0,
"loss/hidden": 0.333984375,
"loss/idx": 0.0,
"loss/logits": 0.08519326895475388,
"step": 478
},
{
"epoch": 0.003947682899692591,
"grad_norm": 3.734375,
"grad_norm_var": 5.924122873942057,
"learning_rate": 5e-05,
"loss": 0.2565,
"loss/crossentropy": 1.434816598892212,
"loss/dist_ce": 0.0,
"loss/hidden": 0.224609375,
"loss/idx": 0.0,
"loss/logits": 0.03185056895017624,
"step": 479
},
{
"epoch": 0.003955924408877754,
"grad_norm": 1.90625,
"grad_norm_var": 6.052335357666015,
"learning_rate": 5e-05,
"loss": 0.207,
"loss/crossentropy": 2.1310391426086426,
"loss/dist_ce": 0.0,
"loss/hidden": 0.16015625,
"loss/idx": 0.0,
"loss/logits": 0.04687424749135971,
"step": 480
},
{
"epoch": 0.003964165918062916,
"grad_norm": 3.25,
"grad_norm_var": 5.982144927978515,
"learning_rate": 5e-05,
"loss": 0.2501,
"loss/crossentropy": 2.3727288246154785,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1982421875,
"loss/idx": 0.0,
"loss/logits": 0.05187632888555527,
"step": 481
},
{
"epoch": 0.003972407427248078,
"grad_norm": 1.046875,
"grad_norm_var": 6.40746841430664,
"learning_rate": 5e-05,
"loss": 0.1219,
"loss/crossentropy": 1.4023224115371704,
"loss/dist_ce": 0.0,
"loss/hidden": 0.103515625,
"loss/idx": 0.0,
"loss/logits": 0.018374113366007805,
"step": 482
},
{
"epoch": 0.00398064893643324,
"grad_norm": 4.0625,
"grad_norm_var": 6.349881744384765,
"learning_rate": 5e-05,
"loss": 0.2168,
"loss/crossentropy": 1.5084764957427979,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1669921875,
"loss/idx": 0.0,
"loss/logits": 0.04980340600013733,
"step": 483
},
{
"epoch": 0.003988890445618402,
"grad_norm": 2.09375,
"grad_norm_var": 6.393645985921224,
"learning_rate": 5e-05,
"loss": 0.1647,
"loss/crossentropy": 1.9611366987228394,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1259765625,
"loss/idx": 0.0,
"loss/logits": 0.038717180490493774,
"step": 484
},
{
"epoch": 0.003997131954803564,
"grad_norm": 3.296875,
"grad_norm_var": 6.390036773681641,
"learning_rate": 5e-05,
"loss": 0.2526,
"loss/crossentropy": 2.1912283897399902,
"loss/dist_ce": 0.0,
"loss/hidden": 0.201171875,
"loss/idx": 0.0,
"loss/logits": 0.05140523985028267,
"step": 485
},
{
"epoch": 0.0040053734639887255,
"grad_norm": 2.28125,
"grad_norm_var": 6.45104751586914,
"learning_rate": 5e-05,
"loss": 0.2049,
"loss/crossentropy": 1.1990885734558105,
"loss/dist_ce": 0.0,
"loss/hidden": 0.173828125,
"loss/idx": 0.0,
"loss/logits": 0.03110179677605629,
"step": 486
},
{
"epoch": 0.004013614973173887,
"grad_norm": 2.109375,
"grad_norm_var": 6.506866200764974,
"learning_rate": 5e-05,
"loss": 0.1932,
"loss/crossentropy": 2.469733715057373,
"loss/dist_ce": 0.0,
"loss/hidden": 0.150390625,
"loss/idx": 0.0,
"loss/logits": 0.04281339794397354,
"step": 487
},
{
"epoch": 0.004021856482359049,
"grad_norm": 318.0,
"grad_norm_var": 6177.285184733073,
"learning_rate": 5e-05,
"loss": 1.5086,
"loss/crossentropy": 1.5801646709442139,
"loss/dist_ce": 0.0,
"loss/hidden": 1.390625,
"loss/idx": 0.0,
"loss/logits": 0.1180073618888855,
"step": 488
},
{
"epoch": 0.004030097991544211,
"grad_norm": 11.75,
"grad_norm_var": 6168.57597249349,
"learning_rate": 5e-05,
"loss": 0.4099,
"loss/crossentropy": 2.5404622554779053,
"loss/dist_ce": 0.0,
"loss/hidden": 0.3359375,
"loss/idx": 0.0,
"loss/logits": 0.0739157497882843,
"step": 489
},
{
"epoch": 0.004038339500729374,
"grad_norm": 4.125,
"grad_norm_var": 6162.7084269205725,
"learning_rate": 5e-05,
"loss": 0.2788,
"loss/crossentropy": 2.6295320987701416,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2109375,
"loss/idx": 0.0,
"loss/logits": 0.06782936304807663,
"step": 490
},
{
"epoch": 0.004046581009914536,
"grad_norm": 9.6875,
"grad_norm_var": 6163.8599568684895,
"learning_rate": 5e-05,
"loss": 0.4341,
"loss/crossentropy": 3.0478451251983643,
"loss/dist_ce": 0.0,
"loss/hidden": 0.310546875,
"loss/idx": 0.0,
"loss/logits": 0.12352639436721802,
"step": 491
},
{
"epoch": 0.004054822519099698,
"grad_norm": 5.90625,
"grad_norm_var": 6156.850325520833,
"learning_rate": 5e-05,
"loss": 0.1865,
"loss/crossentropy": 1.3709689378738403,
"loss/dist_ce": 0.0,
"loss/hidden": 0.162109375,
"loss/idx": 0.0,
"loss/logits": 0.02439779043197632,
"step": 492
},
{
"epoch": 0.00406306402828486,
"grad_norm": 1.625,
"grad_norm_var": 6159.402864583333,
"learning_rate": 5e-05,
"loss": 0.1459,
"loss/crossentropy": 1.324977159500122,
"loss/dist_ce": 0.0,
"loss/hidden": 0.12109375,
"loss/idx": 0.0,
"loss/logits": 0.02481193095445633,
"step": 493
},
{
"epoch": 0.0040713055374700215,
"grad_norm": 8.4375,
"grad_norm_var": 6157.950699869792,
"learning_rate": 5e-05,
"loss": 0.2132,
"loss/crossentropy": 1.4893817901611328,
"loss/dist_ce": 0.0,
"loss/hidden": 0.173828125,
"loss/idx": 0.0,
"loss/logits": 0.039393968880176544,
"step": 494
},
{
"epoch": 0.004079547046655183,
"grad_norm": 2.046875,
"grad_norm_var": 6162.678776041666,
"learning_rate": 5e-05,
"loss": 0.132,
"loss/crossentropy": 1.3435920476913452,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1142578125,
"loss/idx": 0.0,
"loss/logits": 0.017779778689146042,
"step": 495
},
{
"epoch": 0.004087788555840345,
"grad_norm": 3.65625,
"grad_norm_var": 6157.749609375,
"learning_rate": 5e-05,
"loss": 0.2431,
"loss/crossentropy": 2.082836151123047,
"loss/dist_ce": 0.0,
"loss/hidden": 0.185546875,
"loss/idx": 0.0,
"loss/logits": 0.057581763714551926,
"step": 496
},
{
"epoch": 0.004096030065025507,
"grad_norm": 6.34375,
"grad_norm_var": 6149.804553222656,
"learning_rate": 5e-05,
"loss": 0.2089,
"loss/crossentropy": 1.231292486190796,
"loss/dist_ce": 0.0,
"loss/hidden": 0.181640625,
"loss/idx": 0.0,
"loss/logits": 0.02723127231001854,
"step": 497
},
{
"epoch": 0.004104271574210669,
"grad_norm": 3.296875,
"grad_norm_var": 6143.188732910156,
"learning_rate": 5e-05,
"loss": 0.2633,
"loss/crossentropy": 2.7151858806610107,
"loss/dist_ce": 0.0,
"loss/hidden": 0.197265625,
"loss/idx": 0.0,
"loss/logits": 0.06607217341661453,
"step": 498
},
{
"epoch": 0.004112513083395832,
"grad_norm": 2.515625,
"grad_norm_var": 6147.511221313476,
"learning_rate": 5e-05,
"loss": 0.1795,
"loss/crossentropy": 2.5187623500823975,
"loss/dist_ce": 0.0,
"loss/hidden": 0.138671875,
"loss/idx": 0.0,
"loss/logits": 0.040810734033584595,
"step": 499
},
{
"epoch": 0.004120754592580994,
"grad_norm": 3.640625,
"grad_norm_var": 6143.101721191406,
"learning_rate": 5e-05,
"loss": 0.3104,
"loss/crossentropy": 2.562577962875366,
"loss/dist_ce": 0.0,
"loss/hidden": 0.23046875,
"loss/idx": 0.0,
"loss/logits": 0.07988132536411285,
"step": 500
},
{
"epoch": 0.004128996101766156,
"grad_norm": 2.46875,
"grad_norm_var": 6145.463117472331,
"learning_rate": 5e-05,
"loss": 0.1763,
"loss/crossentropy": 2.076406478881836,
"loss/dist_ce": 0.0,
"loss/hidden": 0.134765625,
"loss/idx": 0.0,
"loss/logits": 0.04152751341462135,
"step": 501
},
{
"epoch": 0.0041372376109513175,
"grad_norm": 2.15625,
"grad_norm_var": 6145.830125935872,
"learning_rate": 5e-05,
"loss": 0.1825,
"loss/crossentropy": 1.5978915691375732,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1474609375,
"loss/idx": 0.0,
"loss/logits": 0.03506774455308914,
"step": 502
},
{
"epoch": 0.004145479120136479,
"grad_norm": 6.5,
"grad_norm_var": 6134.082059733073,
"learning_rate": 5e-05,
"loss": 0.343,
"loss/crossentropy": 2.7203476428985596,
"loss/dist_ce": 0.0,
"loss/hidden": 0.25390625,
"loss/idx": 0.0,
"loss/logits": 0.0890902578830719,
"step": 503
},
{
"epoch": 0.004153720629321641,
"grad_norm": 2.46875,
"grad_norm_var": 9.201432291666666,
"learning_rate": 5e-05,
"loss": 0.197,
"loss/crossentropy": 1.9945552349090576,
"loss/dist_ce": 0.0,
"loss/hidden": 0.15625,
"loss/idx": 0.0,
"loss/logits": 0.040720634162425995,
"step": 504
},
{
"epoch": 0.004161962138506803,
"grad_norm": 2.71875,
"grad_norm_var": 5.917020670572916,
"learning_rate": 5e-05,
"loss": 0.1455,
"loss/crossentropy": 0.7928286790847778,
"loss/dist_ce": 0.0,
"loss/hidden": 0.134765625,
"loss/idx": 0.0,
"loss/logits": 0.010724226012825966,
"step": 505
},
{
"epoch": 0.004170203647691965,
"grad_norm": 1.5234375,
"grad_norm_var": 6.374580637613932,
"learning_rate": 5e-05,
"loss": 0.1175,
"loss/crossentropy": 1.454852819442749,
"loss/dist_ce": 0.0,
"loss/hidden": 0.0986328125,
"loss/idx": 0.0,
"loss/logits": 0.018820609897375107,
"step": 506
},
{
"epoch": 0.004178445156877127,
"grad_norm": 5.0,
"grad_norm_var": 4.231941477457682,
"learning_rate": 5e-05,
"loss": 0.2223,
"loss/crossentropy": 1.54378080368042,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1865234375,
"loss/idx": 0.0,
"loss/logits": 0.03581738844513893,
"step": 507
},
{
"epoch": 0.00418668666606229,
"grad_norm": 2.28125,
"grad_norm_var": 4.020247141520183,
"learning_rate": 5e-05,
"loss": 0.1535,
"loss/crossentropy": 2.197096109390259,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1259765625,
"loss/idx": 0.0,
"loss/logits": 0.02750355750322342,
"step": 508
},
{
"epoch": 0.004194928175247452,
"grad_norm": 3.890625,
"grad_norm_var": 3.761824289957682,
"learning_rate": 5e-05,
"loss": 0.2321,
"loss/crossentropy": 2.152005195617676,
"loss/dist_ce": 0.0,
"loss/hidden": 0.19140625,
"loss/idx": 0.0,
"loss/logits": 0.040720127522945404,
"step": 509
},
{
"epoch": 0.0042031696844326135,
"grad_norm": 2.625,
"grad_norm_var": 2.189497629801432,
"learning_rate": 5e-05,
"loss": 0.1794,
"loss/crossentropy": 1.419573187828064,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1455078125,
"loss/idx": 0.0,
"loss/logits": 0.03389629349112511,
"step": 510
},
{
"epoch": 0.004211411193617775,
"grad_norm": 2.59375,
"grad_norm_var": 2.1152992248535156,
"learning_rate": 5e-05,
"loss": 0.1537,
"loss/crossentropy": 1.2792004346847534,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1328125,
"loss/idx": 0.0,
"loss/logits": 0.020872898399829865,
"step": 511
},
{
"epoch": 0.004219652702802937,
"grad_norm": 1.8984375,
"grad_norm_var": 2.2378082275390625,
"learning_rate": 5e-05,
"loss": 0.1677,
"loss/crossentropy": 2.1597256660461426,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1328125,
"loss/idx": 0.0,
"loss/logits": 0.03487637639045715,
"step": 512
},
{
"epoch": 0.004227894211988099,
"grad_norm": 3.3125,
"grad_norm_var": 1.5597239176432292,
"learning_rate": 5e-05,
"loss": 0.2512,
"loss/crossentropy": 1.6841063499450684,
"loss/dist_ce": 0.0,
"loss/hidden": 0.19921875,
"loss/idx": 0.0,
"loss/logits": 0.05194816365838051,
"step": 513
},
{
"epoch": 0.004236135721173261,
"grad_norm": 4.78125,
"grad_norm_var": 1.7451741536458334,
"learning_rate": 5e-05,
"loss": 0.2239,
"loss/crossentropy": 2.816648006439209,
"loss/dist_ce": 0.0,
"loss/hidden": 0.171875,
"loss/idx": 0.0,
"loss/logits": 0.051988691091537476,
"step": 514
},
{
"epoch": 0.004244377230358423,
"grad_norm": 11.6875,
"grad_norm_var": 6.229002888997396,
"learning_rate": 5e-05,
"loss": 0.3317,
"loss/crossentropy": 2.0484461784362793,
"loss/dist_ce": 0.0,
"loss/hidden": 0.275390625,
"loss/idx": 0.0,
"loss/logits": 0.05626022815704346,
"step": 515
},
{
"epoch": 0.004252618739543585,
"grad_norm": 3.546875,
"grad_norm_var": 6.2305653889973955,
"learning_rate": 5e-05,
"loss": 0.1705,
"loss/crossentropy": 1.2704802751541138,
"loss/dist_ce": 0.0,
"loss/hidden": 0.140625,
"loss/idx": 0.0,
"loss/logits": 0.029856139793992043,
"step": 516
},
{
"epoch": 0.004260860248728747,
"grad_norm": 2.859375,
"grad_norm_var": 6.175150553385417,
"learning_rate": 5e-05,
"loss": 0.2187,
"loss/crossentropy": 1.2983540296554565,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1826171875,
"loss/idx": 0.0,
"loss/logits": 0.03611702471971512,
"step": 517
},
{
"epoch": 0.0042691017579139096,
"grad_norm": 2.390625,
"grad_norm_var": 6.1290842692057295,
"learning_rate": 5e-05,
"loss": 0.1968,
"loss/crossentropy": 2.924328088760376,
"loss/dist_ce": 0.0,
"loss/hidden": 0.150390625,
"loss/idx": 0.0,
"loss/logits": 0.04644050449132919,
"step": 518
},
{
"epoch": 0.0042773432670990714,
"grad_norm": 2.453125,
"grad_norm_var": 5.671439615885417,
"learning_rate": 5e-05,
"loss": 0.2026,
"loss/crossentropy": 2.4892868995666504,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1611328125,
"loss/idx": 0.0,
"loss/logits": 0.0414496548473835,
"step": 519
},
{
"epoch": 0.004285584776284233,
"grad_norm": 2.453125,
"grad_norm_var": 5.673607381184896,
"learning_rate": 5e-05,
"loss": 0.243,
"loss/crossentropy": 0.931300163269043,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1796875,
"loss/idx": 0.0,
"loss/logits": 0.06334017217159271,
"step": 520
},
{
"epoch": 0.004293826285469395,
"grad_norm": 1.5390625,
"grad_norm_var": 5.883624013264974,
"learning_rate": 5e-05,
"loss": 0.1502,
"loss/crossentropy": 1.3612920045852661,
"loss/dist_ce": 0.0,
"loss/hidden": 0.12109375,
"loss/idx": 0.0,
"loss/logits": 0.02915302664041519,
"step": 521
},
{
"epoch": 0.004302067794654557,
"grad_norm": 1.4609375,
"grad_norm_var": 5.899733225504558,
"learning_rate": 5e-05,
"loss": 0.1209,
"loss/crossentropy": 0.7956821918487549,
"loss/dist_ce": 0.0,
"loss/hidden": 0.10498046875,
"loss/idx": 0.0,
"loss/logits": 0.01589544117450714,
"step": 522
},
{
"epoch": 0.004310309303839719,
"grad_norm": 8.9375,
"grad_norm_var": 7.696473948160807,
"learning_rate": 5e-05,
"loss": 0.466,
"loss/crossentropy": 2.8766582012176514,
"loss/dist_ce": 0.0,
"loss/hidden": 0.40234375,
"loss/idx": 0.0,
"loss/logits": 0.06366438418626785,
"step": 523
},
{
"epoch": 0.004318550813024881,
"grad_norm": 10.75,
"grad_norm_var": 10.611466217041016,
"learning_rate": 5e-05,
"loss": 0.2406,
"loss/crossentropy": 1.433667540550232,
"loss/dist_ce": 0.0,
"loss/hidden": 0.201171875,
"loss/idx": 0.0,
"loss/logits": 0.03946308791637421,
"step": 524
},
{
"epoch": 0.004326792322210043,
"grad_norm": 1.8828125,
"grad_norm_var": 10.94590555826823,
"learning_rate": 5e-05,
"loss": 0.1674,
"loss/crossentropy": 0.8868244290351868,
"loss/dist_ce": 0.0,
"loss/hidden": 0.134765625,
"loss/idx": 0.0,
"loss/logits": 0.03266747295856476,
"step": 525
},
{
"epoch": 0.004335033831395205,
"grad_norm": 10.9375,
"grad_norm_var": 13.659373982747395,
"learning_rate": 5e-05,
"loss": 0.2533,
"loss/crossentropy": 1.5202522277832031,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2236328125,
"loss/idx": 0.0,
"loss/logits": 0.02963770553469658,
"step": 526
},
{
"epoch": 0.0043432753405803675,
"grad_norm": 2.5,
"grad_norm_var": 13.684911092122396,
"learning_rate": 5e-05,
"loss": 0.1905,
"loss/crossentropy": 2.2358787059783936,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1494140625,
"loss/idx": 0.0,
"loss/logits": 0.041103295981884,
"step": 527
},
{
"epoch": 0.004351516849765529,
"grad_norm": 5.65625,
"grad_norm_var": 13.220444488525391,
"learning_rate": 5e-05,
"loss": 0.2986,
"loss/crossentropy": 2.475597381591797,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2333984375,
"loss/idx": 0.0,
"loss/logits": 0.06522452086210251,
"step": 528
},
{
"epoch": 0.004359758358950691,
"grad_norm": 6.125,
"grad_norm_var": 13.148850250244141,
"learning_rate": 5e-05,
"loss": 0.2751,
"loss/crossentropy": 2.1268441677093506,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2275390625,
"loss/idx": 0.0,
"loss/logits": 0.047553326934576035,
"step": 529
},
{
"epoch": 0.004367999868135853,
"grad_norm": 4.1875,
"grad_norm_var": 13.188008371988932,
"learning_rate": 5e-05,
"loss": 0.2687,
"loss/crossentropy": 1.8196269273757935,
"loss/dist_ce": 0.0,
"loss/hidden": 0.22265625,
"loss/idx": 0.0,
"loss/logits": 0.04601012170314789,
"step": 530
},
{
"epoch": 0.004376241377321015,
"grad_norm": 5.96875,
"grad_norm_var": 10.102638498942058,
"learning_rate": 5e-05,
"loss": 0.4053,
"loss/crossentropy": 2.056028127670288,
"loss/dist_ce": 0.0,
"loss/hidden": 0.291015625,
"loss/idx": 0.0,
"loss/logits": 0.11430098116397858,
"step": 531
},
{
"epoch": 0.004384482886506177,
"grad_norm": 2.828125,
"grad_norm_var": 10.236140696207682,
"learning_rate": 5e-05,
"loss": 0.1589,
"loss/crossentropy": 1.7495375871658325,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1298828125,
"loss/idx": 0.0,
"loss/logits": 0.02899114228785038,
"step": 532
},
{
"epoch": 0.004392724395691339,
"grad_norm": 2.65625,
"grad_norm_var": 10.28472671508789,
"learning_rate": 5e-05,
"loss": 0.177,
"loss/crossentropy": 1.6073510646820068,
"loss/dist_ce": 0.0,
"loss/hidden": 0.142578125,
"loss/idx": 0.0,
"loss/logits": 0.034396156668663025,
"step": 533
},
{
"epoch": 0.004400965904876501,
"grad_norm": 2.484375,
"grad_norm_var": 10.258341217041016,
"learning_rate": 5e-05,
"loss": 0.1938,
"loss/crossentropy": 1.503987193107605,
"loss/dist_ce": 0.0,
"loss/hidden": 0.154296875,
"loss/idx": 0.0,
"loss/logits": 0.03948179632425308,
"step": 534
},
{
"epoch": 0.004409207414061663,
"grad_norm": 1.140625,
"grad_norm_var": 10.733182525634765,
"learning_rate": 5e-05,
"loss": 0.1132,
"loss/crossentropy": 1.3690646886825562,
"loss/dist_ce": 0.0,
"loss/hidden": 0.09375,
"loss/idx": 0.0,
"loss/logits": 0.019410330802202225,
"step": 535
},
{
"epoch": 0.004417448923246825,
"grad_norm": 1.3671875,
"grad_norm_var": 11.098802693684895,
"learning_rate": 5e-05,
"loss": 0.1406,
"loss/crossentropy": 1.3874551057815552,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1171875,
"loss/idx": 0.0,
"loss/logits": 0.023373104631900787,
"step": 536
},
{
"epoch": 0.004425690432431987,
"grad_norm": 2.078125,
"grad_norm_var": 10.911236317952474,
"learning_rate": 5e-05,
"loss": 0.1453,
"loss/crossentropy": 1.0550464391708374,
"loss/dist_ce": 0.0,
"loss/hidden": 0.12353515625,
"loss/idx": 0.0,
"loss/logits": 0.021797355264425278,
"step": 537
},
{
"epoch": 0.004433931941617149,
"grad_norm": 4.65625,
"grad_norm_var": 10.28226318359375,
"learning_rate": 5e-05,
"loss": 0.229,
"loss/crossentropy": 2.7873692512512207,
"loss/dist_ce": 0.0,
"loss/hidden": 0.177734375,
"loss/idx": 0.0,
"loss/logits": 0.05122203379869461,
"step": 538
},
{
"epoch": 0.004442173450802311,
"grad_norm": 4.40625,
"grad_norm_var": 8.965958658854166,
"learning_rate": 5e-05,
"loss": 0.3081,
"loss/crossentropy": 2.410203695297241,
"loss/dist_ce": 0.0,
"loss/hidden": 0.25,
"loss/idx": 0.0,
"loss/logits": 0.058051109313964844,
"step": 539
},
{
"epoch": 0.004450414959987473,
"grad_norm": 2.671875,
"grad_norm_var": 6.152814737955729,
"learning_rate": 5e-05,
"loss": 0.1837,
"loss/crossentropy": 1.0460572242736816,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1591796875,
"loss/idx": 0.0,
"loss/logits": 0.024545256048440933,
"step": 540
},
{
"epoch": 0.004458656469172635,
"grad_norm": 2.25,
"grad_norm_var": 6.065093739827474,
"learning_rate": 5e-05,
"loss": 0.1368,
"loss/crossentropy": 0.29145750403404236,
"loss/dist_ce": 0.0,
"loss/hidden": 0.126953125,
"loss/idx": 0.0,
"loss/logits": 0.00982777401804924,
"step": 541
},
{
"epoch": 0.004466897978357797,
"grad_norm": 3.53125,
"grad_norm_var": 2.513854726155599,
"learning_rate": 5e-05,
"loss": 0.1929,
"loss/crossentropy": 1.3329319953918457,
"loss/dist_ce": 0.0,
"loss/hidden": 0.162109375,
"loss/idx": 0.0,
"loss/logits": 0.030750418081879616,
"step": 542
},
{
"epoch": 0.004475139487542959,
"grad_norm": 3.265625,
"grad_norm_var": 2.457928212483724,
"learning_rate": 5e-05,
"loss": 0.2152,
"loss/crossentropy": 2.0753843784332275,
"loss/dist_ce": 0.0,
"loss/hidden": 0.169921875,
"loss/idx": 0.0,
"loss/logits": 0.045312896370887756,
"step": 543
},
{
"epoch": 0.0044833809967281205,
"grad_norm": 3.1875,
"grad_norm_var": 2.114135487874349,
"learning_rate": 5e-05,
"loss": 0.1793,
"loss/crossentropy": 1.0670427083969116,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1455078125,
"loss/idx": 0.0,
"loss/logits": 0.033770665526390076,
"step": 544
},
{
"epoch": 0.004491622505913282,
"grad_norm": 1.6328125,
"grad_norm_var": 1.6834879557291667,
"learning_rate": 5e-05,
"loss": 0.1622,
"loss/crossentropy": 1.4867587089538574,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1337890625,
"loss/idx": 0.0,
"loss/logits": 0.02845672518014908,
"step": 545
},
{
"epoch": 0.004499864015098445,
"grad_norm": 2.859375,
"grad_norm_var": 1.5869049072265624,
"learning_rate": 5e-05,
"loss": 0.1723,
"loss/crossentropy": 1.544826865196228,
"loss/dist_ce": 0.0,
"loss/hidden": 0.146484375,
"loss/idx": 0.0,
"loss/logits": 0.025784984230995178,
"step": 546
},
{
"epoch": 0.004508105524283607,
"grad_norm": 2.1875,
"grad_norm_var": 0.9517730712890625,
"learning_rate": 5e-05,
"loss": 0.2222,
"loss/crossentropy": 2.5712733268737793,
"loss/dist_ce": 0.0,
"loss/hidden": 0.16796875,
"loss/idx": 0.0,
"loss/logits": 0.05422845110297203,
"step": 547
},
{
"epoch": 0.004516347033468769,
"grad_norm": 1.6875,
"grad_norm_var": 1.0136311848958333,
"learning_rate": 5e-05,
"loss": 0.1554,
"loss/crossentropy": 1.6466069221496582,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1279296875,
"loss/idx": 0.0,
"loss/logits": 0.027493983507156372,
"step": 548
},
{
"epoch": 0.004524588542653931,
"grad_norm": 1.5546875,
"grad_norm_var": 1.085455067952474,
"learning_rate": 5e-05,
"loss": 0.1578,
"loss/crossentropy": 2.487321615219116,
"loss/dist_ce": 0.0,
"loss/hidden": 0.12451171875,
"loss/idx": 0.0,
"loss/logits": 0.03331330791115761,
"step": 549
},
{
"epoch": 0.004532830051839093,
"grad_norm": 2.6875,
"grad_norm_var": 1.0859840393066407,
"learning_rate": 5e-05,
"loss": 0.2285,
"loss/crossentropy": 2.7870802879333496,
"loss/dist_ce": 0.0,
"loss/hidden": 0.173828125,
"loss/idx": 0.0,
"loss/logits": 0.054664455354213715,
"step": 550
},
{
"epoch": 0.004541071561024255,
"grad_norm": 1.6875,
"grad_norm_var": 1.000249989827474,
"learning_rate": 5e-05,
"loss": 0.1342,
"loss/crossentropy": 1.667282223701477,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1123046875,
"loss/idx": 0.0,
"loss/logits": 0.02186622843146324,
"step": 551
},
{
"epoch": 0.0045493130702094165,
"grad_norm": 1.265625,
"grad_norm_var": 1.0176829020182292,
"learning_rate": 5e-05,
"loss": 0.1424,
"loss/crossentropy": 2.4660463333129883,
"loss/dist_ce": 0.0,
"loss/hidden": 0.11328125,
"loss/idx": 0.0,
"loss/logits": 0.029071927070617676,
"step": 552
},
{
"epoch": 0.0045575545793945784,
"grad_norm": 1.4609375,
"grad_norm_var": 1.084484608968099,
"learning_rate": 5e-05,
"loss": 0.1525,
"loss/crossentropy": 2.228982925415039,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1220703125,
"loss/idx": 0.0,
"loss/logits": 0.030380506068468094,
"step": 553
},
{
"epoch": 0.00456579608857974,
"grad_norm": 1.4296875,
"grad_norm_var": 0.8341949462890625,
"learning_rate": 5e-05,
"loss": 0.1534,
"loss/crossentropy": 2.0212230682373047,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1171875,
"loss/idx": 0.0,
"loss/logits": 0.0362619124352932,
"step": 554
},
{
"epoch": 0.004574037597764903,
"grad_norm": 2.890625,
"grad_norm_var": 0.5643229166666667,
"learning_rate": 5e-05,
"loss": 0.171,
"loss/crossentropy": 1.549071192741394,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1455078125,
"loss/idx": 0.0,
"loss/logits": 0.02545047551393509,
"step": 555
},
{
"epoch": 0.004582279106950065,
"grad_norm": 4.3125,
"grad_norm_var": 0.8214182535807292,
"learning_rate": 5e-05,
"loss": 0.3118,
"loss/crossentropy": 2.652635097503662,
"loss/dist_ce": 0.0,
"loss/hidden": 0.25390625,
"loss/idx": 0.0,
"loss/logits": 0.057920753955841064,
"step": 556
},
{
"epoch": 0.004590520616135227,
"grad_norm": 1.140625,
"grad_norm_var": 0.9158162434895833,
"learning_rate": 5e-05,
"loss": 0.1287,
"loss/crossentropy": 1.6453478336334229,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1064453125,
"loss/idx": 0.0,
"loss/logits": 0.022242678329348564,
"step": 557
},
{
"epoch": 0.004598762125320389,
"grad_norm": 3.265625,
"grad_norm_var": 0.8765777587890625,
"learning_rate": 5e-05,
"loss": 0.2774,
"loss/crossentropy": 1.622141718864441,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2373046875,
"loss/idx": 0.0,
"loss/logits": 0.04007745534181595,
"step": 558
},
{
"epoch": 0.004607003634505551,
"grad_norm": 1.859375,
"grad_norm_var": 0.8157867431640625,
"learning_rate": 5e-05,
"loss": 0.1513,
"loss/crossentropy": 1.737197995185852,
"loss/dist_ce": 0.0,
"loss/hidden": 0.123046875,
"loss/idx": 0.0,
"loss/logits": 0.0283003281801939,
"step": 559
},
{
"epoch": 0.0046152451436907126,
"grad_norm": 2.328125,
"grad_norm_var": 0.74814453125,
"learning_rate": 5e-05,
"loss": 0.1046,
"loss/crossentropy": 0.2462574690580368,
"loss/dist_ce": 0.0,
"loss/hidden": 0.0966796875,
"loss/idx": 0.0,
"loss/logits": 0.00796109065413475,
"step": 560
},
{
"epoch": 0.0046234866528758745,
"grad_norm": 3.421875,
"grad_norm_var": 0.8270566304524739,
"learning_rate": 5e-05,
"loss": 0.2059,
"loss/crossentropy": 2.6141371726989746,
"loss/dist_ce": 0.0,
"loss/hidden": 0.162109375,
"loss/idx": 0.0,
"loss/logits": 0.043790802359580994,
"step": 561
},
{
"epoch": 0.004631728162061036,
"grad_norm": 3.421875,
"grad_norm_var": 0.892352040608724,
"learning_rate": 5e-05,
"loss": 0.1974,
"loss/crossentropy": 1.422098994255066,
"loss/dist_ce": 0.0,
"loss/hidden": 0.15625,
"loss/idx": 0.0,
"loss/logits": 0.0411381721496582,
"step": 562
},
{
"epoch": 0.004639969671246198,
"grad_norm": 6.125,
"grad_norm_var": 1.808794911702474,
"learning_rate": 5e-05,
"loss": 0.3067,
"loss/crossentropy": 2.6955533027648926,
"loss/dist_ce": 0.0,
"loss/hidden": 0.232421875,
"loss/idx": 0.0,
"loss/logits": 0.07427150756120682,
"step": 563
},
{
"epoch": 0.004648211180431361,
"grad_norm": 3.28125,
"grad_norm_var": 1.787731679280599,
"learning_rate": 5e-05,
"loss": 0.2236,
"loss/crossentropy": 2.841552972793579,
"loss/dist_ce": 0.0,
"loss/hidden": 0.181640625,
"loss/idx": 0.0,
"loss/logits": 0.041970379650592804,
"step": 564
},
{
"epoch": 0.004656452689616523,
"grad_norm": 3.453125,
"grad_norm_var": 1.7399617513020833,
"learning_rate": 5e-05,
"loss": 0.1892,
"loss/crossentropy": 2.6695666313171387,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1494140625,
"loss/idx": 0.0,
"loss/logits": 0.039814580231904984,
"step": 565
},
{
"epoch": 0.004664694198801685,
"grad_norm": 23.0,
"grad_norm_var": 27.352754720052083,
"learning_rate": 5e-05,
"loss": 0.3797,
"loss/crossentropy": 2.7561914920806885,
"loss/dist_ce": 0.0,
"loss/hidden": 0.306640625,
"loss/idx": 0.0,
"loss/logits": 0.0730535015463829,
"step": 566
},
{
"epoch": 0.004672935707986847,
"grad_norm": 4.125,
"grad_norm_var": 26.965547688802083,
"learning_rate": 5e-05,
"loss": 0.2569,
"loss/crossentropy": 1.6983141899108887,
"loss/dist_ce": 0.0,
"loss/hidden": 0.212890625,
"loss/idx": 0.0,
"loss/logits": 0.04405267536640167,
"step": 567
},
{
"epoch": 0.004681177217172009,
"grad_norm": 3.28125,
"grad_norm_var": 26.437889607747397,
"learning_rate": 5e-05,
"loss": 0.1744,
"loss/crossentropy": 1.50763738155365,
"loss/dist_ce": 0.0,
"loss/hidden": 0.14453125,
"loss/idx": 0.0,
"loss/logits": 0.029890574514865875,
"step": 568
},
{
"epoch": 0.0046894187263571705,
"grad_norm": 4.1875,
"grad_norm_var": 25.870477040608723,
"learning_rate": 5e-05,
"loss": 0.2786,
"loss/crossentropy": 2.469428300857544,
"loss/dist_ce": 0.0,
"loss/hidden": 0.216796875,
"loss/idx": 0.0,
"loss/logits": 0.06181073188781738,
"step": 569
},
{
"epoch": 0.004697660235542332,
"grad_norm": 1.4140625,
"grad_norm_var": 25.87682673136393,
"learning_rate": 5e-05,
"loss": 0.1614,
"loss/crossentropy": 1.521830439567566,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1328125,
"loss/idx": 0.0,
"loss/logits": 0.02856556512415409,
"step": 570
},
{
"epoch": 0.004705901744727494,
"grad_norm": 3.109375,
"grad_norm_var": 25.83377456665039,
"learning_rate": 5e-05,
"loss": 0.1871,
"loss/crossentropy": 2.5410780906677246,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1484375,
"loss/idx": 0.0,
"loss/logits": 0.038688138127326965,
"step": 571
},
{
"epoch": 0.004714143253912656,
"grad_norm": 20.25,
"grad_norm_var": 41.34689712524414,
"learning_rate": 5e-05,
"loss": 0.3154,
"loss/crossentropy": 0.9852694272994995,
"loss/dist_ce": 0.0,
"loss/hidden": 0.27734375,
"loss/idx": 0.0,
"loss/logits": 0.0380379781126976,
"step": 572
},
{
"epoch": 0.004722384763097818,
"grad_norm": 6.46875,
"grad_norm_var": 40.0391476949056,
"learning_rate": 5e-05,
"loss": 0.3102,
"loss/crossentropy": 2.0467264652252197,
"loss/dist_ce": 0.0,
"loss/hidden": 0.25,
"loss/idx": 0.0,
"loss/logits": 0.06023257598280907,
"step": 573
},
{
"epoch": 0.004730626272282981,
"grad_norm": 4.40625,
"grad_norm_var": 39.73319880167643,
"learning_rate": 5e-05,
"loss": 0.2448,
"loss/crossentropy": 2.768284320831299,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1875,
"loss/idx": 0.0,
"loss/logits": 0.057292141020298004,
"step": 574
},
{
"epoch": 0.004738867781468143,
"grad_norm": 3.25,
"grad_norm_var": 39.10796076456706,
"learning_rate": 5e-05,
"loss": 0.1371,
"loss/crossentropy": 0.4907649755477905,
"loss/dist_ce": 0.0,
"loss/hidden": 0.125,
"loss/idx": 0.0,
"loss/logits": 0.01206925604492426,
"step": 575
},
{
"epoch": 0.004747109290653305,
"grad_norm": 1.1328125,
"grad_norm_var": 39.77771708170573,
"learning_rate": 5e-05,
"loss": 0.1305,
"loss/crossentropy": 1.4613217115402222,
"loss/dist_ce": 0.0,
"loss/hidden": 0.109375,
"loss/idx": 0.0,
"loss/logits": 0.021137792617082596,
"step": 576
},
{
"epoch": 0.0047553507998384665,
"grad_norm": 1.4453125,
"grad_norm_var": 40.673797353108725,
"learning_rate": 5e-05,
"loss": 0.1593,
"loss/crossentropy": 2.2154600620269775,
"loss/dist_ce": 0.0,
"loss/hidden": 0.12890625,
"loss/idx": 0.0,
"loss/logits": 0.030367335304617882,
"step": 577
},
{
"epoch": 0.004763592309023628,
"grad_norm": 2.40625,
"grad_norm_var": 41.05650812784831,
"learning_rate": 5e-05,
"loss": 0.1951,
"loss/crossentropy": 2.493523120880127,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1513671875,
"loss/idx": 0.0,
"loss/logits": 0.04371439293026924,
"step": 578
},
{
"epoch": 0.00477183381820879,
"grad_norm": 2.28125,
"grad_norm_var": 41.76645075480143,
"learning_rate": 5e-05,
"loss": 0.1766,
"loss/crossentropy": 2.818694829940796,
"loss/dist_ce": 0.0,
"loss/hidden": 0.134765625,
"loss/idx": 0.0,
"loss/logits": 0.04187324270606041,
"step": 579
},
{
"epoch": 0.004780075327393952,
"grad_norm": 2.5625,
"grad_norm_var": 42.00832697550456,
"learning_rate": 5e-05,
"loss": 0.213,
"loss/crossentropy": 3.1531453132629395,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1630859375,
"loss/idx": 0.0,
"loss/logits": 0.049924593418836594,
"step": 580
},
{
"epoch": 0.004788316836579114,
"grad_norm": 2.359375,
"grad_norm_var": 42.370418039957684,
"learning_rate": 5e-05,
"loss": 0.1859,
"loss/crossentropy": 2.076953172683716,
"loss/dist_ce": 0.0,
"loss/hidden": 0.14453125,
"loss/idx": 0.0,
"loss/logits": 0.041379667818546295,
"step": 581
},
{
"epoch": 0.004796558345764276,
"grad_norm": 2.328125,
"grad_norm_var": 20.444233957926432,
"learning_rate": 5e-05,
"loss": 0.1769,
"loss/crossentropy": 2.5741026401519775,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1396484375,
"loss/idx": 0.0,
"loss/logits": 0.03728485107421875,
"step": 582
},
{
"epoch": 0.004804799854949439,
"grad_norm": 2.875,
"grad_norm_var": 20.5315549214681,
"learning_rate": 5e-05,
"loss": 0.1681,
"loss/crossentropy": 2.4309182167053223,
"loss/dist_ce": 0.0,
"loss/hidden": 0.134765625,
"loss/idx": 0.0,
"loss/logits": 0.033366359770298004,
"step": 583
},
{
"epoch": 0.004813041364134601,
"grad_norm": 0.94921875,
"grad_norm_var": 21.090232785542806,
"learning_rate": 5e-05,
"loss": 0.1001,
"loss/crossentropy": 0.35457542538642883,
"loss/dist_ce": 0.0,
"loss/hidden": 0.09228515625,
"loss/idx": 0.0,
"loss/logits": 0.007841967046260834,
"step": 584
},
{
"epoch": 0.0048212828733197625,
"grad_norm": 6.71875,
"grad_norm_var": 21.60826562245687,
"learning_rate": 5e-05,
"loss": 0.3594,
"loss/crossentropy": 2.4798266887664795,
"loss/dist_ce": 0.0,
"loss/hidden": 0.28515625,
"loss/idx": 0.0,
"loss/logits": 0.07420751452445984,
"step": 585
},
{
"epoch": 0.004829524382504924,
"grad_norm": 4.15625,
"grad_norm_var": 21.133738644917806,
"learning_rate": 5e-05,
"loss": 0.2067,
"loss/crossentropy": 2.589935064315796,
"loss/dist_ce": 0.0,
"loss/hidden": 0.166015625,
"loss/idx": 0.0,
"loss/logits": 0.04071066156029701,
"step": 586
},
{
"epoch": 0.004837765891690086,
"grad_norm": 2.640625,
"grad_norm_var": 21.213679440816243,
"learning_rate": 5e-05,
"loss": 0.1794,
"loss/crossentropy": 2.3674118518829346,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1396484375,
"loss/idx": 0.0,
"loss/logits": 0.03970456123352051,
"step": 587
},
{
"epoch": 0.004846007400875248,
"grad_norm": 3.921875,
"grad_norm_var": 2.8025491714477537,
"learning_rate": 5e-05,
"loss": 0.1419,
"loss/crossentropy": 0.37833818793296814,
"loss/dist_ce": 0.0,
"loss/hidden": 0.126953125,
"loss/idx": 0.0,
"loss/logits": 0.01495468057692051,
"step": 588
},
{
"epoch": 0.00485424891006041,
"grad_norm": 5.21875,
"grad_norm_var": 2.3418965021769207,
"learning_rate": 5e-05,
"loss": 0.2142,
"loss/crossentropy": 2.5115513801574707,
"loss/dist_ce": 0.0,
"loss/hidden": 0.171875,
"loss/idx": 0.0,
"loss/logits": 0.04236599802970886,
"step": 589
},
{
"epoch": 0.004862490419245572,
"grad_norm": 1.421875,
"grad_norm_var": 2.3552057266235353,
"learning_rate": 5e-05,
"loss": 0.1423,
"loss/crossentropy": 1.5357518196105957,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1162109375,
"loss/idx": 0.0,
"loss/logits": 0.026050515472888947,
"step": 590
},
{
"epoch": 0.004870731928430734,
"grad_norm": 2.59375,
"grad_norm_var": 2.3474939346313475,
"learning_rate": 5e-05,
"loss": 0.1597,
"loss/crossentropy": 2.9540340900421143,
"loss/dist_ce": 0.0,
"loss/hidden": 0.125,
"loss/idx": 0.0,
"loss/logits": 0.03472492843866348,
"step": 591
},
{
"epoch": 0.004878973437615897,
"grad_norm": 1.40625,
"grad_norm_var": 2.2909016291300457,
"learning_rate": 5e-05,
"loss": 0.127,
"loss/crossentropy": 2.3022918701171875,
"loss/dist_ce": 0.0,
"loss/hidden": 0.10302734375,
"loss/idx": 0.0,
"loss/logits": 0.024019837379455566,
"step": 592
},
{
"epoch": 0.0048872149468010585,
"grad_norm": 1.6171875,
"grad_norm_var": 2.261008135477702,
"learning_rate": 5e-05,
"loss": 0.1432,
"loss/crossentropy": 0.7338389158248901,
"loss/dist_ce": 0.0,
"loss/hidden": 0.12451171875,
"loss/idx": 0.0,
"loss/logits": 0.018692631274461746,
"step": 593
},
{
"epoch": 0.00489545645598622,
"grad_norm": 2.046875,
"grad_norm_var": 2.2899148941040037,
"learning_rate": 5e-05,
"loss": 0.1657,
"loss/crossentropy": 2.504016160964966,
"loss/dist_ce": 0.0,
"loss/hidden": 0.12890625,
"loss/idx": 0.0,
"loss/logits": 0.036834657192230225,
"step": 594
},
{
"epoch": 0.004903697965171382,
"grad_norm": 3.953125,
"grad_norm_var": 2.344827715555827,
"learning_rate": 5e-05,
"loss": 0.1572,
"loss/crossentropy": 1.7626408338546753,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1279296875,
"loss/idx": 0.0,
"loss/logits": 0.029316924512386322,
"step": 595
},
{
"epoch": 0.004911939474356544,
"grad_norm": 1.953125,
"grad_norm_var": 2.3973347345987954,
"learning_rate": 5e-05,
"loss": 0.141,
"loss/crossentropy": 1.9684767723083496,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1142578125,
"loss/idx": 0.0,
"loss/logits": 0.026772357523441315,
"step": 596
},
{
"epoch": 0.004920180983541706,
"grad_norm": 10.625,
"grad_norm_var": 6.088076210021972,
"learning_rate": 5e-05,
"loss": 0.2187,
"loss/crossentropy": 1.9169155359268188,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1865234375,
"loss/idx": 0.0,
"loss/logits": 0.03213420510292053,
"step": 597
},
{
"epoch": 0.004928422492726868,
"grad_norm": 4.5625,
"grad_norm_var": 6.080293718973795,
"learning_rate": 5e-05,
"loss": 0.1759,
"loss/crossentropy": 1.367024540901184,
"loss/dist_ce": 0.0,
"loss/hidden": 0.142578125,
"loss/idx": 0.0,
"loss/logits": 0.03334027901291847,
"step": 598
},
{
"epoch": 0.00493666400191203,
"grad_norm": 2.4375,
"grad_norm_var": 6.131121762593588,
"learning_rate": 5e-05,
"loss": 0.2373,
"loss/crossentropy": 2.452223300933838,
"loss/dist_ce": 0.0,
"loss/hidden": 0.19921875,
"loss/idx": 0.0,
"loss/logits": 0.038047999143600464,
"step": 599
},
{
"epoch": 0.004944905511097192,
"grad_norm": 3.859375,
"grad_norm_var": 5.665278879801432,
"learning_rate": 5e-05,
"loss": 0.2239,
"loss/crossentropy": 1.2548903226852417,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1982421875,
"loss/idx": 0.0,
"loss/logits": 0.025662653148174286,
"step": 600
},
{
"epoch": 0.004953147020282354,
"grad_norm": 3.265625,
"grad_norm_var": 5.018717193603516,
"learning_rate": 5e-05,
"loss": 0.2458,
"loss/crossentropy": 2.334947347640991,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1875,
"loss/idx": 0.0,
"loss/logits": 0.058346427977085114,
"step": 601
},
{
"epoch": 0.0049613885294675164,
"grad_norm": 1.140625,
"grad_norm_var": 5.315175120035807,
"learning_rate": 5e-05,
"loss": 0.0935,
"loss/crossentropy": 0.5309077501296997,
"loss/dist_ce": 0.0,
"loss/hidden": 0.0849609375,
"loss/idx": 0.0,
"loss/logits": 0.008535758592188358,
"step": 602
},
{
"epoch": 0.004969630038652678,
"grad_norm": 2.859375,
"grad_norm_var": 5.299181874593099,
"learning_rate": 5e-05,
"loss": 0.2036,
"loss/crossentropy": 2.4964077472686768,
"loss/dist_ce": 0.0,
"loss/hidden": 0.158203125,
"loss/idx": 0.0,
"loss/logits": 0.045372504740953445,
"step": 603
},
{
"epoch": 0.00497787154783784,
"grad_norm": 1.515625,
"grad_norm_var": 5.463201649983724,
"learning_rate": 5e-05,
"loss": 0.1444,
"loss/crossentropy": 1.4457087516784668,
"loss/dist_ce": 0.0,
"loss/hidden": 0.119140625,
"loss/idx": 0.0,
"loss/logits": 0.025238193571567535,
"step": 604
},
{
"epoch": 0.004986113057023002,
"grad_norm": 2.34375,
"grad_norm_var": 5.188616689046224,
"learning_rate": 5e-05,
"loss": 0.164,
"loss/crossentropy": 1.5325767993927002,
"loss/dist_ce": 0.0,
"loss/hidden": 0.130859375,
"loss/idx": 0.0,
"loss/logits": 0.033134825527668,
"step": 605
},
{
"epoch": 0.004994354566208164,
"grad_norm": 2.875,
"grad_norm_var": 5.019653065999349,
"learning_rate": 5e-05,
"loss": 0.2116,
"loss/crossentropy": 1.964009404182434,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1494140625,
"loss/idx": 0.0,
"loss/logits": 0.06218406930565834,
"step": 606
},
{
"epoch": 0.005002596075393326,
"grad_norm": 1.796875,
"grad_norm_var": 5.109509023030599,
"learning_rate": 5e-05,
"loss": 0.1477,
"loss/crossentropy": 1.4914216995239258,
"loss/dist_ce": 0.0,
"loss/hidden": 0.12353515625,
"loss/idx": 0.0,
"loss/logits": 0.024151228368282318,
"step": 607
},
{
"epoch": 0.005010837584578488,
"grad_norm": 2.6875,
"grad_norm_var": 4.937090810139974,
"learning_rate": 5e-05,
"loss": 0.167,
"loss/crossentropy": 1.5185096263885498,
"loss/dist_ce": 0.0,
"loss/hidden": 0.140625,
"loss/idx": 0.0,
"loss/logits": 0.02637227438390255,
"step": 608
},
{
"epoch": 0.00501907909376365,
"grad_norm": 3.21875,
"grad_norm_var": 4.781574503580729,
"learning_rate": 5e-05,
"loss": 0.2422,
"loss/crossentropy": 2.671537160873413,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1904296875,
"loss/idx": 0.0,
"loss/logits": 0.05179464817047119,
"step": 609
},
{
"epoch": 0.005027320602948812,
"grad_norm": 2.640625,
"grad_norm_var": 4.712612915039062,
"learning_rate": 5e-05,
"loss": 0.1652,
"loss/crossentropy": 1.1440718173980713,
"loss/dist_ce": 0.0,
"loss/hidden": 0.14453125,
"loss/idx": 0.0,
"loss/logits": 0.020696066319942474,
"step": 610
},
{
"epoch": 0.005035562112133974,
"grad_norm": 2.265625,
"grad_norm_var": 4.7286529541015625,
"learning_rate": 5e-05,
"loss": 0.1605,
"loss/crossentropy": 0.47643014788627625,
"loss/dist_ce": 0.0,
"loss/hidden": 0.14453125,
"loss/idx": 0.0,
"loss/logits": 0.016013499349355698,
"step": 611
},
{
"epoch": 0.005043803621319136,
"grad_norm": 1.7421875,
"grad_norm_var": 4.764475250244141,
"learning_rate": 5e-05,
"loss": 0.1235,
"loss/crossentropy": 1.5909593105316162,
"loss/dist_ce": 0.0,
"loss/hidden": 0.10546875,
"loss/idx": 0.0,
"loss/logits": 0.018061984330415726,
"step": 612
},
{
"epoch": 0.005052045130504298,
"grad_norm": 1.4765625,
"grad_norm_var": 0.8343994140625,
"learning_rate": 5e-05,
"loss": 0.1189,
"loss/crossentropy": 2.336538076400757,
"loss/dist_ce": 0.0,
"loss/hidden": 0.0966796875,
"loss/idx": 0.0,
"loss/logits": 0.022249765694141388,
"step": 613
},
{
"epoch": 0.00506028663968946,
"grad_norm": 4.9375,
"grad_norm_var": 0.9441650390625,
"learning_rate": 5e-05,
"loss": 0.2263,
"loss/crossentropy": 1.665325403213501,
"loss/dist_ce": 0.0,
"loss/hidden": 0.189453125,
"loss/idx": 0.0,
"loss/logits": 0.036880407482385635,
"step": 614
},
{
"epoch": 0.005068528148874622,
"grad_norm": 3.765625,
"grad_norm_var": 1.0315826416015625,
"learning_rate": 5e-05,
"loss": 0.2465,
"loss/crossentropy": 1.6349374055862427,
"loss/dist_ce": 0.0,
"loss/hidden": 0.185546875,
"loss/idx": 0.0,
"loss/logits": 0.06092265248298645,
"step": 615
},
{
"epoch": 0.005076769658059784,
"grad_norm": 3.8125,
"grad_norm_var": 1.02415771484375,
"learning_rate": 5e-05,
"loss": 0.2903,
"loss/crossentropy": 2.2787890434265137,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2177734375,
"loss/idx": 0.0,
"loss/logits": 0.07248455286026001,
"step": 616
},
{
"epoch": 0.005085011167244946,
"grad_norm": 1.421875,
"grad_norm_var": 1.0844156901041666,
"learning_rate": 5e-05,
"loss": 0.1222,
"loss/crossentropy": 0.9886749982833862,
"loss/dist_ce": 0.0,
"loss/hidden": 0.10595703125,
"loss/idx": 0.0,
"loss/logits": 0.016225244849920273,
"step": 617
},
{
"epoch": 0.005093252676430108,
"grad_norm": 2.703125,
"grad_norm_var": 0.9472900390625,
"learning_rate": 5e-05,
"loss": 0.217,
"loss/crossentropy": 2.012622594833374,
"loss/dist_ce": 0.0,
"loss/hidden": 0.171875,
"loss/idx": 0.0,
"loss/logits": 0.0451560840010643,
"step": 618
},
{
"epoch": 0.0051014941856152695,
"grad_norm": 1.9375,
"grad_norm_var": 0.9720774332682292,
"learning_rate": 5e-05,
"loss": 0.1792,
"loss/crossentropy": 2.2141380310058594,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1435546875,
"loss/idx": 0.0,
"loss/logits": 0.03566785901784897,
"step": 619
},
{
"epoch": 0.005109735694800432,
"grad_norm": 1.9765625,
"grad_norm_var": 0.9204770406087239,
"learning_rate": 5e-05,
"loss": 0.1819,
"loss/crossentropy": 2.5171334743499756,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1474609375,
"loss/idx": 0.0,
"loss/logits": 0.03444764018058777,
"step": 620
},
{
"epoch": 0.005117977203985594,
"grad_norm": 2.171875,
"grad_norm_var": 0.9281979878743489,
"learning_rate": 5e-05,
"loss": 0.1252,
"loss/crossentropy": 1.3747243881225586,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1025390625,
"loss/idx": 0.0,
"loss/logits": 0.022622188553214073,
"step": 621
},
{
"epoch": 0.005126218713170756,
"grad_norm": 3.5625,
"grad_norm_var": 0.9839230855305989,
"learning_rate": 5e-05,
"loss": 0.1941,
"loss/crossentropy": 1.6743167638778687,
"loss/dist_ce": 0.0,
"loss/hidden": 0.15625,
"loss/idx": 0.0,
"loss/logits": 0.03780867159366608,
"step": 622
},
{
"epoch": 0.005134460222355918,
"grad_norm": 1.5703125,
"grad_norm_var": 1.0123687744140626,
"learning_rate": 5e-05,
"loss": 0.1265,
"loss/crossentropy": 2.2040176391601562,
"loss/dist_ce": 0.0,
"loss/hidden": 0.09814453125,
"loss/idx": 0.0,
"loss/logits": 0.028337322175502777,
"step": 623
},
{
"epoch": 0.00514270173154108,
"grad_norm": 2.5625,
"grad_norm_var": 1.0121897379557292,
"learning_rate": 5e-05,
"loss": 0.1402,
"loss/crossentropy": 0.7447776794433594,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1240234375,
"loss/idx": 0.0,
"loss/logits": 0.016176920384168625,
"step": 624
},
{
"epoch": 0.005150943240726242,
"grad_norm": 1.6953125,
"grad_norm_var": 1.0336626688639323,
"learning_rate": 5e-05,
"loss": 0.1657,
"loss/crossentropy": 1.4739638566970825,
"loss/dist_ce": 0.0,
"loss/hidden": 0.134765625,
"loss/idx": 0.0,
"loss/logits": 0.03092530183494091,
"step": 625
},
{
"epoch": 0.005159184749911404,
"grad_norm": 2.484375,
"grad_norm_var": 1.032574208577474,
"learning_rate": 5e-05,
"loss": 0.1703,
"loss/crossentropy": 2.6135871410369873,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1318359375,
"loss/idx": 0.0,
"loss/logits": 0.03844151645898819,
"step": 626
},
{
"epoch": 0.0051674262590965655,
"grad_norm": 2.75,
"grad_norm_var": 1.0317543029785157,
"learning_rate": 5e-05,
"loss": 0.1908,
"loss/crossentropy": 2.5848896503448486,
"loss/dist_ce": 0.0,
"loss/hidden": 0.15234375,
"loss/idx": 0.0,
"loss/logits": 0.03841109946370125,
"step": 627
},
{
"epoch": 0.005175667768281727,
"grad_norm": 1.9375,
"grad_norm_var": 1.0134755452473958,
"learning_rate": 5e-05,
"loss": 0.1499,
"loss/crossentropy": 0.32518985867500305,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1396484375,
"loss/idx": 0.0,
"loss/logits": 0.010256130248308182,
"step": 628
},
{
"epoch": 0.005183909277466889,
"grad_norm": 3.296875,
"grad_norm_var": 0.9605608622233073,
"learning_rate": 5e-05,
"loss": 0.1996,
"loss/crossentropy": 2.3065407276153564,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1640625,
"loss/idx": 0.0,
"loss/logits": 0.035511888563632965,
"step": 629
},
{
"epoch": 0.005192150786652052,
"grad_norm": 1.828125,
"grad_norm_var": 0.6212827046712239,
"learning_rate": 5e-05,
"loss": 0.1813,
"loss/crossentropy": 2.5903186798095703,
"loss/dist_ce": 0.0,
"loss/hidden": 0.140625,
"loss/idx": 0.0,
"loss/logits": 0.040679510682821274,
"step": 630
},
{
"epoch": 0.005200392295837214,
"grad_norm": 2.828125,
"grad_norm_var": 0.5139218648274739,
"learning_rate": 5e-05,
"loss": 0.1537,
"loss/crossentropy": 1.5172139406204224,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1240234375,
"loss/idx": 0.0,
"loss/logits": 0.029673364013433456,
"step": 631
},
{
"epoch": 0.005208633805022376,
"grad_norm": 2.765625,
"grad_norm_var": 0.3864702860514323,
"learning_rate": 5e-05,
"loss": 0.1728,
"loss/crossentropy": 1.9679391384124756,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1396484375,
"loss/idx": 0.0,
"loss/logits": 0.03311806917190552,
"step": 632
},
{
"epoch": 0.005216875314207538,
"grad_norm": 1.453125,
"grad_norm_var": 0.38269220987955727,
"learning_rate": 5e-05,
"loss": 0.1564,
"loss/crossentropy": 1.8976471424102783,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1259765625,
"loss/idx": 0.0,
"loss/logits": 0.030387457460165024,
"step": 633
},
{
"epoch": 0.0052251168233927,
"grad_norm": 1.4375,
"grad_norm_var": 0.4224077860514323,
"learning_rate": 5e-05,
"loss": 0.1537,
"loss/crossentropy": 2.396242618560791,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1220703125,
"loss/idx": 0.0,
"loss/logits": 0.03165833652019501,
"step": 634
},
{
"epoch": 0.0052333583325778615,
"grad_norm": 3.296875,
"grad_norm_var": 0.4783404032389323,
"learning_rate": 5e-05,
"loss": 0.2148,
"loss/crossentropy": 2.8331291675567627,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1669921875,
"loss/idx": 0.0,
"loss/logits": 0.04783923923969269,
"step": 635
},
{
"epoch": 0.005241599841763023,
"grad_norm": 2.640625,
"grad_norm_var": 0.47274169921875,
"learning_rate": 5e-05,
"loss": 0.1847,
"loss/crossentropy": 0.8493193984031677,
"loss/dist_ce": 0.0,
"loss/hidden": 0.15625,
"loss/idx": 0.0,
"loss/logits": 0.028483962640166283,
"step": 636
},
{
"epoch": 0.005249841350948185,
"grad_norm": 2.203125,
"grad_norm_var": 0.4718831380208333,
"learning_rate": 5e-05,
"loss": 0.157,
"loss/crossentropy": 2.3299150466918945,
"loss/dist_ce": 0.0,
"loss/hidden": 0.125,
"loss/idx": 0.0,
"loss/logits": 0.032000426203012466,
"step": 637
},
{
"epoch": 0.005258082860133347,
"grad_norm": 2.796875,
"grad_norm_var": 0.3892893473307292,
"learning_rate": 5e-05,
"loss": 0.181,
"loss/crossentropy": 1.8806333541870117,
"loss/dist_ce": 0.0,
"loss/hidden": 0.14453125,
"loss/idx": 0.0,
"loss/logits": 0.03642831742763519,
"step": 638
},
{
"epoch": 0.00526632436931851,
"grad_norm": 3.828125,
"grad_norm_var": 0.4741778055826823,
"learning_rate": 5e-05,
"loss": 0.2628,
"loss/crossentropy": 2.58074951171875,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2021484375,
"loss/idx": 0.0,
"loss/logits": 0.06061544269323349,
"step": 639
},
{
"epoch": 0.005274565878503672,
"grad_norm": 3.0625,
"grad_norm_var": 0.49478327433268227,
"learning_rate": 5e-05,
"loss": 0.2085,
"loss/crossentropy": 2.6464812755584717,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1630859375,
"loss/idx": 0.0,
"loss/logits": 0.04545789211988449,
"step": 640
},
{
"epoch": 0.005282807387688834,
"grad_norm": 32.0,
"grad_norm_var": 54.56477762858073,
"learning_rate": 5e-05,
"loss": 0.8256,
"loss/crossentropy": 3.0567638874053955,
"loss/dist_ce": 0.0,
"loss/hidden": 0.546875,
"loss/idx": 0.0,
"loss/logits": 0.27873265743255615,
"step": 641
},
{
"epoch": 0.005291048896873996,
"grad_norm": 3.09375,
"grad_norm_var": 54.431278483072916,
"learning_rate": 5e-05,
"loss": 0.2481,
"loss/crossentropy": 1.814937949180603,
"loss/dist_ce": 0.0,
"loss/hidden": 0.197265625,
"loss/idx": 0.0,
"loss/logits": 0.050825513899326324,
"step": 642
},
{
"epoch": 0.0052992904060591576,
"grad_norm": 3.5,
"grad_norm_var": 54.29631754557292,
"learning_rate": 5e-05,
"loss": 0.1537,
"loss/crossentropy": 1.2851777076721191,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1337890625,
"loss/idx": 0.0,
"loss/logits": 0.019936833530664444,
"step": 643
},
{
"epoch": 0.0053075319152443194,
"grad_norm": 7.25,
"grad_norm_var": 54.24651285807292,
"learning_rate": 5e-05,
"loss": 0.2315,
"loss/crossentropy": 1.3410425186157227,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2001953125,
"loss/idx": 0.0,
"loss/logits": 0.03128086030483246,
"step": 644
},
{
"epoch": 0.005315773424429481,
"grad_norm": 2.34375,
"grad_norm_var": 54.49813537597656,
"learning_rate": 5e-05,
"loss": 0.1844,
"loss/crossentropy": 2.3179078102111816,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1435546875,
"loss/idx": 0.0,
"loss/logits": 0.04083427041769028,
"step": 645
},
{
"epoch": 0.005324014933614643,
"grad_norm": 3.90625,
"grad_norm_var": 53.952762858072916,
"learning_rate": 5e-05,
"loss": 0.2196,
"loss/crossentropy": 2.614328384399414,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1689453125,
"loss/idx": 0.0,
"loss/logits": 0.05064046010375023,
"step": 646
},
{
"epoch": 0.005332256442799805,
"grad_norm": 3088.0,
"grad_norm_var": 594094.3569895426,
"learning_rate": 5e-05,
"loss": 74.0435,
"loss/crossentropy": 5.280629634857178,
"loss/dist_ce": 0.0,
"loss/hidden": 70.5,
"loss/idx": 0.0,
"loss/logits": 3.543522834777832,
"step": 647
},
{
"epoch": 0.005340497951984968,
"grad_norm": 6.65625,
"grad_norm_var": 593994.1685831706,
"learning_rate": 5e-05,
"loss": 0.3181,
"loss/crossentropy": 2.1775877475738525,
"loss/dist_ce": 0.0,
"loss/hidden": 0.26171875,
"loss/idx": 0.0,
"loss/logits": 0.05634221434593201,
"step": 648
},
{
"epoch": 0.00534873946117013,
"grad_norm": 3.375,
"grad_norm_var": 593944.0428049724,
"learning_rate": 5e-05,
"loss": 0.2034,
"loss/crossentropy": 1.4645721912384033,
"loss/dist_ce": 0.0,
"loss/hidden": 0.169921875,
"loss/idx": 0.0,
"loss/logits": 0.03352264687418938,
"step": 649
},
{
"epoch": 0.005356980970355292,
"grad_norm": 1.15625,
"grad_norm_var": 593951.4221018474,
"learning_rate": 5e-05,
"loss": 0.1173,
"loss/crossentropy": 1.4656540155410767,
"loss/dist_ce": 0.0,
"loss/hidden": 0.0986328125,
"loss/idx": 0.0,
"loss/logits": 0.018643483519554138,
"step": 650
},
{
"epoch": 0.005365222479540454,
"grad_norm": 1.1953125,
"grad_norm_var": 594006.2750038147,
"learning_rate": 5e-05,
"loss": 0.1105,
"loss/crossentropy": 0.5469728708267212,
"loss/dist_ce": 0.0,
"loss/hidden": 0.095703125,
"loss/idx": 0.0,
"loss/logits": 0.014785300940275192,
"step": 651
},
{
"epoch": 0.0053734639887256155,
"grad_norm": 1.796875,
"grad_norm_var": 594028.2904518128,
"learning_rate": 5e-05,
"loss": 0.1277,
"loss/crossentropy": 1.5997320413589478,
"loss/dist_ce": 0.0,
"loss/hidden": 0.103515625,
"loss/idx": 0.0,
"loss/logits": 0.024179790169000626,
"step": 652
},
{
"epoch": 0.005381705497910777,
"grad_norm": 2.546875,
"grad_norm_var": 594019.3290728251,
"learning_rate": 5e-05,
"loss": 0.2144,
"loss/crossentropy": 1.897312045097351,
"loss/dist_ce": 0.0,
"loss/hidden": 0.16796875,
"loss/idx": 0.0,
"loss/logits": 0.04644346237182617,
"step": 653
},
{
"epoch": 0.005389947007095939,
"grad_norm": 7.25,
"grad_norm_var": 593904.7219866435,
"learning_rate": 5e-05,
"loss": 0.1896,
"loss/crossentropy": 0.3107888996601105,
"loss/dist_ce": 0.0,
"loss/hidden": 0.16796875,
"loss/idx": 0.0,
"loss/logits": 0.021600116044282913,
"step": 654
},
{
"epoch": 0.005398188516281101,
"grad_norm": 3.40625,
"grad_norm_var": 593915.6656878154,
"learning_rate": 5e-05,
"loss": 0.2583,
"loss/crossentropy": 2.793339967727661,
"loss/dist_ce": 0.0,
"loss/hidden": 0.201171875,
"loss/idx": 0.0,
"loss/logits": 0.057117462158203125,
"step": 655
},
{
"epoch": 0.005406430025466263,
"grad_norm": 2.328125,
"grad_norm_var": 593934.8025632222,
"learning_rate": 5e-05,
"loss": 0.18,
"loss/crossentropy": 1.4796594381332397,
"loss/dist_ce": 0.0,
"loss/hidden": 0.15234375,
"loss/idx": 0.0,
"loss/logits": 0.027638476341962814,
"step": 656
},
{
"epoch": 0.005414671534651425,
"grad_norm": 1.6875,
"grad_norm_var": 594663.6030799865,
"learning_rate": 5e-05,
"loss": 0.1305,
"loss/crossentropy": 1.4301519393920898,
"loss/dist_ce": 0.0,
"loss/hidden": 0.109375,
"loss/idx": 0.0,
"loss/logits": 0.021160051226615906,
"step": 657
},
{
"epoch": 0.005422913043836588,
"grad_norm": 1.8203125,
"grad_norm_var": 594696.4953653972,
"learning_rate": 5e-05,
"loss": 0.1742,
"loss/crossentropy": 1.6321804523468018,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1376953125,
"loss/idx": 0.0,
"loss/logits": 0.03655288740992546,
"step": 658
},
{
"epoch": 0.00543115455302175,
"grad_norm": 3.46875,
"grad_norm_var": 594697.2980875651,
"learning_rate": 5e-05,
"loss": 0.263,
"loss/crossentropy": 2.723104476928711,
"loss/dist_ce": 0.0,
"loss/hidden": 0.19921875,
"loss/idx": 0.0,
"loss/logits": 0.0637696385383606,
"step": 659
},
{
"epoch": 0.0054393960622069115,
"grad_norm": 3.140625,
"grad_norm_var": 594801.8477040608,
"learning_rate": 5e-05,
"loss": 0.1395,
"loss/crossentropy": 1.4695775508880615,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1181640625,
"loss/idx": 0.0,
"loss/logits": 0.021351780742406845,
"step": 660
},
{
"epoch": 0.005447637571392073,
"grad_norm": 1.9375,
"grad_norm_var": 594812.3412261963,
"learning_rate": 5e-05,
"loss": 0.1788,
"loss/crossentropy": 2.657424211502075,
"loss/dist_ce": 0.0,
"loss/hidden": 0.138671875,
"loss/idx": 0.0,
"loss/logits": 0.040120575577020645,
"step": 661
},
{
"epoch": 0.005455879080577235,
"grad_norm": 5.0,
"grad_norm_var": 594784.4235422771,
"learning_rate": 5e-05,
"loss": 0.1576,
"loss/crossentropy": 0.4258406162261963,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1396484375,
"loss/idx": 0.0,
"loss/logits": 0.01790551282465458,
"step": 662
},
{
"epoch": 0.005464120589762397,
"grad_norm": 3.109375,
"grad_norm_var": 3.251456705729167,
"learning_rate": 5e-05,
"loss": 0.2469,
"loss/crossentropy": 1.6977794170379639,
"loss/dist_ce": 0.0,
"loss/hidden": 0.208984375,
"loss/idx": 0.0,
"loss/logits": 0.03788114711642265,
"step": 663
},
{
"epoch": 0.005472362098947559,
"grad_norm": 1.8828125,
"grad_norm_var": 2.4230974833170573,
"learning_rate": 5e-05,
"loss": 0.1297,
"loss/crossentropy": 1.3648561239242554,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1083984375,
"loss/idx": 0.0,
"loss/logits": 0.021297637373209,
"step": 664
},
{
"epoch": 0.005480603608132721,
"grad_norm": 1.828125,
"grad_norm_var": 2.4579424540201824,
"learning_rate": 5e-05,
"loss": 0.1722,
"loss/crossentropy": 2.6582846641540527,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1337890625,
"loss/idx": 0.0,
"loss/logits": 0.03840293735265732,
"step": 665
},
{
"epoch": 0.005488845117317883,
"grad_norm": 1.703125,
"grad_norm_var": 2.3624529520670574,
"learning_rate": 5e-05,
"loss": 0.1233,
"loss/crossentropy": 1.4073052406311035,
"loss/dist_ce": 0.0,
"loss/hidden": 0.103515625,
"loss/idx": 0.0,
"loss/logits": 0.019805913791060448,
"step": 666
},
{
"epoch": 0.005497086626503046,
"grad_norm": 3.015625,
"grad_norm_var": 2.1906728108723956,
"learning_rate": 5e-05,
"loss": 0.1421,
"loss/crossentropy": 1.2232915163040161,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1181640625,
"loss/idx": 0.0,
"loss/logits": 0.02389139123260975,
"step": 667
},
{
"epoch": 0.0055053281356882075,
"grad_norm": 2.078125,
"grad_norm_var": 2.155370076497396,
"learning_rate": 5e-05,
"loss": 0.1769,
"loss/crossentropy": 2.633976697921753,
"loss/dist_ce": 0.0,
"loss/hidden": 0.138671875,
"loss/idx": 0.0,
"loss/logits": 0.03825639933347702,
"step": 668
},
{
"epoch": 0.005513569644873369,
"grad_norm": 1.3984375,
"grad_norm_var": 2.289989980061849,
"learning_rate": 5e-05,
"loss": 0.1432,
"loss/crossentropy": 2.7594661712646484,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1123046875,
"loss/idx": 0.0,
"loss/logits": 0.030905555933713913,
"step": 669
},
{
"epoch": 0.005521811154058531,
"grad_norm": 2.875,
"grad_norm_var": 0.899731190999349,
"learning_rate": 5e-05,
"loss": 0.2301,
"loss/crossentropy": 2.9316701889038086,
"loss/dist_ce": 0.0,
"loss/hidden": 0.171875,
"loss/idx": 0.0,
"loss/logits": 0.058252155780792236,
"step": 670
},
{
"epoch": 0.005530052663243693,
"grad_norm": 2.109375,
"grad_norm_var": 0.8554888407389323,
"learning_rate": 5e-05,
"loss": 0.1341,
"loss/crossentropy": 1.2458611726760864,
"loss/dist_ce": 0.0,
"loss/hidden": 0.111328125,
"loss/idx": 0.0,
"loss/logits": 0.022815225645899773,
"step": 671
},
{
"epoch": 0.005538294172428855,
"grad_norm": 1.859375,
"grad_norm_var": 0.8775530497233073,
"learning_rate": 5e-05,
"loss": 0.1773,
"loss/crossentropy": 1.787272572517395,
"loss/dist_ce": 0.0,
"loss/hidden": 0.138671875,
"loss/idx": 0.0,
"loss/logits": 0.03861699998378754,
"step": 672
},
{
"epoch": 0.005546535681614017,
"grad_norm": 5.40625,
"grad_norm_var": 1.3726600646972655,
"learning_rate": 5e-05,
"loss": 0.2194,
"loss/crossentropy": 1.330421805381775,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1943359375,
"loss/idx": 0.0,
"loss/logits": 0.025075137615203857,
"step": 673
},
{
"epoch": 0.005554777190799179,
"grad_norm": 0.9609375,
"grad_norm_var": 1.5155535380045573,
"learning_rate": 5e-05,
"loss": 0.1263,
"loss/crossentropy": 2.6272873878479004,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1015625,
"loss/idx": 0.0,
"loss/logits": 0.024767953902482986,
"step": 674
},
{
"epoch": 0.005563018699984341,
"grad_norm": 5.375,
"grad_norm_var": 1.9607175191243489,
"learning_rate": 5e-05,
"loss": 0.3144,
"loss/crossentropy": 3.5015265941619873,
"loss/dist_ce": 0.0,
"loss/hidden": 0.23828125,
"loss/idx": 0.0,
"loss/logits": 0.07607264816761017,
"step": 675
},
{
"epoch": 0.0055712602091695035,
"grad_norm": 2.546875,
"grad_norm_var": 1.9502418518066407,
"learning_rate": 5e-05,
"loss": 0.2386,
"loss/crossentropy": 2.711198091506958,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1875,
"loss/idx": 0.0,
"loss/logits": 0.05106600373983383,
"step": 676
},
{
"epoch": 0.005579501718354665,
"grad_norm": 2.765625,
"grad_norm_var": 1.909698232014974,
"learning_rate": 5e-05,
"loss": 0.203,
"loss/crossentropy": 1.9062882661819458,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1640625,
"loss/idx": 0.0,
"loss/logits": 0.03898348659276962,
"step": 677
},
{
"epoch": 0.005587743227539827,
"grad_norm": 1.796875,
"grad_norm_var": 1.5877174377441405,
"learning_rate": 5e-05,
"loss": 0.1617,
"loss/crossentropy": 2.5185117721557617,
"loss/dist_ce": 0.0,
"loss/hidden": 0.126953125,
"loss/idx": 0.0,
"loss/logits": 0.03473159298300743,
"step": 678
},
{
"epoch": 0.005595984736724989,
"grad_norm": 3.71875,
"grad_norm_var": 1.6568275451660157,
"learning_rate": 5e-05,
"loss": 0.1797,
"loss/crossentropy": 1.0520906448364258,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1640625,
"loss/idx": 0.0,
"loss/logits": 0.015656642615795135,
"step": 679
},
{
"epoch": 0.005604226245910151,
"grad_norm": 3.34375,
"grad_norm_var": 1.6539265950520834,
"learning_rate": 5e-05,
"loss": 0.1532,
"loss/crossentropy": 1.6271475553512573,
"loss/dist_ce": 0.0,
"loss/hidden": 0.130859375,
"loss/idx": 0.0,
"loss/logits": 0.022353362292051315,
"step": 680
},
{
"epoch": 0.005612467755095313,
"grad_norm": 1.6796875,
"grad_norm_var": 1.6720415751139324,
"learning_rate": 5e-05,
"loss": 0.1228,
"loss/crossentropy": 0.8219252228736877,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1083984375,
"loss/idx": 0.0,
"loss/logits": 0.014401828870177269,
"step": 681
},
{
"epoch": 0.005620709264280475,
"grad_norm": 2.46875,
"grad_norm_var": 1.6105323791503907,
"learning_rate": 5e-05,
"loss": 0.1697,
"loss/crossentropy": 1.7836802005767822,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1328125,
"loss/idx": 0.0,
"loss/logits": 0.03692241013050079,
"step": 682
},
{
"epoch": 0.005628950773465637,
"grad_norm": 5.0,
"grad_norm_var": 1.9368690490722655,
"learning_rate": 5e-05,
"loss": 0.2266,
"loss/crossentropy": 2.947277784347534,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1796875,
"loss/idx": 0.0,
"loss/logits": 0.04695526510477066,
"step": 683
},
{
"epoch": 0.005637192282650799,
"grad_norm": 3.171875,
"grad_norm_var": 1.9010515848795573,
"learning_rate": 5e-05,
"loss": 0.1932,
"loss/crossentropy": 2.684128999710083,
"loss/dist_ce": 0.0,
"loss/hidden": 0.15234375,
"loss/idx": 0.0,
"loss/logits": 0.04086027294397354,
"step": 684
},
{
"epoch": 0.005645433791835961,
"grad_norm": 3.90625,
"grad_norm_var": 1.7904368082682292,
"learning_rate": 5e-05,
"loss": 0.3706,
"loss/crossentropy": 2.2005116939544678,
"loss/dist_ce": 0.0,
"loss/hidden": 0.30078125,
"loss/idx": 0.0,
"loss/logits": 0.06984560191631317,
"step": 685
},
{
"epoch": 0.005653675301021123,
"grad_norm": 4.5625,
"grad_norm_var": 1.9264475504557292,
"learning_rate": 5e-05,
"loss": 0.2539,
"loss/crossentropy": 2.7152457237243652,
"loss/dist_ce": 0.0,
"loss/hidden": 0.19921875,
"loss/idx": 0.0,
"loss/logits": 0.05464334040880203,
"step": 686
},
{
"epoch": 0.005661916810206285,
"grad_norm": 3.6875,
"grad_norm_var": 1.8595621744791666,
"learning_rate": 5e-05,
"loss": 0.2398,
"loss/crossentropy": 3.029996633529663,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1796875,
"loss/idx": 0.0,
"loss/logits": 0.060077205300331116,
"step": 687
},
{
"epoch": 0.005670158319391447,
"grad_norm": 2.3125,
"grad_norm_var": 1.787433878580729,
"learning_rate": 5e-05,
"loss": 0.1699,
"loss/crossentropy": 2.006049156188965,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1376953125,
"loss/idx": 0.0,
"loss/logits": 0.032181136310100555,
"step": 688
},
{
"epoch": 0.005678399828576609,
"grad_norm": 3.296875,
"grad_norm_var": 1.4714396158854166,
"learning_rate": 5e-05,
"loss": 0.176,
"loss/crossentropy": 2.032977819442749,
"loss/dist_ce": 0.0,
"loss/hidden": 0.138671875,
"loss/idx": 0.0,
"loss/logits": 0.037300050258636475,
"step": 689
},
{
"epoch": 0.005686641337761771,
"grad_norm": 2.0,
"grad_norm_var": 1.2339637756347657,
"learning_rate": 5e-05,
"loss": 0.1712,
"loss/crossentropy": 1.572161316871643,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1474609375,
"loss/idx": 0.0,
"loss/logits": 0.023761317133903503,
"step": 690
},
{
"epoch": 0.005694882846946933,
"grad_norm": 1.578125,
"grad_norm_var": 1.0475807189941406,
"learning_rate": 5e-05,
"loss": 0.1608,
"loss/crossentropy": 0.5517882704734802,
"loss/dist_ce": 0.0,
"loss/hidden": 0.138671875,
"loss/idx": 0.0,
"loss/logits": 0.02215992659330368,
"step": 691
},
{
"epoch": 0.005703124356132095,
"grad_norm": 2.1875,
"grad_norm_var": 1.076873524983724,
"learning_rate": 5e-05,
"loss": 0.1669,
"loss/crossentropy": 2.8021109104156494,
"loss/dist_ce": 0.0,
"loss/hidden": 0.130859375,
"loss/idx": 0.0,
"loss/logits": 0.036080196499824524,
"step": 692
},
{
"epoch": 0.005711365865317257,
"grad_norm": 1.140625,
"grad_norm_var": 1.2856056213378906,
"learning_rate": 5e-05,
"loss": 0.1113,
"loss/crossentropy": 0.46256670355796814,
"loss/dist_ce": 0.0,
"loss/hidden": 0.09716796875,
"loss/idx": 0.0,
"loss/logits": 0.014169261790812016,
"step": 693
},
{
"epoch": 0.0057196073745024185,
"grad_norm": 1.8046875,
"grad_norm_var": 1.284496053059896,
"learning_rate": 5e-05,
"loss": 0.145,
"loss/crossentropy": 1.454952359199524,
"loss/dist_ce": 0.0,
"loss/hidden": 0.12109375,
"loss/idx": 0.0,
"loss/logits": 0.023933004587888718,
"step": 694
},
{
"epoch": 0.005727848883687581,
"grad_norm": 1.34375,
"grad_norm_var": 1.3670644124348958,
"learning_rate": 5e-05,
"loss": 0.1255,
"loss/crossentropy": 1.4848099946975708,
"loss/dist_ce": 0.0,
"loss/hidden": 0.103515625,
"loss/idx": 0.0,
"loss/logits": 0.021940922364592552,
"step": 695
},
{
"epoch": 0.005736090392872743,
"grad_norm": 4.125,
"grad_norm_var": 1.470417277018229,
"learning_rate": 5e-05,
"loss": 0.2243,
"loss/crossentropy": 2.6562836170196533,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1748046875,
"loss/idx": 0.0,
"loss/logits": 0.04948100447654724,
"step": 696
},
{
"epoch": 0.005744331902057905,
"grad_norm": 3.71875,
"grad_norm_var": 1.4347735087076823,
"learning_rate": 5e-05,
"loss": 0.1446,
"loss/crossentropy": 1.2862632274627686,
"loss/dist_ce": 0.0,
"loss/hidden": 0.125,
"loss/idx": 0.0,
"loss/logits": 0.019616402685642242,
"step": 697
},
{
"epoch": 0.005752573411243067,
"grad_norm": 2.53125,
"grad_norm_var": 1.4314735412597657,
"learning_rate": 5e-05,
"loss": 0.1774,
"loss/crossentropy": 2.1273880004882812,
"loss/dist_ce": 0.0,
"loss/hidden": 0.140625,
"loss/idx": 0.0,
"loss/logits": 0.036726418882608414,
"step": 698
},
{
"epoch": 0.005760814920428229,
"grad_norm": 1.7421875,
"grad_norm_var": 1.1817291259765625,
"learning_rate": 5e-05,
"loss": 0.1739,
"loss/crossentropy": 2.4990146160125732,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1357421875,
"loss/idx": 0.0,
"loss/logits": 0.03811194375157356,
"step": 699
},
{
"epoch": 0.005769056429613391,
"grad_norm": 4.15625,
"grad_norm_var": 1.30496826171875,
"learning_rate": 5e-05,
"loss": 0.252,
"loss/crossentropy": 2.5132620334625244,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2041015625,
"loss/idx": 0.0,
"loss/logits": 0.04793284088373184,
"step": 700
},
{
"epoch": 0.005777297938798553,
"grad_norm": 3.359375,
"grad_norm_var": 1.2397776285807292,
"learning_rate": 5e-05,
"loss": 0.2582,
"loss/crossentropy": 2.5892200469970703,
"loss/dist_ce": 0.0,
"loss/hidden": 0.203125,
"loss/idx": 0.0,
"loss/logits": 0.05511770024895668,
"step": 701
},
{
"epoch": 0.0057855394479837145,
"grad_norm": 3.328125,
"grad_norm_var": 1.0320393880208334,
"learning_rate": 5e-05,
"loss": 0.2048,
"loss/crossentropy": 1.6278858184814453,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1650390625,
"loss/idx": 0.0,
"loss/logits": 0.03974189609289169,
"step": 702
},
{
"epoch": 0.005793780957168876,
"grad_norm": 5.4375,
"grad_norm_var": 1.4668050130208334,
"learning_rate": 5e-05,
"loss": 0.2686,
"loss/crossentropy": 1.9843369722366333,
"loss/dist_ce": 0.0,
"loss/hidden": 0.220703125,
"loss/idx": 0.0,
"loss/logits": 0.047865502536296844,
"step": 703
},
{
"epoch": 0.005802022466354039,
"grad_norm": 1.1796875,
"grad_norm_var": 1.6136797587076823,
"learning_rate": 5e-05,
"loss": 0.1149,
"loss/crossentropy": 1.3343570232391357,
"loss/dist_ce": 0.0,
"loss/hidden": 0.095703125,
"loss/idx": 0.0,
"loss/logits": 0.019155774265527725,
"step": 704
},
{
"epoch": 0.005810263975539201,
"grad_norm": 2.78125,
"grad_norm_var": 1.5880999247233072,
"learning_rate": 5e-05,
"loss": 0.2146,
"loss/crossentropy": 1.3166966438293457,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1796875,
"loss/idx": 0.0,
"loss/logits": 0.03489375486969948,
"step": 705
},
{
"epoch": 0.005818505484724363,
"grad_norm": 3.109375,
"grad_norm_var": 1.568743642171224,
"learning_rate": 5e-05,
"loss": 0.2285,
"loss/crossentropy": 1.6788283586502075,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1845703125,
"loss/idx": 0.0,
"loss/logits": 0.043936047703027725,
"step": 706
},
{
"epoch": 0.005826746993909525,
"grad_norm": 3.3125,
"grad_norm_var": 1.4926389058430989,
"learning_rate": 5e-05,
"loss": 0.1728,
"loss/crossentropy": 2.161801815032959,
"loss/dist_ce": 0.0,
"loss/hidden": 0.138671875,
"loss/idx": 0.0,
"loss/logits": 0.034114591777324677,
"step": 707
},
{
"epoch": 0.005834988503094687,
"grad_norm": 3.015625,
"grad_norm_var": 1.4647112528483073,
"learning_rate": 5e-05,
"loss": 0.1577,
"loss/crossentropy": 2.5755579471588135,
"loss/dist_ce": 0.0,
"loss/hidden": 0.125,
"loss/idx": 0.0,
"loss/logits": 0.032723598182201385,
"step": 708
},
{
"epoch": 0.005843230012279849,
"grad_norm": 3.015625,
"grad_norm_var": 1.2495012919108073,
"learning_rate": 5e-05,
"loss": 0.2659,
"loss/crossentropy": 2.3093197345733643,
"loss/dist_ce": 0.0,
"loss/hidden": 0.21484375,
"loss/idx": 0.0,
"loss/logits": 0.05105920881032944,
"step": 709
},
{
"epoch": 0.0058514715214650105,
"grad_norm": 3.3125,
"grad_norm_var": 1.1517781575520833,
"learning_rate": 5e-05,
"loss": 0.2048,
"loss/crossentropy": 1.972427248954773,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1640625,
"loss/idx": 0.0,
"loss/logits": 0.04072684049606323,
"step": 710
},
{
"epoch": 0.005859713030650172,
"grad_norm": 1.375,
"grad_norm_var": 1.1445556640625,
"learning_rate": 5e-05,
"loss": 0.1302,
"loss/crossentropy": 1.6267015933990479,
"loss/dist_ce": 0.0,
"loss/hidden": 0.107421875,
"loss/idx": 0.0,
"loss/logits": 0.022783808410167694,
"step": 711
},
{
"epoch": 0.005867954539835334,
"grad_norm": 3.046875,
"grad_norm_var": 1.0689605712890624,
"learning_rate": 5e-05,
"loss": 0.1712,
"loss/crossentropy": 2.76011323928833,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1337890625,
"loss/idx": 0.0,
"loss/logits": 0.037429310381412506,
"step": 712
},
{
"epoch": 0.005876196049020497,
"grad_norm": 1.8125,
"grad_norm_var": 1.1200917561848958,
"learning_rate": 5e-05,
"loss": 0.1373,
"loss/crossentropy": 1.5537292957305908,
"loss/dist_ce": 0.0,
"loss/hidden": 0.11572265625,
"loss/idx": 0.0,
"loss/logits": 0.021540062502026558,
"step": 713
},
{
"epoch": 0.005884437558205659,
"grad_norm": 1.90625,
"grad_norm_var": 1.1758371988932292,
"learning_rate": 5e-05,
"loss": 0.1585,
"loss/crossentropy": 2.612987995147705,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1220703125,
"loss/idx": 0.0,
"loss/logits": 0.036412760615348816,
"step": 714
},
{
"epoch": 0.005892679067390821,
"grad_norm": 1.8671875,
"grad_norm_var": 1.1580474853515625,
"learning_rate": 5e-05,
"loss": 0.1586,
"loss/crossentropy": 2.5890583992004395,
"loss/dist_ce": 0.0,
"loss/hidden": 0.125,
"loss/idx": 0.0,
"loss/logits": 0.033603958785533905,
"step": 715
},
{
"epoch": 0.005900920576575983,
"grad_norm": 1.671875,
"grad_norm_var": 1.11971435546875,
"learning_rate": 5e-05,
"loss": 0.1545,
"loss/crossentropy": 2.612929344177246,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1181640625,
"loss/idx": 0.0,
"loss/logits": 0.0363757461309433,
"step": 716
},
{
"epoch": 0.005909162085761145,
"grad_norm": 3.078125,
"grad_norm_var": 1.1007080078125,
"learning_rate": 5e-05,
"loss": 0.1632,
"loss/crossentropy": 0.4507390856742859,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1416015625,
"loss/idx": 0.0,
"loss/logits": 0.021595872938632965,
"step": 717
},
{
"epoch": 0.0059174035949463065,
"grad_norm": 8.25,
"grad_norm_var": 3.0249176025390625,
"learning_rate": 5e-05,
"loss": 0.1985,
"loss/crossentropy": 0.8891280889511108,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1748046875,
"loss/idx": 0.0,
"loss/logits": 0.023697488009929657,
"step": 718
},
{
"epoch": 0.005925645104131468,
"grad_norm": 0.90625,
"grad_norm_var": 2.842015584309896,
"learning_rate": 5e-05,
"loss": 0.1113,
"loss/crossentropy": 1.2778152227401733,
"loss/dist_ce": 0.0,
"loss/hidden": 0.09423828125,
"loss/idx": 0.0,
"loss/logits": 0.017108086496591568,
"step": 719
},
{
"epoch": 0.00593388661331663,
"grad_norm": 3.625,
"grad_norm_var": 2.711073557535807,
"learning_rate": 5e-05,
"loss": 0.1612,
"loss/crossentropy": 2.4469552040100098,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1318359375,
"loss/idx": 0.0,
"loss/logits": 0.029361439868807793,
"step": 720
},
{
"epoch": 0.005942128122501792,
"grad_norm": 1.4453125,
"grad_norm_var": 2.8402750651041666,
"learning_rate": 5e-05,
"loss": 0.1502,
"loss/crossentropy": 2.1400833129882812,
"loss/dist_ce": 0.0,
"loss/hidden": 0.11669921875,
"loss/idx": 0.0,
"loss/logits": 0.033529091626405716,
"step": 721
},
{
"epoch": 0.005950369631686954,
"grad_norm": 2.109375,
"grad_norm_var": 2.8611083984375,
"learning_rate": 5e-05,
"loss": 0.1749,
"loss/crossentropy": 1.3967795372009277,
"loss/dist_ce": 0.0,
"loss/hidden": 0.142578125,
"loss/idx": 0.0,
"loss/logits": 0.03236193209886551,
"step": 722
},
{
"epoch": 0.005958611140872117,
"grad_norm": 2.15625,
"grad_norm_var": 2.855537923177083,
"learning_rate": 5e-05,
"loss": 0.2143,
"loss/crossentropy": 2.475792646408081,
"loss/dist_ce": 0.0,
"loss/hidden": 0.166015625,
"loss/idx": 0.0,
"loss/logits": 0.048240065574645996,
"step": 723
},
{
"epoch": 0.005966852650057279,
"grad_norm": 2.359375,
"grad_norm_var": 2.8515218098958335,
"learning_rate": 5e-05,
"loss": 0.1735,
"loss/crossentropy": 2.5644333362579346,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1328125,
"loss/idx": 0.0,
"loss/logits": 0.04069886356592178,
"step": 724
},
{
"epoch": 0.005975094159242441,
"grad_norm": 2.0625,
"grad_norm_var": 2.8581614176432293,
"learning_rate": 5e-05,
"loss": 0.1677,
"loss/crossentropy": 2.2065787315368652,
"loss/dist_ce": 0.0,
"loss/hidden": 0.130859375,
"loss/idx": 0.0,
"loss/logits": 0.0367942750453949,
"step": 725
},
{
"epoch": 0.0059833356684276025,
"grad_norm": 2.734375,
"grad_norm_var": 2.8211629231770834,
"learning_rate": 5e-05,
"loss": 0.2124,
"loss/crossentropy": 2.876722574234009,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1630859375,
"loss/idx": 0.0,
"loss/logits": 0.049362972378730774,
"step": 726
},
{
"epoch": 0.0059915771776127644,
"grad_norm": 8.0,
"grad_norm_var": 4.548148600260417,
"learning_rate": 5e-05,
"loss": 0.1838,
"loss/crossentropy": 1.2283939123153687,
"loss/dist_ce": 0.0,
"loss/hidden": 0.15234375,
"loss/idx": 0.0,
"loss/logits": 0.03141150623559952,
"step": 727
},
{
"epoch": 0.005999818686797926,
"grad_norm": 1.8671875,
"grad_norm_var": 4.618230946858724,
"learning_rate": 5e-05,
"loss": 0.1644,
"loss/crossentropy": 2.0428686141967773,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1328125,
"loss/idx": 0.0,
"loss/logits": 0.03161695599555969,
"step": 728
},
{
"epoch": 0.006008060195983088,
"grad_norm": 7.65625,
"grad_norm_var": 5.931933339436849,
"learning_rate": 5e-05,
"loss": 0.3075,
"loss/crossentropy": 2.5533790588378906,
"loss/dist_ce": 0.0,
"loss/hidden": 0.240234375,
"loss/idx": 0.0,
"loss/logits": 0.0672769546508789,
"step": 729
},
{
"epoch": 0.00601630170516825,
"grad_norm": 2.46875,
"grad_norm_var": 5.852355702718099,
"learning_rate": 5e-05,
"loss": 0.1774,
"loss/crossentropy": 1.7351176738739014,
"loss/dist_ce": 0.0,
"loss/hidden": 0.150390625,
"loss/idx": 0.0,
"loss/logits": 0.027032926678657532,
"step": 730
},
{
"epoch": 0.006024543214353412,
"grad_norm": 1.9453125,
"grad_norm_var": 5.838165028889974,
"learning_rate": 5e-05,
"loss": 0.1815,
"loss/crossentropy": 2.6725094318389893,
"loss/dist_ce": 0.0,
"loss/hidden": 0.142578125,
"loss/idx": 0.0,
"loss/logits": 0.038890693336725235,
"step": 731
},
{
"epoch": 0.006032784723538575,
"grad_norm": 2.28125,
"grad_norm_var": 5.731445058186849,
"learning_rate": 5e-05,
"loss": 0.1753,
"loss/crossentropy": 1.6290442943572998,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1435546875,
"loss/idx": 0.0,
"loss/logits": 0.031768545508384705,
"step": 732
},
{
"epoch": 0.006041026232723737,
"grad_norm": 5.59375,
"grad_norm_var": 6.049501291910807,
"learning_rate": 5e-05,
"loss": 0.2555,
"loss/crossentropy": 2.4887194633483887,
"loss/dist_ce": 0.0,
"loss/hidden": 0.205078125,
"loss/idx": 0.0,
"loss/logits": 0.05038648098707199,
"step": 733
},
{
"epoch": 0.0060492677419088986,
"grad_norm": 1.859375,
"grad_norm_var": 4.525903065999349,
"learning_rate": 5e-05,
"loss": 0.1672,
"loss/crossentropy": 1.4916491508483887,
"loss/dist_ce": 0.0,
"loss/hidden": 0.134765625,
"loss/idx": 0.0,
"loss/logits": 0.03243900462985039,
"step": 734
},
{
"epoch": 0.0060575092510940605,
"grad_norm": 6.28125,
"grad_norm_var": 4.783105214436849,
"learning_rate": 5e-05,
"loss": 0.2469,
"loss/crossentropy": 1.9904972314834595,
"loss/dist_ce": 0.0,
"loss/hidden": 0.193359375,
"loss/idx": 0.0,
"loss/logits": 0.05358533933758736,
"step": 735
},
{
"epoch": 0.006065750760279222,
"grad_norm": 4.90625,
"grad_norm_var": 4.923659006754558,
"learning_rate": 5e-05,
"loss": 0.3124,
"loss/crossentropy": 3.1595237255096436,
"loss/dist_ce": 0.0,
"loss/hidden": 0.240234375,
"loss/idx": 0.0,
"loss/logits": 0.07215666770935059,
"step": 736
},
{
"epoch": 0.006073992269464384,
"grad_norm": 2.296875,
"grad_norm_var": 4.737629191080729,
"learning_rate": 5e-05,
"loss": 0.1351,
"loss/crossentropy": 0.5982239842414856,
"loss/dist_ce": 0.0,
"loss/hidden": 0.11767578125,
"loss/idx": 0.0,
"loss/logits": 0.017444239929318428,
"step": 737
},
{
"epoch": 0.006082233778649546,
"grad_norm": 2.1875,
"grad_norm_var": 4.723148600260417,
"learning_rate": 5e-05,
"loss": 0.1614,
"loss/crossentropy": 1.076785922050476,
"loss/dist_ce": 0.0,
"loss/hidden": 0.13671875,
"loss/idx": 0.0,
"loss/logits": 0.02469494380056858,
"step": 738
},
{
"epoch": 0.006090475287834708,
"grad_norm": 2.609375,
"grad_norm_var": 4.6523183186848955,
"learning_rate": 5e-05,
"loss": 0.2057,
"loss/crossentropy": 2.2849977016448975,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1513671875,
"loss/idx": 0.0,
"loss/logits": 0.05430486798286438,
"step": 739
},
{
"epoch": 0.00609871679701987,
"grad_norm": 2.578125,
"grad_norm_var": 4.620018513997396,
"learning_rate": 5e-05,
"loss": 0.2088,
"loss/crossentropy": 1.5881438255310059,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1708984375,
"loss/idx": 0.0,
"loss/logits": 0.037879034876823425,
"step": 740
},
{
"epoch": 0.006106958306205033,
"grad_norm": 1.671875,
"grad_norm_var": 4.708748372395833,
"learning_rate": 5e-05,
"loss": 0.1337,
"loss/crossentropy": 1.5729397535324097,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1142578125,
"loss/idx": 0.0,
"loss/logits": 0.01942865364253521,
"step": 741
},
{
"epoch": 0.006115199815390195,
"grad_norm": 1.7578125,
"grad_norm_var": 4.875673166910807,
"learning_rate": 5e-05,
"loss": 0.1473,
"loss/crossentropy": 2.5460681915283203,
"loss/dist_ce": 0.0,
"loss/hidden": 0.11865234375,
"loss/idx": 0.0,
"loss/logits": 0.028684455901384354,
"step": 742
},
{
"epoch": 0.0061234413245753565,
"grad_norm": 3.484375,
"grad_norm_var": 3.4392575581868488,
"learning_rate": 5e-05,
"loss": 0.197,
"loss/crossentropy": 2.502234697341919,
"loss/dist_ce": 0.0,
"loss/hidden": 0.158203125,
"loss/idx": 0.0,
"loss/logits": 0.03875020891427994,
"step": 743
},
{
"epoch": 0.006131682833760518,
"grad_norm": 1.5625,
"grad_norm_var": 3.499828084309896,
"learning_rate": 5e-05,
"loss": 0.1167,
"loss/crossentropy": 0.5112316608428955,
"loss/dist_ce": 0.0,
"loss/hidden": 0.10546875,
"loss/idx": 0.0,
"loss/logits": 0.011273887008428574,
"step": 744
},
{
"epoch": 0.00613992434294568,
"grad_norm": 1.734375,
"grad_norm_var": 2.17010498046875,
"learning_rate": 5e-05,
"loss": 0.1642,
"loss/crossentropy": 1.5730743408203125,
"loss/dist_ce": 0.0,
"loss/hidden": 0.134765625,
"loss/idx": 0.0,
"loss/logits": 0.029403435066342354,
"step": 745
},
{
"epoch": 0.006148165852130842,
"grad_norm": 5.875,
"grad_norm_var": 2.7329345703125,
"learning_rate": 5e-05,
"loss": 0.2765,
"loss/crossentropy": 2.3317244052886963,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2236328125,
"loss/idx": 0.0,
"loss/logits": 0.05287738889455795,
"step": 746
},
{
"epoch": 0.006156407361316004,
"grad_norm": 4.65625,
"grad_norm_var": 2.7969134012858072,
"learning_rate": 5e-05,
"loss": 0.3948,
"loss/crossentropy": 2.6969289779663086,
"loss/dist_ce": 0.0,
"loss/hidden": 0.287109375,
"loss/idx": 0.0,
"loss/logits": 0.10772477090358734,
"step": 747
},
{
"epoch": 0.006164648870501166,
"grad_norm": 2.34375,
"grad_norm_var": 2.7894304911295573,
"learning_rate": 5e-05,
"loss": 0.1563,
"loss/crossentropy": 2.7726356983184814,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1240234375,
"loss/idx": 0.0,
"loss/logits": 0.03231953829526901,
"step": 748
},
{
"epoch": 0.006172890379686328,
"grad_norm": 2.3125,
"grad_norm_var": 2.4205034891764323,
"learning_rate": 5e-05,
"loss": 0.1669,
"loss/crossentropy": 2.8912436962127686,
"loss/dist_ce": 0.0,
"loss/hidden": 0.130859375,
"loss/idx": 0.0,
"loss/logits": 0.03605188801884651,
"step": 749
},
{
"epoch": 0.00618113188887149,
"grad_norm": 1.6796875,
"grad_norm_var": 2.4500244140625,
"learning_rate": 5e-05,
"loss": 0.1556,
"loss/crossentropy": 1.5787498950958252,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1318359375,
"loss/idx": 0.0,
"loss/logits": 0.02375878393650055,
"step": 750
},
{
"epoch": 0.0061893733980566525,
"grad_norm": 6.1875,
"grad_norm_var": 2.40950927734375,
"learning_rate": 5e-05,
"loss": 0.2622,
"loss/crossentropy": 1.5021111965179443,
"loss/dist_ce": 0.0,
"loss/hidden": 0.212890625,
"loss/idx": 0.0,
"loss/logits": 0.04928001016378403,
"step": 751
},
{
"epoch": 0.006197614907241814,
"grad_norm": 1.828125,
"grad_norm_var": 2.2153228759765624,
"learning_rate": 5e-05,
"loss": 0.1812,
"loss/crossentropy": 2.771916627883911,
"loss/dist_ce": 0.0,
"loss/hidden": 0.13671875,
"loss/idx": 0.0,
"loss/logits": 0.044479530304670334,
"step": 752
},
{
"epoch": 0.006205856416426976,
"grad_norm": 2.265625,
"grad_norm_var": 2.2174713134765627,
"learning_rate": 5e-05,
"loss": 0.1752,
"loss/crossentropy": 1.6487168073654175,
"loss/dist_ce": 0.0,
"loss/hidden": 0.142578125,
"loss/idx": 0.0,
"loss/logits": 0.03264114260673523,
"step": 753
},
{
"epoch": 0.006214097925612138,
"grad_norm": 2.90625,
"grad_norm_var": 2.191454060872396,
"learning_rate": 5e-05,
"loss": 0.2076,
"loss/crossentropy": 3.0564393997192383,
"loss/dist_ce": 0.0,
"loss/hidden": 0.158203125,
"loss/idx": 0.0,
"loss/logits": 0.04940104857087135,
"step": 754
},
{
"epoch": 0.0062223394347973,
"grad_norm": 1.7109375,
"grad_norm_var": 2.269628651936849,
"learning_rate": 5e-05,
"loss": 0.1733,
"loss/crossentropy": 2.259493112564087,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1337890625,
"loss/idx": 0.0,
"loss/logits": 0.03946584463119507,
"step": 755
},
{
"epoch": 0.006230580943982462,
"grad_norm": 1.8046875,
"grad_norm_var": 2.328316243489583,
"learning_rate": 5e-05,
"loss": 0.1575,
"loss/crossentropy": 2.8051905632019043,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1201171875,
"loss/idx": 0.0,
"loss/logits": 0.037370190024375916,
"step": 756
},
{
"epoch": 0.006238822453167624,
"grad_norm": 2.65625,
"grad_norm_var": 2.2491689046223957,
"learning_rate": 5e-05,
"loss": 0.193,
"loss/crossentropy": 2.3121681213378906,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1494140625,
"loss/idx": 0.0,
"loss/logits": 0.043607860803604126,
"step": 757
},
{
"epoch": 0.006247063962352786,
"grad_norm": 1.7265625,
"grad_norm_var": 2.2535634358723957,
"learning_rate": 5e-05,
"loss": 0.1707,
"loss/crossentropy": 2.660825490951538,
"loss/dist_ce": 0.0,
"loss/hidden": 0.130859375,
"loss/idx": 0.0,
"loss/logits": 0.03984237462282181,
"step": 758
},
{
"epoch": 0.006255305471537948,
"grad_norm": 2.46875,
"grad_norm_var": 2.224800618489583,
"learning_rate": 5e-05,
"loss": 0.1356,
"loss/crossentropy": 1.1409167051315308,
"loss/dist_ce": 0.0,
"loss/hidden": 0.11279296875,
"loss/idx": 0.0,
"loss/logits": 0.02284255623817444,
"step": 759
},
{
"epoch": 0.00626354698072311,
"grad_norm": 1.96875,
"grad_norm_var": 2.171744791666667,
"learning_rate": 5e-05,
"loss": 0.1432,
"loss/crossentropy": 1.4278790950775146,
"loss/dist_ce": 0.0,
"loss/hidden": 0.115234375,
"loss/idx": 0.0,
"loss/logits": 0.02792993187904358,
"step": 760
},
{
"epoch": 0.006271788489908272,
"grad_norm": 2.875,
"grad_norm_var": 2.0974110921223956,
"learning_rate": 5e-05,
"loss": 0.1644,
"loss/crossentropy": 1.693648099899292,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1357421875,
"loss/idx": 0.0,
"loss/logits": 0.028608174994587898,
"step": 761
},
{
"epoch": 0.006280029999093434,
"grad_norm": 1.4140625,
"grad_norm_var": 1.5294837951660156,
"learning_rate": 5e-05,
"loss": 0.1372,
"loss/crossentropy": 1.6515194177627563,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1171875,
"loss/idx": 0.0,
"loss/logits": 0.020010514184832573,
"step": 762
},
{
"epoch": 0.006288271508278596,
"grad_norm": 3.578125,
"grad_norm_var": 1.2993995666503906,
"learning_rate": 5e-05,
"loss": 0.228,
"loss/crossentropy": 2.8086752891540527,
"loss/dist_ce": 0.0,
"loss/hidden": 0.17578125,
"loss/idx": 0.0,
"loss/logits": 0.052195869386196136,
"step": 763
},
{
"epoch": 0.006296513017463758,
"grad_norm": 2.515625,
"grad_norm_var": 1.2980567932128906,
"learning_rate": 5e-05,
"loss": 0.1333,
"loss/crossentropy": 2.0333292484283447,
"loss/dist_ce": 0.0,
"loss/hidden": 0.109375,
"loss/idx": 0.0,
"loss/logits": 0.023911556228995323,
"step": 764
},
{
"epoch": 0.00630475452664892,
"grad_norm": 1.703125,
"grad_norm_var": 1.3359840393066407,
"learning_rate": 5e-05,
"loss": 0.1194,
"loss/crossentropy": 1.3590047359466553,
"loss/dist_ce": 0.0,
"loss/hidden": 0.10302734375,
"loss/idx": 0.0,
"loss/logits": 0.016341259703040123,
"step": 765
},
{
"epoch": 0.006312996035834082,
"grad_norm": 1.875,
"grad_norm_var": 1.3181630452473958,
"learning_rate": 5e-05,
"loss": 0.1641,
"loss/crossentropy": 2.541020631790161,
"loss/dist_ce": 0.0,
"loss/hidden": 0.12890625,
"loss/idx": 0.0,
"loss/logits": 0.03523167222738266,
"step": 766
},
{
"epoch": 0.006321237545019244,
"grad_norm": 2.15625,
"grad_norm_var": 0.3344960530598958,
"learning_rate": 5e-05,
"loss": 0.1642,
"loss/crossentropy": 3.1116065979003906,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1259765625,
"loss/idx": 0.0,
"loss/logits": 0.038257598876953125,
"step": 767
},
{
"epoch": 0.0063294790542044056,
"grad_norm": 3.984375,
"grad_norm_var": 0.5136220296223958,
"learning_rate": 5e-05,
"loss": 0.2127,
"loss/crossentropy": 0.4452613890171051,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1884765625,
"loss/idx": 0.0,
"loss/logits": 0.024267811328172684,
"step": 768
},
{
"epoch": 0.006337720563389568,
"grad_norm": 2.421875,
"grad_norm_var": 0.5133778889973958,
"learning_rate": 5e-05,
"loss": 0.1991,
"loss/crossentropy": 2.1372740268707275,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1572265625,
"loss/idx": 0.0,
"loss/logits": 0.04187183082103729,
"step": 769
},
{
"epoch": 0.00634596207257473,
"grad_norm": 2.1875,
"grad_norm_var": 0.4933502197265625,
"learning_rate": 5e-05,
"loss": 0.147,
"loss/crossentropy": 1.2034099102020264,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1220703125,
"loss/idx": 0.0,
"loss/logits": 0.024930456653237343,
"step": 770
},
{
"epoch": 0.006354203581759892,
"grad_norm": 1.640625,
"grad_norm_var": 0.4993263244628906,
"learning_rate": 5e-05,
"loss": 0.154,
"loss/crossentropy": 1.4210480451583862,
"loss/dist_ce": 0.0,
"loss/hidden": 0.125,
"loss/idx": 0.0,
"loss/logits": 0.0289776474237442,
"step": 771
},
{
"epoch": 0.006362445090945054,
"grad_norm": 2.90625,
"grad_norm_var": 0.5007965087890625,
"learning_rate": 5e-05,
"loss": 0.1915,
"loss/crossentropy": 2.461935520172119,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1494140625,
"loss/idx": 0.0,
"loss/logits": 0.042080432176589966,
"step": 772
},
{
"epoch": 0.006370686600130216,
"grad_norm": 3.984375,
"grad_norm_var": 0.6599812825520833,
"learning_rate": 5e-05,
"loss": 0.2646,
"loss/crossentropy": 2.870908260345459,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2041015625,
"loss/idx": 0.0,
"loss/logits": 0.06047297269105911,
"step": 773
},
{
"epoch": 0.006378928109315378,
"grad_norm": 2.015625,
"grad_norm_var": 0.6368242899576823,
"learning_rate": 5e-05,
"loss": 0.1692,
"loss/crossentropy": 2.4187848567962646,
"loss/dist_ce": 0.0,
"loss/hidden": 0.134765625,
"loss/idx": 0.0,
"loss/logits": 0.03446546941995621,
"step": 774
},
{
"epoch": 0.00638716961850054,
"grad_norm": 3.140625,
"grad_norm_var": 0.6639442443847656,
"learning_rate": 5e-05,
"loss": 0.2993,
"loss/crossentropy": 2.4693610668182373,
"loss/dist_ce": 0.0,
"loss/hidden": 0.21484375,
"loss/idx": 0.0,
"loss/logits": 0.08449941873550415,
"step": 775
},
{
"epoch": 0.006395411127685702,
"grad_norm": 1.7578125,
"grad_norm_var": 0.68231201171875,
"learning_rate": 5e-05,
"loss": 0.1579,
"loss/crossentropy": 2.635270833969116,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1259765625,
"loss/idx": 0.0,
"loss/logits": 0.031969744712114334,
"step": 776
},
{
"epoch": 0.0064036526368708635,
"grad_norm": 1.203125,
"grad_norm_var": 0.7755930582682292,
"learning_rate": 5e-05,
"loss": 0.1124,
"loss/crossentropy": 1.4390558004379272,
"loss/dist_ce": 0.0,
"loss/hidden": 0.0966796875,
"loss/idx": 0.0,
"loss/logits": 0.015761706978082657,
"step": 777
},
{
"epoch": 0.006411894146056025,
"grad_norm": 4.09375,
"grad_norm_var": 0.8702369689941406,
"learning_rate": 5e-05,
"loss": 0.3235,
"loss/crossentropy": 1.998376727104187,
"loss/dist_ce": 0.0,
"loss/hidden": 0.263671875,
"loss/idx": 0.0,
"loss/logits": 0.05982797592878342,
"step": 778
},
{
"epoch": 0.006420135655241188,
"grad_norm": 2.015625,
"grad_norm_var": 0.813372548421224,
"learning_rate": 5e-05,
"loss": 0.2049,
"loss/crossentropy": 1.550020694732666,
"loss/dist_ce": 0.0,
"loss/hidden": 0.16015625,
"loss/idx": 0.0,
"loss/logits": 0.04472289979457855,
"step": 779
},
{
"epoch": 0.00642837716442635,
"grad_norm": 1.2578125,
"grad_norm_var": 0.90545654296875,
"learning_rate": 5e-05,
"loss": 0.1262,
"loss/crossentropy": 2.6197104454040527,
"loss/dist_ce": 0.0,
"loss/hidden": 0.099609375,
"loss/idx": 0.0,
"loss/logits": 0.02659156545996666,
"step": 780
},
{
"epoch": 0.006436618673611512,
"grad_norm": 2.25,
"grad_norm_var": 0.8735911051432291,
"learning_rate": 5e-05,
"loss": 0.1781,
"loss/crossentropy": 2.597325563430786,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1328125,
"loss/idx": 0.0,
"loss/logits": 0.045299261808395386,
"step": 781
},
{
"epoch": 0.006444860182796674,
"grad_norm": 2.09375,
"grad_norm_var": 0.8603749593098958,
"learning_rate": 5e-05,
"loss": 0.1826,
"loss/crossentropy": 1.9669688940048218,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1455078125,
"loss/idx": 0.0,
"loss/logits": 0.03709650784730911,
"step": 782
},
{
"epoch": 0.006453101691981836,
"grad_norm": 1.703125,
"grad_norm_var": 0.89061279296875,
"learning_rate": 5e-05,
"loss": 0.1355,
"loss/crossentropy": 1.5600085258483887,
"loss/dist_ce": 0.0,
"loss/hidden": 0.111328125,
"loss/idx": 0.0,
"loss/logits": 0.02414374053478241,
"step": 783
},
{
"epoch": 0.006461343201166998,
"grad_norm": 3.65625,
"grad_norm_var": 0.8287261962890625,
"learning_rate": 5e-05,
"loss": 0.2038,
"loss/crossentropy": 2.052290678024292,
"loss/dist_ce": 0.0,
"loss/hidden": 0.169921875,
"loss/idx": 0.0,
"loss/logits": 0.033876098692417145,
"step": 784
},
{
"epoch": 0.0064695847103521595,
"grad_norm": 1.78125,
"grad_norm_var": 0.8521240234375,
"learning_rate": 5e-05,
"loss": 0.1581,
"loss/crossentropy": 0.9798577427864075,
"loss/dist_ce": 0.0,
"loss/hidden": 0.134765625,
"loss/idx": 0.0,
"loss/logits": 0.023312989622354507,
"step": 785
},
{
"epoch": 0.006477826219537321,
"grad_norm": 2.296875,
"grad_norm_var": 0.8504221598307292,
"learning_rate": 5e-05,
"loss": 0.147,
"loss/crossentropy": 2.4323980808258057,
"loss/dist_ce": 0.0,
"loss/hidden": 0.11865234375,
"loss/idx": 0.0,
"loss/logits": 0.028353385627269745,
"step": 786
},
{
"epoch": 0.006486067728722483,
"grad_norm": 3.484375,
"grad_norm_var": 0.8854726155598959,
"learning_rate": 5e-05,
"loss": 0.2295,
"loss/crossentropy": 2.9406378269195557,
"loss/dist_ce": 0.0,
"loss/hidden": 0.17578125,
"loss/idx": 0.0,
"loss/logits": 0.05376865714788437,
"step": 787
},
{
"epoch": 0.006494309237907646,
"grad_norm": 1.71875,
"grad_norm_var": 0.9057281494140625,
"learning_rate": 5e-05,
"loss": 0.1408,
"loss/crossentropy": 2.3078272342681885,
"loss/dist_ce": 0.0,
"loss/hidden": 0.111328125,
"loss/idx": 0.0,
"loss/logits": 0.029444556683301926,
"step": 788
},
{
"epoch": 0.006502550747092808,
"grad_norm": 1.125,
"grad_norm_var": 0.81395263671875,
"learning_rate": 5e-05,
"loss": 0.1168,
"loss/crossentropy": 1.5063586235046387,
"loss/dist_ce": 0.0,
"loss/hidden": 0.095703125,
"loss/idx": 0.0,
"loss/logits": 0.021064041182398796,
"step": 789
},
{
"epoch": 0.00651079225627797,
"grad_norm": 2.09375,
"grad_norm_var": 0.8121571858723958,
"learning_rate": 5e-05,
"loss": 0.1659,
"loss/crossentropy": 2.5850398540496826,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1337890625,
"loss/idx": 0.0,
"loss/logits": 0.03208760544657707,
"step": 790
},
{
"epoch": 0.006519033765463132,
"grad_norm": 3.03125,
"grad_norm_var": 0.7996175130208333,
"learning_rate": 5e-05,
"loss": 0.1852,
"loss/crossentropy": 1.5392999649047852,
"loss/dist_ce": 0.0,
"loss/hidden": 0.146484375,
"loss/idx": 0.0,
"loss/logits": 0.038750022649765015,
"step": 791
},
{
"epoch": 0.006527275274648294,
"grad_norm": 1.7890625,
"grad_norm_var": 0.79774169921875,
"learning_rate": 5e-05,
"loss": 0.1902,
"loss/crossentropy": 2.854396343231201,
"loss/dist_ce": 0.0,
"loss/hidden": 0.146484375,
"loss/idx": 0.0,
"loss/logits": 0.043708011507987976,
"step": 792
},
{
"epoch": 0.0065355167838334555,
"grad_norm": 3.671875,
"grad_norm_var": 0.8424235026041667,
"learning_rate": 5e-05,
"loss": 0.2264,
"loss/crossentropy": 2.2643284797668457,
"loss/dist_ce": 0.0,
"loss/hidden": 0.18359375,
"loss/idx": 0.0,
"loss/logits": 0.04277587682008743,
"step": 793
},
{
"epoch": 0.006543758293018617,
"grad_norm": 2.96875,
"grad_norm_var": 0.6642985026041667,
"learning_rate": 5e-05,
"loss": 0.161,
"loss/crossentropy": 2.5523831844329834,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1279296875,
"loss/idx": 0.0,
"loss/logits": 0.0331122949719429,
"step": 794
},
{
"epoch": 0.006551999802203779,
"grad_norm": 2.328125,
"grad_norm_var": 0.6581949869791667,
"learning_rate": 5e-05,
"loss": 0.177,
"loss/crossentropy": 3.067322015762329,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1357421875,
"loss/idx": 0.0,
"loss/logits": 0.041216038167476654,
"step": 795
},
{
"epoch": 0.006560241311388941,
"grad_norm": 1.171875,
"grad_norm_var": 0.6709205627441406,
"learning_rate": 5e-05,
"loss": 0.1258,
"loss/crossentropy": 1.477942943572998,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1044921875,
"loss/idx": 0.0,
"loss/logits": 0.021268734708428383,
"step": 796
},
{
"epoch": 0.006568482820574104,
"grad_norm": 7.375,
"grad_norm_var": 2.2628069559733075,
"learning_rate": 5e-05,
"loss": 0.2407,
"loss/crossentropy": 2.4961607456207275,
"loss/dist_ce": 0.0,
"loss/hidden": 0.19140625,
"loss/idx": 0.0,
"loss/logits": 0.049283482134342194,
"step": 797
},
{
"epoch": 0.006576724329759266,
"grad_norm": 2.828125,
"grad_norm_var": 2.2427263895670575,
"learning_rate": 5e-05,
"loss": 0.2416,
"loss/crossentropy": 2.435840129852295,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1826171875,
"loss/idx": 0.0,
"loss/logits": 0.05896880850195885,
"step": 798
},
{
"epoch": 0.006584965838944428,
"grad_norm": 1.5703125,
"grad_norm_var": 2.261286417643229,
"learning_rate": 5e-05,
"loss": 0.147,
"loss/crossentropy": 1.4684247970581055,
"loss/dist_ce": 0.0,
"loss/hidden": 0.119140625,
"loss/idx": 0.0,
"loss/logits": 0.027891069650650024,
"step": 799
},
{
"epoch": 0.00659320734812959,
"grad_norm": 1.296875,
"grad_norm_var": 2.302298990885417,
"learning_rate": 5e-05,
"loss": 0.122,
"loss/crossentropy": 1.3427636623382568,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1015625,
"loss/idx": 0.0,
"loss/logits": 0.020389681681990623,
"step": 800
},
{
"epoch": 0.0066014488573147515,
"grad_norm": 2.46875,
"grad_norm_var": 2.2629109700520833,
"learning_rate": 5e-05,
"loss": 0.0932,
"loss/crossentropy": 0.27775296568870544,
"loss/dist_ce": 0.0,
"loss/hidden": 0.08642578125,
"loss/idx": 0.0,
"loss/logits": 0.006820976734161377,
"step": 801
},
{
"epoch": 0.006609690366499913,
"grad_norm": 1.4140625,
"grad_norm_var": 2.344496409098307,
"learning_rate": 5e-05,
"loss": 0.1262,
"loss/crossentropy": 0.8798704147338867,
"loss/dist_ce": 0.0,
"loss/hidden": 0.10791015625,
"loss/idx": 0.0,
"loss/logits": 0.018274936825037003,
"step": 802
},
{
"epoch": 0.006617931875685075,
"grad_norm": 1.625,
"grad_norm_var": 2.321738433837891,
"learning_rate": 5e-05,
"loss": 0.1389,
"loss/crossentropy": 2.3500454425811768,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1103515625,
"loss/idx": 0.0,
"loss/logits": 0.02855532616376877,
"step": 803
},
{
"epoch": 0.006626173384870237,
"grad_norm": 3.21875,
"grad_norm_var": 2.325156402587891,
"learning_rate": 5e-05,
"loss": 0.2325,
"loss/crossentropy": 2.58161997795105,
"loss/dist_ce": 0.0,
"loss/hidden": 0.177734375,
"loss/idx": 0.0,
"loss/logits": 0.05475683510303497,
"step": 804
},
{
"epoch": 0.006634414894055399,
"grad_norm": 2.265625,
"grad_norm_var": 2.1975786844889322,
"learning_rate": 5e-05,
"loss": 0.2116,
"loss/crossentropy": 2.6687753200531006,
"loss/dist_ce": 0.0,
"loss/hidden": 0.16015625,
"loss/idx": 0.0,
"loss/logits": 0.051458559930324554,
"step": 805
},
{
"epoch": 0.006642656403240561,
"grad_norm": 3.078125,
"grad_norm_var": 2.1956560770670572,
"learning_rate": 5e-05,
"loss": 0.1836,
"loss/crossentropy": 2.341993808746338,
"loss/dist_ce": 0.0,
"loss/hidden": 0.150390625,
"loss/idx": 0.0,
"loss/logits": 0.033185191452503204,
"step": 806
},
{
"epoch": 0.006650897912425724,
"grad_norm": 1.8515625,
"grad_norm_var": 2.2197336832682293,
"learning_rate": 5e-05,
"loss": 0.1828,
"loss/crossentropy": 2.465346336364746,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1416015625,
"loss/idx": 0.0,
"loss/logits": 0.041189346462488174,
"step": 807
},
{
"epoch": 0.006659139421610886,
"grad_norm": 1.09375,
"grad_norm_var": 2.3212013244628906,
"learning_rate": 5e-05,
"loss": 0.1117,
"loss/crossentropy": 1.3634965419769287,
"loss/dist_ce": 0.0,
"loss/hidden": 0.0927734375,
"loss/idx": 0.0,
"loss/logits": 0.01888800971210003,
"step": 808
},
{
"epoch": 0.0066673809307960475,
"grad_norm": 1.046875,
"grad_norm_var": 2.3466651916503904,
"learning_rate": 5e-05,
"loss": 0.095,
"loss/crossentropy": 1.5679908990859985,
"loss/dist_ce": 0.0,
"loss/hidden": 0.08251953125,
"loss/idx": 0.0,
"loss/logits": 0.012483851984143257,
"step": 809
},
{
"epoch": 0.006675622439981209,
"grad_norm": 2.046875,
"grad_norm_var": 2.3237383524576822,
"learning_rate": 5e-05,
"loss": 0.1209,
"loss/crossentropy": 1.3377238512039185,
"loss/dist_ce": 0.0,
"loss/hidden": 0.10107421875,
"loss/idx": 0.0,
"loss/logits": 0.01983119174838066,
"step": 810
},
{
"epoch": 0.006683863949166371,
"grad_norm": 2.875,
"grad_norm_var": 2.3450294494628907,
"learning_rate": 5e-05,
"loss": 0.2099,
"loss/crossentropy": 1.367640495300293,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1826171875,
"loss/idx": 0.0,
"loss/logits": 0.027279119938611984,
"step": 811
},
{
"epoch": 0.006692105458351533,
"grad_norm": 2.453125,
"grad_norm_var": 2.2503537495930988,
"learning_rate": 5e-05,
"loss": 0.2034,
"loss/crossentropy": 2.2119548320770264,
"loss/dist_ce": 0.0,
"loss/hidden": 0.158203125,
"loss/idx": 0.0,
"loss/logits": 0.045244939625263214,
"step": 812
},
{
"epoch": 0.006700346967536695,
"grad_norm": 2.140625,
"grad_norm_var": 0.4953386942545573,
"learning_rate": 5e-05,
"loss": 0.1415,
"loss/crossentropy": 1.21071457862854,
"loss/dist_ce": 0.0,
"loss/hidden": 0.12109375,
"loss/idx": 0.0,
"loss/logits": 0.020365629345178604,
"step": 813
},
{
"epoch": 0.006708588476721857,
"grad_norm": 3.8125,
"grad_norm_var": 0.6541460673014323,
"learning_rate": 5e-05,
"loss": 0.2487,
"loss/crossentropy": 2.8512344360351562,
"loss/dist_ce": 0.0,
"loss/hidden": 0.193359375,
"loss/idx": 0.0,
"loss/logits": 0.055374931544065475,
"step": 814
},
{
"epoch": 0.006716829985907019,
"grad_norm": 2.671875,
"grad_norm_var": 0.6461496988932292,
"learning_rate": 5e-05,
"loss": 0.1403,
"loss/crossentropy": 0.623081624507904,
"loss/dist_ce": 0.0,
"loss/hidden": 0.12158203125,
"loss/idx": 0.0,
"loss/logits": 0.01867399737238884,
"step": 815
},
{
"epoch": 0.006725071495092182,
"grad_norm": 1.2421875,
"grad_norm_var": 0.6529945373535156,
"learning_rate": 5e-05,
"loss": 0.1044,
"loss/crossentropy": 0.35599735379219055,
"loss/dist_ce": 0.0,
"loss/hidden": 0.0966796875,
"loss/idx": 0.0,
"loss/logits": 0.007739436347037554,
"step": 816
},
{
"epoch": 0.0067333130042773436,
"grad_norm": 2.0625,
"grad_norm_var": 0.6491065979003906,
"learning_rate": 5e-05,
"loss": 0.1802,
"loss/crossentropy": 2.3499081134796143,
"loss/dist_ce": 0.0,
"loss/hidden": 0.140625,
"loss/idx": 0.0,
"loss/logits": 0.039534226059913635,
"step": 817
},
{
"epoch": 0.0067415545134625054,
"grad_norm": 2.953125,
"grad_norm_var": 0.6397379557291667,
"learning_rate": 5e-05,
"loss": 0.1885,
"loss/crossentropy": 1.8850847482681274,
"loss/dist_ce": 0.0,
"loss/hidden": 0.16015625,
"loss/idx": 0.0,
"loss/logits": 0.02836015075445175,
"step": 818
},
{
"epoch": 0.006749796022647667,
"grad_norm": 2.703125,
"grad_norm_var": 0.6186106363932292,
"learning_rate": 5e-05,
"loss": 0.216,
"loss/crossentropy": 1.4330132007598877,
"loss/dist_ce": 0.0,
"loss/hidden": 0.171875,
"loss/idx": 0.0,
"loss/logits": 0.04408085718750954,
"step": 819
},
{
"epoch": 0.006758037531832829,
"grad_norm": 3.296875,
"grad_norm_var": 0.6280965169270833,
"learning_rate": 5e-05,
"loss": 0.2694,
"loss/crossentropy": 1.9407634735107422,
"loss/dist_ce": 0.0,
"loss/hidden": 0.20703125,
"loss/idx": 0.0,
"loss/logits": 0.062371157109737396,
"step": 820
},
{
"epoch": 0.006766279041017991,
"grad_norm": 1.53125,
"grad_norm_var": 0.6700266520182292,
"learning_rate": 5e-05,
"loss": 0.1509,
"loss/crossentropy": 1.891184687614441,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1181640625,
"loss/idx": 0.0,
"loss/logits": 0.032700151205062866,
"step": 821
},
{
"epoch": 0.006774520550203153,
"grad_norm": 2.921875,
"grad_norm_var": 0.6554189046223958,
"learning_rate": 5e-05,
"loss": 0.212,
"loss/crossentropy": 2.192823648452759,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1650390625,
"loss/idx": 0.0,
"loss/logits": 0.04695805162191391,
"step": 822
},
{
"epoch": 0.006782762059388315,
"grad_norm": 1.2734375,
"grad_norm_var": 0.7104085286458334,
"learning_rate": 5e-05,
"loss": 0.1329,
"loss/crossentropy": 1.4268293380737305,
"loss/dist_ce": 0.0,
"loss/hidden": 0.109375,
"loss/idx": 0.0,
"loss/logits": 0.023486068472266197,
"step": 823
},
{
"epoch": 0.006791003568573477,
"grad_norm": 5.03125,
"grad_norm_var": 1.0682698567708333,
"learning_rate": 5e-05,
"loss": 0.4061,
"loss/crossentropy": 2.2408840656280518,
"loss/dist_ce": 0.0,
"loss/hidden": 0.30078125,
"loss/idx": 0.0,
"loss/logits": 0.10528542101383209,
"step": 824
},
{
"epoch": 0.00679924507775864,
"grad_norm": 1.6328125,
"grad_norm_var": 0.9758969624837239,
"learning_rate": 5e-05,
"loss": 0.1375,
"loss/crossentropy": 2.6388261318206787,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1083984375,
"loss/idx": 0.0,
"loss/logits": 0.029141269624233246,
"step": 825
},
{
"epoch": 0.0068074865869438015,
"grad_norm": 1.328125,
"grad_norm_var": 1.055492909749349,
"learning_rate": 5e-05,
"loss": 0.1284,
"loss/crossentropy": 1.3783127069473267,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1103515625,
"loss/idx": 0.0,
"loss/logits": 0.01809128187596798,
"step": 826
},
{
"epoch": 0.006815728096128963,
"grad_norm": 1.140625,
"grad_norm_var": 1.155761464436849,
"learning_rate": 5e-05,
"loss": 0.1161,
"loss/crossentropy": 1.5458354949951172,
"loss/dist_ce": 0.0,
"loss/hidden": 0.0966796875,
"loss/idx": 0.0,
"loss/logits": 0.01938330940902233,
"step": 827
},
{
"epoch": 0.006823969605314125,
"grad_norm": 3.234375,
"grad_norm_var": 1.200774892171224,
"learning_rate": 5e-05,
"loss": 0.2329,
"loss/crossentropy": 1.8702012300491333,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1875,
"loss/idx": 0.0,
"loss/logits": 0.045410916209220886,
"step": 828
},
{
"epoch": 0.006832211114499287,
"grad_norm": 5.4375,
"grad_norm_var": 1.7502540588378905,
"learning_rate": 5e-05,
"loss": 0.3512,
"loss/crossentropy": 2.7246875762939453,
"loss/dist_ce": 0.0,
"loss/hidden": 0.28125,
"loss/idx": 0.0,
"loss/logits": 0.0699634999036789,
"step": 829
},
{
"epoch": 0.006840452623684449,
"grad_norm": 2.265625,
"grad_norm_var": 1.6584083557128906,
"learning_rate": 5e-05,
"loss": 0.1294,
"loss/crossentropy": 1.506727695465088,
"loss/dist_ce": 0.0,
"loss/hidden": 0.10791015625,
"loss/idx": 0.0,
"loss/logits": 0.021486353129148483,
"step": 830
},
{
"epoch": 0.006848694132869611,
"grad_norm": 2.25,
"grad_norm_var": 1.6624183654785156,
"learning_rate": 5e-05,
"loss": 0.1725,
"loss/crossentropy": 2.507237434387207,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1357421875,
"loss/idx": 0.0,
"loss/logits": 0.03672666847705841,
"step": 831
},
{
"epoch": 0.006856935642054773,
"grad_norm": 1.96875,
"grad_norm_var": 1.57171630859375,
"learning_rate": 5e-05,
"loss": 0.1601,
"loss/crossentropy": 1.883087158203125,
"loss/dist_ce": 0.0,
"loss/hidden": 0.13671875,
"loss/idx": 0.0,
"loss/logits": 0.023375539109110832,
"step": 832
},
{
"epoch": 0.006865177151239935,
"grad_norm": 1.296875,
"grad_norm_var": 1.6595937093098958,
"learning_rate": 5e-05,
"loss": 0.1151,
"loss/crossentropy": 2.2904350757598877,
"loss/dist_ce": 0.0,
"loss/hidden": 0.09375,
"loss/idx": 0.0,
"loss/logits": 0.02137480303645134,
"step": 833
},
{
"epoch": 0.006873418660425097,
"grad_norm": 1.546875,
"grad_norm_var": 1.7013417561848958,
"learning_rate": 5e-05,
"loss": 0.1588,
"loss/crossentropy": 2.104323625564575,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1279296875,
"loss/idx": 0.0,
"loss/logits": 0.03088623285293579,
"step": 834
},
{
"epoch": 0.006881660169610259,
"grad_norm": 1.40625,
"grad_norm_var": 1.7590087890625,
"learning_rate": 5e-05,
"loss": 0.1215,
"loss/crossentropy": 1.4084559679031372,
"loss/dist_ce": 0.0,
"loss/hidden": 0.103515625,
"loss/idx": 0.0,
"loss/logits": 0.017994871363043785,
"step": 835
},
{
"epoch": 0.006889901678795421,
"grad_norm": 1.5,
"grad_norm_var": 1.7333892822265624,
"learning_rate": 5e-05,
"loss": 0.1111,
"loss/crossentropy": 1.1881444454193115,
"loss/dist_ce": 0.0,
"loss/hidden": 0.095703125,
"loss/idx": 0.0,
"loss/logits": 0.01541186310350895,
"step": 836
},
{
"epoch": 0.006898143187980583,
"grad_norm": 3.4375,
"grad_norm_var": 1.7815419514973958,
"learning_rate": 5e-05,
"loss": 0.1738,
"loss/crossentropy": 2.4373316764831543,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1357421875,
"loss/idx": 0.0,
"loss/logits": 0.038025081157684326,
"step": 837
},
{
"epoch": 0.006906384697165745,
"grad_norm": 1.34375,
"grad_norm_var": 1.81781005859375,
"learning_rate": 5e-05,
"loss": 0.1448,
"loss/crossentropy": 2.4372975826263428,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1142578125,
"loss/idx": 0.0,
"loss/logits": 0.030569259077310562,
"step": 838
},
{
"epoch": 0.006914626206350907,
"grad_norm": 2.171875,
"grad_norm_var": 1.7505734761555989,
"learning_rate": 5e-05,
"loss": 0.1729,
"loss/crossentropy": 1.551772117614746,
"loss/dist_ce": 0.0,
"loss/hidden": 0.14453125,
"loss/idx": 0.0,
"loss/logits": 0.028406476601958275,
"step": 839
},
{
"epoch": 0.006922867715536069,
"grad_norm": 1.78125,
"grad_norm_var": 1.2323931376139323,
"learning_rate": 5e-05,
"loss": 0.1473,
"loss/crossentropy": 2.1337997913360596,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1142578125,
"loss/idx": 0.0,
"loss/logits": 0.033026814460754395,
"step": 840
},
{
"epoch": 0.006931109224721231,
"grad_norm": 2.453125,
"grad_norm_var": 1.2223795572916667,
"learning_rate": 5e-05,
"loss": 0.1839,
"loss/crossentropy": 2.638575792312622,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1416015625,
"loss/idx": 0.0,
"loss/logits": 0.04230286926031113,
"step": 841
},
{
"epoch": 0.006939350733906393,
"grad_norm": 2.390625,
"grad_norm_var": 1.1750651041666667,
"learning_rate": 5e-05,
"loss": 0.2205,
"loss/crossentropy": 2.267352342605591,
"loss/dist_ce": 0.0,
"loss/hidden": 0.169921875,
"loss/idx": 0.0,
"loss/logits": 0.05056004598736763,
"step": 842
},
{
"epoch": 0.0069475922430915545,
"grad_norm": 4.34375,
"grad_norm_var": 1.3525299072265624,
"learning_rate": 5e-05,
"loss": 0.3381,
"loss/crossentropy": 2.6636710166931152,
"loss/dist_ce": 0.0,
"loss/hidden": 0.248046875,
"loss/idx": 0.0,
"loss/logits": 0.09001342952251434,
"step": 843
},
{
"epoch": 0.006955833752276717,
"grad_norm": 3.109375,
"grad_norm_var": 1.340046183268229,
"learning_rate": 5e-05,
"loss": 0.1769,
"loss/crossentropy": 1.5471034049987793,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1484375,
"loss/idx": 0.0,
"loss/logits": 0.02849499136209488,
"step": 844
},
{
"epoch": 0.006964075261461879,
"grad_norm": 2.546875,
"grad_norm_var": 0.698876953125,
"learning_rate": 5e-05,
"loss": 0.1365,
"loss/crossentropy": 2.5193700790405273,
"loss/dist_ce": 0.0,
"loss/hidden": 0.107421875,
"loss/idx": 0.0,
"loss/logits": 0.029105795547366142,
"step": 845
},
{
"epoch": 0.006972316770647041,
"grad_norm": 2.28125,
"grad_norm_var": 0.6989491780598959,
"learning_rate": 5e-05,
"loss": 0.1387,
"loss/crossentropy": 1.5674251317977905,
"loss/dist_ce": 0.0,
"loss/hidden": 0.11376953125,
"loss/idx": 0.0,
"loss/logits": 0.024912243708968163,
"step": 846
},
{
"epoch": 0.006980558279832203,
"grad_norm": 3.28125,
"grad_norm_var": 0.7668935139973958,
"learning_rate": 5e-05,
"loss": 0.1924,
"loss/crossentropy": 1.3732967376708984,
"loss/dist_ce": 0.0,
"loss/hidden": 0.15625,
"loss/idx": 0.0,
"loss/logits": 0.036177463829517365,
"step": 847
},
{
"epoch": 0.006988799789017365,
"grad_norm": 4.0625,
"grad_norm_var": 0.9473704020182292,
"learning_rate": 5e-05,
"loss": 0.2466,
"loss/crossentropy": 1.4407217502593994,
"loss/dist_ce": 0.0,
"loss/hidden": 0.19921875,
"loss/idx": 0.0,
"loss/logits": 0.04741185903549194,
"step": 848
},
{
"epoch": 0.006997041298202527,
"grad_norm": 1.5078125,
"grad_norm_var": 0.9181536356608073,
"learning_rate": 5e-05,
"loss": 0.1338,
"loss/crossentropy": 2.713207721710205,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1064453125,
"loss/idx": 0.0,
"loss/logits": 0.027391444891691208,
"step": 849
},
{
"epoch": 0.007005282807387689,
"grad_norm": 2.5,
"grad_norm_var": 0.8604448954264323,
"learning_rate": 5e-05,
"loss": 0.2414,
"loss/crossentropy": 1.8647682666778564,
"loss/dist_ce": 0.0,
"loss/hidden": 0.203125,
"loss/idx": 0.0,
"loss/logits": 0.038255929946899414,
"step": 850
},
{
"epoch": 0.0070135243165728505,
"grad_norm": 3.640625,
"grad_norm_var": 0.8444435119628906,
"learning_rate": 5e-05,
"loss": 0.1953,
"loss/crossentropy": 3.0968027114868164,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1513671875,
"loss/idx": 0.0,
"loss/logits": 0.04395551607012749,
"step": 851
},
{
"epoch": 0.0070217658257580124,
"grad_norm": 1.1015625,
"grad_norm_var": 0.9152984619140625,
"learning_rate": 5e-05,
"loss": 0.1179,
"loss/crossentropy": 1.364888072013855,
"loss/dist_ce": 0.0,
"loss/hidden": 0.09765625,
"loss/idx": 0.0,
"loss/logits": 0.020214572548866272,
"step": 852
},
{
"epoch": 0.007030007334943175,
"grad_norm": 4.03125,
"grad_norm_var": 1.0018870035807292,
"learning_rate": 5e-05,
"loss": 0.2262,
"loss/crossentropy": 1.5738481283187866,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1923828125,
"loss/idx": 0.0,
"loss/logits": 0.03383617848157883,
"step": 853
},
{
"epoch": 0.007038248844128337,
"grad_norm": 2.375,
"grad_norm_var": 0.8874827067057292,
"learning_rate": 5e-05,
"loss": 0.147,
"loss/crossentropy": 1.7854039669036865,
"loss/dist_ce": 0.0,
"loss/hidden": 0.125,
"loss/idx": 0.0,
"loss/logits": 0.02201111428439617,
"step": 854
},
{
"epoch": 0.007046490353313499,
"grad_norm": 1.078125,
"grad_norm_var": 1.0427154541015624,
"learning_rate": 5e-05,
"loss": 0.1042,
"loss/crossentropy": 0.8679842352867126,
"loss/dist_ce": 0.0,
"loss/hidden": 0.0908203125,
"loss/idx": 0.0,
"loss/logits": 0.013427493162453175,
"step": 855
},
{
"epoch": 0.007054731862498661,
"grad_norm": 2.0,
"grad_norm_var": 1.020213826497396,
"learning_rate": 5e-05,
"loss": 0.1693,
"loss/crossentropy": 1.9278184175491333,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1328125,
"loss/idx": 0.0,
"loss/logits": 0.03644675761461258,
"step": 856
},
{
"epoch": 0.007062973371683823,
"grad_norm": 2.265625,
"grad_norm_var": 1.0278065999348958,
"learning_rate": 5e-05,
"loss": 0.1806,
"loss/crossentropy": 1.801297664642334,
"loss/dist_ce": 0.0,
"loss/hidden": 0.142578125,
"loss/idx": 0.0,
"loss/logits": 0.03802645206451416,
"step": 857
},
{
"epoch": 0.007071214880868985,
"grad_norm": 2.390625,
"grad_norm_var": 1.0278065999348958,
"learning_rate": 5e-05,
"loss": 0.1861,
"loss/crossentropy": 2.6730916500091553,
"loss/dist_ce": 0.0,
"loss/hidden": 0.142578125,
"loss/idx": 0.0,
"loss/logits": 0.04350658878684044,
"step": 858
},
{
"epoch": 0.0070794563900541466,
"grad_norm": 1.625,
"grad_norm_var": 0.8784169514973958,
"learning_rate": 5e-05,
"loss": 0.1474,
"loss/crossentropy": 2.8773488998413086,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1142578125,
"loss/idx": 0.0,
"loss/logits": 0.033148590475320816,
"step": 859
},
{
"epoch": 0.0070876978992393085,
"grad_norm": 2.171875,
"grad_norm_var": 0.8555898030598958,
"learning_rate": 5e-05,
"loss": 0.1824,
"loss/crossentropy": 2.271367311477661,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1435546875,
"loss/idx": 0.0,
"loss/logits": 0.03879944607615471,
"step": 860
},
{
"epoch": 0.00709593940842447,
"grad_norm": 2.078125,
"grad_norm_var": 0.8619374593098958,
"learning_rate": 5e-05,
"loss": 0.1606,
"loss/crossentropy": 2.637540340423584,
"loss/dist_ce": 0.0,
"loss/hidden": 0.126953125,
"loss/idx": 0.0,
"loss/logits": 0.033611979335546494,
"step": 861
},
{
"epoch": 0.007104180917609632,
"grad_norm": 1.375,
"grad_norm_var": 0.9275461832682291,
"learning_rate": 5e-05,
"loss": 0.1324,
"loss/crossentropy": 1.5838335752487183,
"loss/dist_ce": 0.0,
"loss/hidden": 0.109375,
"loss/idx": 0.0,
"loss/logits": 0.02299325168132782,
"step": 862
},
{
"epoch": 0.007112422426794795,
"grad_norm": 1.1484375,
"grad_norm_var": 0.9449724833170573,
"learning_rate": 5e-05,
"loss": 0.1209,
"loss/crossentropy": 1.4155738353729248,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1015625,
"loss/idx": 0.0,
"loss/logits": 0.019340351223945618,
"step": 863
},
{
"epoch": 0.007120663935979957,
"grad_norm": 29.625,
"grad_norm_var": 48.10079523722331,
"learning_rate": 5e-05,
"loss": 0.374,
"loss/crossentropy": 2.0414719581604004,
"loss/dist_ce": 0.0,
"loss/hidden": 0.32421875,
"loss/idx": 0.0,
"loss/logits": 0.049798041582107544,
"step": 864
},
{
"epoch": 0.007128905445165119,
"grad_norm": 1.3515625,
"grad_norm_var": 48.15022354125976,
"learning_rate": 5e-05,
"loss": 0.1175,
"loss/crossentropy": 1.4641659259796143,
"loss/dist_ce": 0.0,
"loss/hidden": 0.0986328125,
"loss/idx": 0.0,
"loss/logits": 0.018840216100215912,
"step": 865
},
{
"epoch": 0.007137146954350281,
"grad_norm": 1.2890625,
"grad_norm_var": 48.451341756184895,
"learning_rate": 5e-05,
"loss": 0.1129,
"loss/crossentropy": 0.8852246403694153,
"loss/dist_ce": 0.0,
"loss/hidden": 0.09619140625,
"loss/idx": 0.0,
"loss/logits": 0.01674000360071659,
"step": 866
},
{
"epoch": 0.007145388463535443,
"grad_norm": 1.4296875,
"grad_norm_var": 48.78075129191081,
"learning_rate": 5e-05,
"loss": 0.1627,
"loss/crossentropy": 2.618283748626709,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1240234375,
"loss/idx": 0.0,
"loss/logits": 0.03862760215997696,
"step": 867
},
{
"epoch": 0.0071536299727206045,
"grad_norm": 2.390625,
"grad_norm_var": 48.45802408854167,
"learning_rate": 5e-05,
"loss": 0.1756,
"loss/crossentropy": 1.5870869159698486,
"loss/dist_ce": 0.0,
"loss/hidden": 0.15234375,
"loss/idx": 0.0,
"loss/logits": 0.0232391357421875,
"step": 868
},
{
"epoch": 0.007161871481905766,
"grad_norm": 2.375,
"grad_norm_var": 48.54838460286458,
"learning_rate": 5e-05,
"loss": 0.1535,
"loss/crossentropy": 1.9420498609542847,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1220703125,
"loss/idx": 0.0,
"loss/logits": 0.03145188093185425,
"step": 869
},
{
"epoch": 0.007170112991090928,
"grad_norm": 1.375,
"grad_norm_var": 48.76895751953125,
"learning_rate": 5e-05,
"loss": 0.1167,
"loss/crossentropy": 1.6243377923965454,
"loss/dist_ce": 0.0,
"loss/hidden": 0.0947265625,
"loss/idx": 0.0,
"loss/logits": 0.021999340504407883,
"step": 870
},
{
"epoch": 0.00717835450027609,
"grad_norm": 1.6484375,
"grad_norm_var": 48.60527114868164,
"learning_rate": 5e-05,
"loss": 0.1465,
"loss/crossentropy": 0.5207220911979675,
"loss/dist_ce": 0.0,
"loss/hidden": 0.130859375,
"loss/idx": 0.0,
"loss/logits": 0.015599271282553673,
"step": 871
},
{
"epoch": 0.007186596009461253,
"grad_norm": 1.6015625,
"grad_norm_var": 48.69667053222656,
"learning_rate": 5e-05,
"loss": 0.1485,
"loss/crossentropy": 2.410845994949341,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1181640625,
"loss/idx": 0.0,
"loss/logits": 0.030334439128637314,
"step": 872
},
{
"epoch": 0.007194837518646415,
"grad_norm": 2.46875,
"grad_norm_var": 48.66558024088542,
"learning_rate": 5e-05,
"loss": 0.1478,
"loss/crossentropy": 1.4278953075408936,
"loss/dist_ce": 0.0,
"loss/hidden": 0.12890625,
"loss/idx": 0.0,
"loss/logits": 0.01892838627099991,
"step": 873
},
{
"epoch": 0.007203079027831577,
"grad_norm": 2.015625,
"grad_norm_var": 48.73091227213542,
"learning_rate": 5e-05,
"loss": 0.1412,
"loss/crossentropy": 2.3847408294677734,
"loss/dist_ce": 0.0,
"loss/hidden": 0.111328125,
"loss/idx": 0.0,
"loss/logits": 0.029860273003578186,
"step": 874
},
{
"epoch": 0.007211320537016739,
"grad_norm": 2.4375,
"grad_norm_var": 48.56925862630208,
"learning_rate": 5e-05,
"loss": 0.2361,
"loss/crossentropy": 2.355764627456665,
"loss/dist_ce": 0.0,
"loss/hidden": 0.185546875,
"loss/idx": 0.0,
"loss/logits": 0.05050516501069069,
"step": 875
},
{
"epoch": 0.0072195620462019005,
"grad_norm": 1.6875,
"grad_norm_var": 48.67285054524739,
"learning_rate": 5e-05,
"loss": 0.2005,
"loss/crossentropy": 2.284627676010132,
"loss/dist_ce": 0.0,
"loss/hidden": 0.158203125,
"loss/idx": 0.0,
"loss/logits": 0.04234454780817032,
"step": 876
},
{
"epoch": 0.007227803555387062,
"grad_norm": 3.296875,
"grad_norm_var": 48.53161519368489,
"learning_rate": 5e-05,
"loss": 0.2251,
"loss/crossentropy": 2.510892152786255,
"loss/dist_ce": 0.0,
"loss/hidden": 0.18359375,
"loss/idx": 0.0,
"loss/logits": 0.041470736265182495,
"step": 877
},
{
"epoch": 0.007236045064572224,
"grad_norm": 1.90625,
"grad_norm_var": 48.39202372233073,
"learning_rate": 5e-05,
"loss": 0.1637,
"loss/crossentropy": 1.8948522806167603,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1328125,
"loss/idx": 0.0,
"loss/logits": 0.030879024416208267,
"step": 878
},
{
"epoch": 0.007244286573757386,
"grad_norm": 2.078125,
"grad_norm_var": 48.13868993123372,
"learning_rate": 5e-05,
"loss": 0.1593,
"loss/crossentropy": 1.3858805894851685,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1259765625,
"loss/idx": 0.0,
"loss/logits": 0.03331441059708595,
"step": 879
},
{
"epoch": 0.007252528082942548,
"grad_norm": 2.171875,
"grad_norm_var": 0.29590021769205727,
"learning_rate": 5e-05,
"loss": 0.1446,
"loss/crossentropy": 1.4740588665008545,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1181640625,
"loss/idx": 0.0,
"loss/logits": 0.02642858400940895,
"step": 880
},
{
"epoch": 0.007260769592127711,
"grad_norm": 3.1875,
"grad_norm_var": 0.3551259358723958,
"learning_rate": 5e-05,
"loss": 0.1482,
"loss/crossentropy": 1.0553547143936157,
"loss/dist_ce": 0.0,
"loss/hidden": 0.12890625,
"loss/idx": 0.0,
"loss/logits": 0.01929531991481781,
"step": 881
},
{
"epoch": 0.007269011101312873,
"grad_norm": 2.375,
"grad_norm_var": 0.31359024047851564,
"learning_rate": 5e-05,
"loss": 0.1372,
"loss/crossentropy": 1.9523429870605469,
"loss/dist_ce": 0.0,
"loss/hidden": 0.109375,
"loss/idx": 0.0,
"loss/logits": 0.027855783700942993,
"step": 882
},
{
"epoch": 0.007277252610498035,
"grad_norm": 1.390625,
"grad_norm_var": 0.3174519856770833,
"learning_rate": 5e-05,
"loss": 0.0988,
"loss/crossentropy": 0.5503354072570801,
"loss/dist_ce": 0.0,
"loss/hidden": 0.0888671875,
"loss/idx": 0.0,
"loss/logits": 0.009944621473550797,
"step": 883
},
{
"epoch": 0.0072854941196831965,
"grad_norm": 1.2578125,
"grad_norm_var": 0.36137059529622395,
"learning_rate": 5e-05,
"loss": 0.1208,
"loss/crossentropy": 1.7299476861953735,
"loss/dist_ce": 0.0,
"loss/hidden": 0.10009765625,
"loss/idx": 0.0,
"loss/logits": 0.020662667229771614,
"step": 884
},
{
"epoch": 0.007293735628868358,
"grad_norm": 3.65625,
"grad_norm_var": 0.5144365946451823,
"learning_rate": 5e-05,
"loss": 0.1408,
"loss/crossentropy": 1.2476385831832886,
"loss/dist_ce": 0.0,
"loss/hidden": 0.115234375,
"loss/idx": 0.0,
"loss/logits": 0.025581957772374153,
"step": 885
},
{
"epoch": 0.00730197713805352,
"grad_norm": 1.078125,
"grad_norm_var": 0.5510047912597656,
"learning_rate": 5e-05,
"loss": 0.103,
"loss/crossentropy": 1.68153715133667,
"loss/dist_ce": 0.0,
"loss/hidden": 0.0849609375,
"loss/idx": 0.0,
"loss/logits": 0.018041210249066353,
"step": 886
},
{
"epoch": 0.007310218647238682,
"grad_norm": 2.4375,
"grad_norm_var": 0.5380849202473958,
"learning_rate": 5e-05,
"loss": 0.1718,
"loss/crossentropy": 2.690136671066284,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1318359375,
"loss/idx": 0.0,
"loss/logits": 0.03993324190378189,
"step": 887
},
{
"epoch": 0.007318460156423844,
"grad_norm": 1.59375,
"grad_norm_var": 0.5387021382649739,
"learning_rate": 5e-05,
"loss": 0.1553,
"loss/crossentropy": 1.3326576948165894,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1240234375,
"loss/idx": 0.0,
"loss/logits": 0.03129717335104942,
"step": 888
},
{
"epoch": 0.007326701665609006,
"grad_norm": 2.109375,
"grad_norm_var": 0.5334144592285156,
"learning_rate": 5e-05,
"loss": 0.1613,
"loss/crossentropy": 2.293314218521118,
"loss/dist_ce": 0.0,
"loss/hidden": 0.12890625,
"loss/idx": 0.0,
"loss/logits": 0.03244052827358246,
"step": 889
},
{
"epoch": 0.007334943174794169,
"grad_norm": 2.3125,
"grad_norm_var": 0.5329119364420573,
"learning_rate": 5e-05,
"loss": 0.1668,
"loss/crossentropy": 2.6757304668426514,
"loss/dist_ce": 0.0,
"loss/hidden": 0.12890625,
"loss/idx": 0.0,
"loss/logits": 0.03789503872394562,
"step": 890
},
{
"epoch": 0.007343184683979331,
"grad_norm": 2.265625,
"grad_norm_var": 0.5289955139160156,
"learning_rate": 5e-05,
"loss": 0.172,
"loss/crossentropy": 1.9997798204421997,
"loss/dist_ce": 0.0,
"loss/hidden": 0.134765625,
"loss/idx": 0.0,
"loss/logits": 0.03719984367489815,
"step": 891
},
{
"epoch": 0.0073514261931644925,
"grad_norm": 1.5703125,
"grad_norm_var": 0.5374755859375,
"learning_rate": 5e-05,
"loss": 0.1209,
"loss/crossentropy": 0.4429371654987335,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1015625,
"loss/idx": 0.0,
"loss/logits": 0.019379278644919395,
"step": 892
},
{
"epoch": 0.007359667702349654,
"grad_norm": 1.3515625,
"grad_norm_var": 0.48118057250976565,
"learning_rate": 5e-05,
"loss": 0.0994,
"loss/crossentropy": 0.2325073629617691,
"loss/dist_ce": 0.0,
"loss/hidden": 0.09375,
"loss/idx": 0.0,
"loss/logits": 0.005671404767781496,
"step": 893
},
{
"epoch": 0.007367909211534816,
"grad_norm": 1.328125,
"grad_norm_var": 0.512872060139974,
"learning_rate": 5e-05,
"loss": 0.1444,
"loss/crossentropy": 2.507219076156616,
"loss/dist_ce": 0.0,
"loss/hidden": 0.109375,
"loss/idx": 0.0,
"loss/logits": 0.035071056336164474,
"step": 894
},
{
"epoch": 0.007376150720719978,
"grad_norm": 2.234375,
"grad_norm_var": 0.5158119201660156,
"learning_rate": 5e-05,
"loss": 0.1701,
"loss/crossentropy": 2.556119918823242,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1328125,
"loss/idx": 0.0,
"loss/logits": 0.037296053022146225,
"step": 895
},
{
"epoch": 0.00738439222990514,
"grad_norm": 2.3125,
"grad_norm_var": 0.5198951721191406,
"learning_rate": 5e-05,
"loss": 0.2003,
"loss/crossentropy": 2.8151919841766357,
"loss/dist_ce": 0.0,
"loss/hidden": 0.15234375,
"loss/idx": 0.0,
"loss/logits": 0.04793722182512283,
"step": 896
},
{
"epoch": 0.007392633739090302,
"grad_norm": 1.9296875,
"grad_norm_var": 0.4244537353515625,
"learning_rate": 5e-05,
"loss": 0.2205,
"loss/crossentropy": 2.4031951427459717,
"loss/dist_ce": 0.0,
"loss/hidden": 0.16796875,
"loss/idx": 0.0,
"loss/logits": 0.05256333947181702,
"step": 897
},
{
"epoch": 0.007400875248275464,
"grad_norm": 2.21875,
"grad_norm_var": 0.4171295166015625,
"learning_rate": 5e-05,
"loss": 0.1722,
"loss/crossentropy": 3.179619312286377,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1328125,
"loss/idx": 0.0,
"loss/logits": 0.0393570140004158,
"step": 898
},
{
"epoch": 0.007409116757460626,
"grad_norm": 4.625,
"grad_norm_var": 0.83385009765625,
"learning_rate": 5e-05,
"loss": 0.2282,
"loss/crossentropy": 1.429465651512146,
"loss/dist_ce": 0.0,
"loss/hidden": 0.19921875,
"loss/idx": 0.0,
"loss/logits": 0.028934892266988754,
"step": 899
},
{
"epoch": 0.0074173582666457885,
"grad_norm": 1.515625,
"grad_norm_var": 0.8075904846191406,
"learning_rate": 5e-05,
"loss": 0.1484,
"loss/crossentropy": 1.4451292753219604,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1240234375,
"loss/idx": 0.0,
"loss/logits": 0.024369023740291595,
"step": 900
},
{
"epoch": 0.0074255997758309504,
"grad_norm": 103.5,
"grad_norm_var": 643.7922401428223,
"learning_rate": 5e-05,
"loss": 0.5508,
"loss/crossentropy": 0.928047239780426,
"loss/dist_ce": 0.0,
"loss/hidden": 0.50390625,
"loss/idx": 0.0,
"loss/logits": 0.04687977582216263,
"step": 901
},
{
"epoch": 0.007433841285016112,
"grad_norm": 1.0078125,
"grad_norm_var": 643.861181640625,
"learning_rate": 5e-05,
"loss": 0.1275,
"loss/crossentropy": 2.235487461090088,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1044921875,
"loss/idx": 0.0,
"loss/logits": 0.022974800318479538,
"step": 902
},
{
"epoch": 0.007442082794201274,
"grad_norm": 2.421875,
"grad_norm_var": 643.8736073811849,
"learning_rate": 5e-05,
"loss": 0.1372,
"loss/crossentropy": 1.5220298767089844,
"loss/dist_ce": 0.0,
"loss/hidden": 0.11328125,
"loss/idx": 0.0,
"loss/logits": 0.02395622618496418,
"step": 903
},
{
"epoch": 0.007450324303386436,
"grad_norm": 2.34375,
"grad_norm_var": 643.2287831624349,
"learning_rate": 5e-05,
"loss": 0.1307,
"loss/crossentropy": 1.4191033840179443,
"loss/dist_ce": 0.0,
"loss/hidden": 0.10595703125,
"loss/idx": 0.0,
"loss/logits": 0.024734167382121086,
"step": 904
},
{
"epoch": 0.007458565812571598,
"grad_norm": 5.125,
"grad_norm_var": 641.2515462239584,
"learning_rate": 5e-05,
"loss": 0.3705,
"loss/crossentropy": 1.2650396823883057,
"loss/dist_ce": 0.0,
"loss/hidden": 0.26953125,
"loss/idx": 0.0,
"loss/logits": 0.10096491873264313,
"step": 905
},
{
"epoch": 0.00746680732175676,
"grad_norm": 3.234375,
"grad_norm_var": 640.5282704671224,
"learning_rate": 5e-05,
"loss": 0.1926,
"loss/crossentropy": 1.6492811441421509,
"loss/dist_ce": 0.0,
"loss/hidden": 0.154296875,
"loss/idx": 0.0,
"loss/logits": 0.0383470356464386,
"step": 906
},
{
"epoch": 0.007475048830941922,
"grad_norm": 2.671875,
"grad_norm_var": 640.190786743164,
"learning_rate": 5e-05,
"loss": 0.1486,
"loss/crossentropy": 1.6101174354553223,
"loss/dist_ce": 0.0,
"loss/hidden": 0.12353515625,
"loss/idx": 0.0,
"loss/logits": 0.02505500242114067,
"step": 907
},
{
"epoch": 0.007483290340127084,
"grad_norm": 3.21875,
"grad_norm_var": 638.7909563700358,
"learning_rate": 5e-05,
"loss": 0.1805,
"loss/crossentropy": 1.6961467266082764,
"loss/dist_ce": 0.0,
"loss/hidden": 0.150390625,
"loss/idx": 0.0,
"loss/logits": 0.030135734006762505,
"step": 908
},
{
"epoch": 0.0074915318493122465,
"grad_norm": 3.28125,
"grad_norm_var": 637.1034220377604,
"learning_rate": 5e-05,
"loss": 0.2268,
"loss/crossentropy": 2.749086618423462,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1787109375,
"loss/idx": 0.0,
"loss/logits": 0.048118021339178085,
"step": 909
},
{
"epoch": 0.007499773358497408,
"grad_norm": 1.3515625,
"grad_norm_var": 637.0796831766764,
"learning_rate": 5e-05,
"loss": 0.1485,
"loss/crossentropy": 1.4836238622665405,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1220703125,
"loss/idx": 0.0,
"loss/logits": 0.02646125853061676,
"step": 910
},
{
"epoch": 0.00750801486768257,
"grad_norm": 2.21875,
"grad_norm_var": 637.0936622619629,
"learning_rate": 5e-05,
"loss": 0.16,
"loss/crossentropy": 1.5685328245162964,
"loss/dist_ce": 0.0,
"loss/hidden": 0.126953125,
"loss/idx": 0.0,
"loss/logits": 0.033031269907951355,
"step": 911
},
{
"epoch": 0.007516256376867732,
"grad_norm": 1.7890625,
"grad_norm_var": 637.5730539957682,
"learning_rate": 5e-05,
"loss": 0.1701,
"loss/crossentropy": 0.7678622007369995,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1435546875,
"loss/idx": 0.0,
"loss/logits": 0.02650710754096508,
"step": 912
},
{
"epoch": 0.007524497886052894,
"grad_norm": 2.5625,
"grad_norm_var": 637.0096819559733,
"learning_rate": 5e-05,
"loss": 0.1833,
"loss/crossentropy": 2.773200273513794,
"loss/dist_ce": 0.0,
"loss/hidden": 0.138671875,
"loss/idx": 0.0,
"loss/logits": 0.04464181140065193,
"step": 913
},
{
"epoch": 0.007532739395238056,
"grad_norm": 1.703125,
"grad_norm_var": 637.4885821024577,
"learning_rate": 5e-05,
"loss": 0.1809,
"loss/crossentropy": 2.4580299854278564,
"loss/dist_ce": 0.0,
"loss/hidden": 0.138671875,
"loss/idx": 0.0,
"loss/logits": 0.04220724105834961,
"step": 914
},
{
"epoch": 0.007540980904423218,
"grad_norm": 1.6796875,
"grad_norm_var": 639.71376953125,
"learning_rate": 5e-05,
"loss": 0.1522,
"loss/crossentropy": 2.1130568981170654,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1201171875,
"loss/idx": 0.0,
"loss/logits": 0.03203842043876648,
"step": 915
},
{
"epoch": 0.00754922241360838,
"grad_norm": 1.84375,
"grad_norm_var": 639.4050201416015,
"learning_rate": 5e-05,
"loss": 0.1293,
"loss/crossentropy": 1.8496575355529785,
"loss/dist_ce": 0.0,
"loss/hidden": 0.103515625,
"loss/idx": 0.0,
"loss/logits": 0.025790153071284294,
"step": 916
},
{
"epoch": 0.007557463922793542,
"grad_norm": 3.078125,
"grad_norm_var": 0.9873331705729167,
"learning_rate": 5e-05,
"loss": 0.1361,
"loss/crossentropy": 1.5800864696502686,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1123046875,
"loss/idx": 0.0,
"loss/logits": 0.023767339065670967,
"step": 917
},
{
"epoch": 0.007565705431978704,
"grad_norm": 1.8359375,
"grad_norm_var": 0.8686676025390625,
"learning_rate": 5e-05,
"loss": 0.1117,
"loss/crossentropy": 1.1425081491470337,
"loss/dist_ce": 0.0,
"loss/hidden": 0.09375,
"loss/idx": 0.0,
"loss/logits": 0.01797986589372158,
"step": 918
},
{
"epoch": 0.007573946941163866,
"grad_norm": 3.578125,
"grad_norm_var": 0.9367177327473958,
"learning_rate": 5e-05,
"loss": 0.1796,
"loss/crossentropy": 2.826726198196411,
"loss/dist_ce": 0.0,
"loss/hidden": 0.142578125,
"loss/idx": 0.0,
"loss/logits": 0.036978572607040405,
"step": 919
},
{
"epoch": 0.007582188450349028,
"grad_norm": 1.8671875,
"grad_norm_var": 0.9668596903483073,
"learning_rate": 5e-05,
"loss": 0.1869,
"loss/crossentropy": 2.1164968013763428,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1484375,
"loss/idx": 0.0,
"loss/logits": 0.03847302123904228,
"step": 920
},
{
"epoch": 0.00759042995953419,
"grad_norm": 4.40625,
"grad_norm_var": 0.753808339436849,
"learning_rate": 5e-05,
"loss": 0.1586,
"loss/crossentropy": 2.5811195373535156,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1240234375,
"loss/idx": 0.0,
"loss/logits": 0.034622643142938614,
"step": 921
},
{
"epoch": 0.007598671468719352,
"grad_norm": 1.65625,
"grad_norm_var": 0.7591509501139323,
"learning_rate": 5e-05,
"loss": 0.1544,
"loss/crossentropy": 2.408280611038208,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1181640625,
"loss/idx": 0.0,
"loss/logits": 0.036218732595443726,
"step": 922
},
{
"epoch": 0.007606912977904514,
"grad_norm": 1.78125,
"grad_norm_var": 0.7789812723795573,
"learning_rate": 5e-05,
"loss": 0.1502,
"loss/crossentropy": 1.7369745969772339,
"loss/dist_ce": 0.0,
"loss/hidden": 0.123046875,
"loss/idx": 0.0,
"loss/logits": 0.027189793065190315,
"step": 923
},
{
"epoch": 0.007615154487089676,
"grad_norm": 4.75,
"grad_norm_var": 1.0996864318847657,
"learning_rate": 5e-05,
"loss": 0.4286,
"loss/crossentropy": 2.7182440757751465,
"loss/dist_ce": 0.0,
"loss/hidden": 0.3671875,
"loss/idx": 0.0,
"loss/logits": 0.061443451792001724,
"step": 924
},
{
"epoch": 0.007623395996274838,
"grad_norm": 1.671875,
"grad_norm_var": 1.0856463114420574,
"learning_rate": 5e-05,
"loss": 0.1476,
"loss/crossentropy": 2.5658042430877686,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1171875,
"loss/idx": 0.0,
"loss/logits": 0.030445091426372528,
"step": 925
},
{
"epoch": 0.0076316375054599995,
"grad_norm": 1.578125,
"grad_norm_var": 1.0583658854166667,
"learning_rate": 5e-05,
"loss": 0.1394,
"loss/crossentropy": 1.2689893245697021,
"loss/dist_ce": 0.0,
"loss/hidden": 0.11279296875,
"loss/idx": 0.0,
"loss/logits": 0.02664242871105671,
"step": 926
},
{
"epoch": 0.007639879014645161,
"grad_norm": 1.8125,
"grad_norm_var": 1.0771443684895834,
"learning_rate": 5e-05,
"loss": 0.1498,
"loss/crossentropy": 2.381871223449707,
"loss/dist_ce": 0.0,
"loss/hidden": 0.119140625,
"loss/idx": 0.0,
"loss/logits": 0.03070569597184658,
"step": 927
},
{
"epoch": 0.007648120523830324,
"grad_norm": 2.25,
"grad_norm_var": 1.0559730529785156,
"learning_rate": 5e-05,
"loss": 0.1529,
"loss/crossentropy": 2.887047529220581,
"loss/dist_ce": 0.0,
"loss/hidden": 0.11962890625,
"loss/idx": 0.0,
"loss/logits": 0.033232178539037704,
"step": 928
},
{
"epoch": 0.007656362033015486,
"grad_norm": 11.875,
"grad_norm_var": 6.704707590738932,
"learning_rate": 5e-05,
"loss": 0.8791,
"loss/crossentropy": 1.3170801401138306,
"loss/dist_ce": 0.0,
"loss/hidden": 0.734375,
"loss/idx": 0.0,
"loss/logits": 0.14475420117378235,
"step": 929
},
{
"epoch": 0.007664603542200648,
"grad_norm": 1.6484375,
"grad_norm_var": 6.7140625,
"learning_rate": 5e-05,
"loss": 0.127,
"loss/crossentropy": 1.6071490049362183,
"loss/dist_ce": 0.0,
"loss/hidden": 0.10107421875,
"loss/idx": 0.0,
"loss/logits": 0.025960184633731842,
"step": 930
},
{
"epoch": 0.00767284505138581,
"grad_norm": 1.8984375,
"grad_norm_var": 6.67979736328125,
"learning_rate": 5e-05,
"loss": 0.1926,
"loss/crossentropy": 2.840394973754883,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1455078125,
"loss/idx": 0.0,
"loss/logits": 0.047091417014598846,
"step": 931
},
{
"epoch": 0.007681086560570972,
"grad_norm": 1.9921875,
"grad_norm_var": 6.658870188395182,
"learning_rate": 5e-05,
"loss": 0.155,
"loss/crossentropy": 1.9618257284164429,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1240234375,
"loss/idx": 0.0,
"loss/logits": 0.03101358562707901,
"step": 932
},
{
"epoch": 0.007689328069756134,
"grad_norm": 1.6953125,
"grad_norm_var": 6.760285441080729,
"learning_rate": 5e-05,
"loss": 0.1729,
"loss/crossentropy": 0.5429065823554993,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1552734375,
"loss/idx": 0.0,
"loss/logits": 0.017611898481845856,
"step": 933
},
{
"epoch": 0.0076975695789412955,
"grad_norm": 1.953125,
"grad_norm_var": 6.744618479410807,
"learning_rate": 5e-05,
"loss": 0.1526,
"loss/crossentropy": 2.004976749420166,
"loss/dist_ce": 0.0,
"loss/hidden": 0.12255859375,
"loss/idx": 0.0,
"loss/logits": 0.030033178627490997,
"step": 934
},
{
"epoch": 0.007705811088126457,
"grad_norm": 1.3046875,
"grad_norm_var": 6.8623606363932295,
"learning_rate": 5e-05,
"loss": 0.106,
"loss/crossentropy": 1.3220717906951904,
"loss/dist_ce": 0.0,
"loss/hidden": 0.08984375,
"loss/idx": 0.0,
"loss/logits": 0.01619834452867508,
"step": 935
},
{
"epoch": 0.007714052597311619,
"grad_norm": 1.46875,
"grad_norm_var": 6.919648996988932,
"learning_rate": 5e-05,
"loss": 0.1283,
"loss/crossentropy": 1.6176024675369263,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1044921875,
"loss/idx": 0.0,
"loss/logits": 0.023826373741030693,
"step": 936
},
{
"epoch": 0.007722294106496782,
"grad_norm": 2.296875,
"grad_norm_var": 6.727388254801432,
"learning_rate": 5e-05,
"loss": 0.1905,
"loss/crossentropy": 2.7859342098236084,
"loss/dist_ce": 0.0,
"loss/hidden": 0.14453125,
"loss/idx": 0.0,
"loss/logits": 0.04592683166265488,
"step": 937
},
{
"epoch": 0.007730535615681944,
"grad_norm": 2.28125,
"grad_norm_var": 6.672985585530599,
"learning_rate": 5e-05,
"loss": 0.1824,
"loss/crossentropy": 2.764125108718872,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1416015625,
"loss/idx": 0.0,
"loss/logits": 0.04075000435113907,
"step": 938
},
{
"epoch": 0.007738777124867106,
"grad_norm": 1.8203125,
"grad_norm_var": 6.668602498372396,
"learning_rate": 5e-05,
"loss": 0.1234,
"loss/crossentropy": 2.6147968769073486,
"loss/dist_ce": 0.0,
"loss/hidden": 0.099609375,
"loss/idx": 0.0,
"loss/logits": 0.023791346698999405,
"step": 939
},
{
"epoch": 0.007747018634052268,
"grad_norm": 1.8828125,
"grad_norm_var": 6.377123769124349,
"learning_rate": 5e-05,
"loss": 0.155,
"loss/crossentropy": 1.4540939331054688,
"loss/dist_ce": 0.0,
"loss/hidden": 0.125,
"loss/idx": 0.0,
"loss/logits": 0.029985029250383377,
"step": 940
},
{
"epoch": 0.00775526014323743,
"grad_norm": 2.0625,
"grad_norm_var": 6.345385487874349,
"learning_rate": 5e-05,
"loss": 0.1772,
"loss/crossentropy": 2.8355026245117188,
"loss/dist_ce": 0.0,
"loss/hidden": 0.138671875,
"loss/idx": 0.0,
"loss/logits": 0.038478825241327286,
"step": 941
},
{
"epoch": 0.0077635016524225916,
"grad_norm": 2.6875,
"grad_norm_var": 6.287605539957682,
"learning_rate": 5e-05,
"loss": 0.1997,
"loss/crossentropy": 2.347745895385742,
"loss/dist_ce": 0.0,
"loss/hidden": 0.154296875,
"loss/idx": 0.0,
"loss/logits": 0.0454033724963665,
"step": 942
},
{
"epoch": 0.0077717431616077534,
"grad_norm": 1.03125,
"grad_norm_var": 6.403419748942057,
"learning_rate": 5e-05,
"loss": 0.098,
"loss/crossentropy": 0.4349134564399719,
"loss/dist_ce": 0.0,
"loss/hidden": 0.087890625,
"loss/idx": 0.0,
"loss/logits": 0.010073849000036716,
"step": 943
},
{
"epoch": 0.007779984670792915,
"grad_norm": 1.5,
"grad_norm_var": 6.464503733317057,
"learning_rate": 5e-05,
"loss": 0.132,
"loss/crossentropy": 2.370471239089966,
"loss/dist_ce": 0.0,
"loss/hidden": 0.10546875,
"loss/idx": 0.0,
"loss/logits": 0.02652416005730629,
"step": 944
},
{
"epoch": 0.007788226179978077,
"grad_norm": 2.3125,
"grad_norm_var": 0.1785296122233073,
"learning_rate": 5e-05,
"loss": 0.1954,
"loss/crossentropy": 2.6951100826263428,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1484375,
"loss/idx": 0.0,
"loss/logits": 0.04693574458360672,
"step": 945
},
{
"epoch": 0.00779646768916324,
"grad_norm": 2.9375,
"grad_norm_var": 0.24520670572916667,
"learning_rate": 5e-05,
"loss": 0.1673,
"loss/crossentropy": 1.1609289646148682,
"loss/dist_ce": 0.0,
"loss/hidden": 0.134765625,
"loss/idx": 0.0,
"loss/logits": 0.03258271515369415,
"step": 946
},
{
"epoch": 0.007804709198348402,
"grad_norm": 1.8125,
"grad_norm_var": 0.2462053934733073,
"learning_rate": 5e-05,
"loss": 0.1578,
"loss/crossentropy": 2.407763957977295,
"loss/dist_ce": 0.0,
"loss/hidden": 0.12353515625,
"loss/idx": 0.0,
"loss/logits": 0.034231819212436676,
"step": 947
},
{
"epoch": 0.007812950707533564,
"grad_norm": 1.703125,
"grad_norm_var": 0.2494140625,
"learning_rate": 5e-05,
"loss": 0.155,
"loss/crossentropy": 1.5243741273880005,
"loss/dist_ce": 0.0,
"loss/hidden": 0.126953125,
"loss/idx": 0.0,
"loss/logits": 0.02800397202372551,
"step": 948
},
{
"epoch": 0.007821192216718726,
"grad_norm": 5.0625,
"grad_norm_var": 0.8563189188639323,
"learning_rate": 5e-05,
"loss": 0.1713,
"loss/crossentropy": 1.9425218105316162,
"loss/dist_ce": 0.0,
"loss/hidden": 0.140625,
"loss/idx": 0.0,
"loss/logits": 0.03065253421664238,
"step": 949
},
{
"epoch": 0.007829433725903888,
"grad_norm": 3.8125,
"grad_norm_var": 1.027972157796224,
"learning_rate": 5e-05,
"loss": 0.2875,
"loss/crossentropy": 2.2058985233306885,
"loss/dist_ce": 0.0,
"loss/hidden": 0.228515625,
"loss/idx": 0.0,
"loss/logits": 0.0590139701962471,
"step": 950
},
{
"epoch": 0.00783767523508905,
"grad_norm": 2.171875,
"grad_norm_var": 0.9658406575520834,
"learning_rate": 5e-05,
"loss": 0.2211,
"loss/crossentropy": 2.6648082733154297,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1630859375,
"loss/idx": 0.0,
"loss/logits": 0.05802769958972931,
"step": 951
},
{
"epoch": 0.007845916744274211,
"grad_norm": 22.875,
"grad_norm_var": 27.2247314453125,
"learning_rate": 5e-05,
"loss": 0.3126,
"loss/crossentropy": 2.362283945083618,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2578125,
"loss/idx": 0.0,
"loss/logits": 0.05479743331670761,
"step": 952
},
{
"epoch": 0.007854158253459373,
"grad_norm": 1.3046875,
"grad_norm_var": 27.4640256245931,
"learning_rate": 5e-05,
"loss": 0.1007,
"loss/crossentropy": 0.38303616642951965,
"loss/dist_ce": 0.0,
"loss/hidden": 0.08935546875,
"loss/idx": 0.0,
"loss/logits": 0.011384121142327785,
"step": 953
},
{
"epoch": 0.007862399762644535,
"grad_norm": 1.1484375,
"grad_norm_var": 27.740185546875,
"learning_rate": 5e-05,
"loss": 0.1253,
"loss/crossentropy": 1.6580368280410767,
"loss/dist_ce": 0.0,
"loss/hidden": 0.0986328125,
"loss/idx": 0.0,
"loss/logits": 0.026700211688876152,
"step": 954
},
{
"epoch": 0.007870641271829697,
"grad_norm": 1.7734375,
"grad_norm_var": 27.750869750976562,
"learning_rate": 5e-05,
"loss": 0.1519,
"loss/crossentropy": 2.318824529647827,
"loss/dist_ce": 0.0,
"loss/hidden": 0.11767578125,
"loss/idx": 0.0,
"loss/logits": 0.03420557081699371,
"step": 955
},
{
"epoch": 0.007878882781014859,
"grad_norm": 3.03125,
"grad_norm_var": 27.58492202758789,
"learning_rate": 5e-05,
"loss": 0.1829,
"loss/crossentropy": 2.9506826400756836,
"loss/dist_ce": 0.0,
"loss/hidden": 0.140625,
"loss/idx": 0.0,
"loss/logits": 0.042233243584632874,
"step": 956
},
{
"epoch": 0.00788712429020002,
"grad_norm": 2.75,
"grad_norm_var": 27.475665028889974,
"learning_rate": 5e-05,
"loss": 0.1622,
"loss/crossentropy": 1.327090859413147,
"loss/dist_ce": 0.0,
"loss/hidden": 0.13671875,
"loss/idx": 0.0,
"loss/logits": 0.02552720718085766,
"step": 957
},
{
"epoch": 0.007895365799385183,
"grad_norm": 2.046875,
"grad_norm_var": 27.580934397379558,
"learning_rate": 5e-05,
"loss": 0.1882,
"loss/crossentropy": 2.5473806858062744,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1455078125,
"loss/idx": 0.0,
"loss/logits": 0.04271348565816879,
"step": 958
},
{
"epoch": 0.007903607308570345,
"grad_norm": 1.4453125,
"grad_norm_var": 27.450960286458333,
"learning_rate": 5e-05,
"loss": 0.1266,
"loss/crossentropy": 1.5830978155136108,
"loss/dist_ce": 0.0,
"loss/hidden": 0.10400390625,
"loss/idx": 0.0,
"loss/logits": 0.022615976631641388,
"step": 959
},
{
"epoch": 0.007911848817755508,
"grad_norm": 2.53125,
"grad_norm_var": 27.227925618489582,
"learning_rate": 5e-05,
"loss": 0.1684,
"loss/crossentropy": 2.8910608291625977,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1298828125,
"loss/idx": 0.0,
"loss/logits": 0.03847365081310272,
"step": 960
},
{
"epoch": 0.00792009032694067,
"grad_norm": 2.109375,
"grad_norm_var": 27.26726786295573,
"learning_rate": 5e-05,
"loss": 0.1292,
"loss/crossentropy": 1.506900429725647,
"loss/dist_ce": 0.0,
"loss/hidden": 0.111328125,
"loss/idx": 0.0,
"loss/logits": 0.017824511975049973,
"step": 961
},
{
"epoch": 0.007928331836125832,
"grad_norm": 5.3125,
"grad_norm_var": 27.391893513997395,
"learning_rate": 5e-05,
"loss": 0.3447,
"loss/crossentropy": 2.996657133102417,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2734375,
"loss/idx": 0.0,
"loss/logits": 0.07124556601047516,
"step": 962
},
{
"epoch": 0.007936573345310994,
"grad_norm": 1.703125,
"grad_norm_var": 27.421708170572916,
"learning_rate": 5e-05,
"loss": 0.1274,
"loss/crossentropy": 1.4830890893936157,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1064453125,
"loss/idx": 0.0,
"loss/logits": 0.02091062068939209,
"step": 963
},
{
"epoch": 0.007944814854496156,
"grad_norm": 1.0390625,
"grad_norm_var": 27.6348264058431,
"learning_rate": 5e-05,
"loss": 0.0988,
"loss/crossentropy": 0.4731054902076721,
"loss/dist_ce": 0.0,
"loss/hidden": 0.08740234375,
"loss/idx": 0.0,
"loss/logits": 0.011356725357472897,
"step": 964
},
{
"epoch": 0.007953056363681318,
"grad_norm": 3.203125,
"grad_norm_var": 27.5273312886556,
"learning_rate": 5e-05,
"loss": 0.2465,
"loss/crossentropy": 2.7651779651641846,
"loss/dist_ce": 0.0,
"loss/hidden": 0.203125,
"loss/idx": 0.0,
"loss/logits": 0.043382175266742706,
"step": 965
},
{
"epoch": 0.00796129787286648,
"grad_norm": 2.953125,
"grad_norm_var": 27.553851064046224,
"learning_rate": 5e-05,
"loss": 0.2593,
"loss/crossentropy": 1.5127090215682983,
"loss/dist_ce": 0.0,
"loss/hidden": 0.212890625,
"loss/idx": 0.0,
"loss/logits": 0.04636671021580696,
"step": 966
},
{
"epoch": 0.007969539382051642,
"grad_norm": 3.0,
"grad_norm_var": 27.44041519165039,
"learning_rate": 5e-05,
"loss": 0.1922,
"loss/crossentropy": 1.4316555261611938,
"loss/dist_ce": 0.0,
"loss/hidden": 0.16796875,
"loss/idx": 0.0,
"loss/logits": 0.024203313514590263,
"step": 967
},
{
"epoch": 0.007977780891236803,
"grad_norm": 1.6875,
"grad_norm_var": 1.1560523986816407,
"learning_rate": 5e-05,
"loss": 0.1196,
"loss/crossentropy": 1.2071629762649536,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1025390625,
"loss/idx": 0.0,
"loss/logits": 0.017063483595848083,
"step": 968
},
{
"epoch": 0.007986022400421965,
"grad_norm": 2.375,
"grad_norm_var": 1.0834788004557292,
"learning_rate": 5e-05,
"loss": 0.1698,
"loss/crossentropy": 2.020332098007202,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1396484375,
"loss/idx": 0.0,
"loss/logits": 0.030112620443105698,
"step": 969
},
{
"epoch": 0.007994263909607127,
"grad_norm": 3.421875,
"grad_norm_var": 1.0326372782389324,
"learning_rate": 5e-05,
"loss": 0.2121,
"loss/crossentropy": 1.4469772577285767,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1767578125,
"loss/idx": 0.0,
"loss/logits": 0.03534634783864021,
"step": 970
},
{
"epoch": 0.008002505418792289,
"grad_norm": 2.8125,
"grad_norm_var": 0.9961415608723958,
"learning_rate": 5e-05,
"loss": 0.198,
"loss/crossentropy": 2.7077815532684326,
"loss/dist_ce": 0.0,
"loss/hidden": 0.15234375,
"loss/idx": 0.0,
"loss/logits": 0.045610323548316956,
"step": 971
},
{
"epoch": 0.008010746927977451,
"grad_norm": 2.21875,
"grad_norm_var": 0.9894765218098959,
"learning_rate": 5e-05,
"loss": 0.1874,
"loss/crossentropy": 1.9022347927093506,
"loss/dist_ce": 0.0,
"loss/hidden": 0.154296875,
"loss/idx": 0.0,
"loss/logits": 0.033076584339141846,
"step": 972
},
{
"epoch": 0.008018988437162613,
"grad_norm": 2.703125,
"grad_norm_var": 0.9882893880208333,
"learning_rate": 5e-05,
"loss": 0.1256,
"loss/crossentropy": 0.7915277481079102,
"loss/dist_ce": 0.0,
"loss/hidden": 0.11181640625,
"loss/idx": 0.0,
"loss/logits": 0.013769976794719696,
"step": 973
},
{
"epoch": 0.008027229946347775,
"grad_norm": 4.875,
"grad_norm_var": 1.3040598551432292,
"learning_rate": 5e-05,
"loss": 0.3439,
"loss/crossentropy": 2.4664230346679688,
"loss/dist_ce": 0.0,
"loss/hidden": 0.28515625,
"loss/idx": 0.0,
"loss/logits": 0.05875328183174133,
"step": 974
},
{
"epoch": 0.008035471455532937,
"grad_norm": 1.609375,
"grad_norm_var": 1.2780352274576823,
"learning_rate": 5e-05,
"loss": 0.1361,
"loss/crossentropy": 2.0170304775238037,
"loss/dist_ce": 0.0,
"loss/hidden": 0.111328125,
"loss/idx": 0.0,
"loss/logits": 0.024749569594860077,
"step": 975
},
{
"epoch": 0.008043712964718099,
"grad_norm": 2.078125,
"grad_norm_var": 1.3024024963378906,
"learning_rate": 5e-05,
"loss": 0.1647,
"loss/crossentropy": 2.516977310180664,
"loss/dist_ce": 0.0,
"loss/hidden": 0.130859375,
"loss/idx": 0.0,
"loss/logits": 0.033810123801231384,
"step": 976
},
{
"epoch": 0.00805195447390326,
"grad_norm": 2.421875,
"grad_norm_var": 1.2841529846191406,
"learning_rate": 5e-05,
"loss": 0.1797,
"loss/crossentropy": 1.6354763507843018,
"loss/dist_ce": 0.0,
"loss/hidden": 0.142578125,
"loss/idx": 0.0,
"loss/logits": 0.03713398799300194,
"step": 977
},
{
"epoch": 0.008060195983088422,
"grad_norm": 2.609375,
"grad_norm_var": 0.8040667215983073,
"learning_rate": 5e-05,
"loss": 0.1307,
"loss/crossentropy": 2.52553653717041,
"loss/dist_ce": 0.0,
"loss/hidden": 0.10546875,
"loss/idx": 0.0,
"loss/logits": 0.025194775313138962,
"step": 978
},
{
"epoch": 0.008068437492273586,
"grad_norm": 1.15625,
"grad_norm_var": 0.8841041564941406,
"learning_rate": 5e-05,
"loss": 0.1438,
"loss/crossentropy": 2.593212127685547,
"loss/dist_ce": 0.0,
"loss/hidden": 0.11328125,
"loss/idx": 0.0,
"loss/logits": 0.030498359352350235,
"step": 979
},
{
"epoch": 0.008076679001458748,
"grad_norm": 1.3203125,
"grad_norm_var": 0.8338783264160157,
"learning_rate": 5e-05,
"loss": 0.1281,
"loss/crossentropy": 2.2054710388183594,
"loss/dist_ce": 0.0,
"loss/hidden": 0.10107421875,
"loss/idx": 0.0,
"loss/logits": 0.02705022320151329,
"step": 980
},
{
"epoch": 0.00808492051064391,
"grad_norm": 1.390625,
"grad_norm_var": 0.8760047912597656,
"learning_rate": 5e-05,
"loss": 0.1358,
"loss/crossentropy": 1.5589760541915894,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1103515625,
"loss/idx": 0.0,
"loss/logits": 0.025421902537345886,
"step": 981
},
{
"epoch": 0.008093162019829072,
"grad_norm": 2.4375,
"grad_norm_var": 0.8555946350097656,
"learning_rate": 5e-05,
"loss": 0.1685,
"loss/crossentropy": 2.5976901054382324,
"loss/dist_ce": 0.0,
"loss/hidden": 0.130859375,
"loss/idx": 0.0,
"loss/logits": 0.03766857087612152,
"step": 982
},
{
"epoch": 0.008101403529014234,
"grad_norm": 1.703125,
"grad_norm_var": 0.853905995686849,
"learning_rate": 5e-05,
"loss": 0.1405,
"loss/crossentropy": 1.4126673936843872,
"loss/dist_ce": 0.0,
"loss/hidden": 0.115234375,
"loss/idx": 0.0,
"loss/logits": 0.025234002619981766,
"step": 983
},
{
"epoch": 0.008109645038199395,
"grad_norm": 3.0625,
"grad_norm_var": 0.859545644124349,
"learning_rate": 5e-05,
"loss": 0.1731,
"loss/crossentropy": 1.74605131149292,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1376953125,
"loss/idx": 0.0,
"loss/logits": 0.035446591675281525,
"step": 984
},
{
"epoch": 0.008117886547384557,
"grad_norm": 2.90625,
"grad_norm_var": 0.8763201395670573,
"learning_rate": 5e-05,
"loss": 0.124,
"loss/crossentropy": 1.0640939474105835,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1044921875,
"loss/idx": 0.0,
"loss/logits": 0.019489990547299385,
"step": 985
},
{
"epoch": 0.00812612805656972,
"grad_norm": 1.6171875,
"grad_norm_var": 0.8388987223307292,
"learning_rate": 5e-05,
"loss": 0.142,
"loss/crossentropy": 1.6699655055999756,
"loss/dist_ce": 0.0,
"loss/hidden": 0.11474609375,
"loss/idx": 0.0,
"loss/logits": 0.02729114145040512,
"step": 986
},
{
"epoch": 0.008134369565754881,
"grad_norm": 2.40625,
"grad_norm_var": 0.8218658447265625,
"learning_rate": 5e-05,
"loss": 0.1273,
"loss/crossentropy": 0.9518370032310486,
"loss/dist_ce": 0.0,
"loss/hidden": 0.10693359375,
"loss/idx": 0.0,
"loss/logits": 0.020414654165506363,
"step": 987
},
{
"epoch": 0.008142611074940043,
"grad_norm": 1.9453125,
"grad_norm_var": 0.8288530985514323,
"learning_rate": 5e-05,
"loss": 0.1543,
"loss/crossentropy": 2.4604508876800537,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1220703125,
"loss/idx": 0.0,
"loss/logits": 0.03225576505064964,
"step": 988
},
{
"epoch": 0.008150852584125205,
"grad_norm": 2.46875,
"grad_norm_var": 0.8185991923014323,
"learning_rate": 5e-05,
"loss": 0.1825,
"loss/crossentropy": 2.5151901245117188,
"loss/dist_ce": 0.0,
"loss/hidden": 0.140625,
"loss/idx": 0.0,
"loss/logits": 0.04188704490661621,
"step": 989
},
{
"epoch": 0.008159094093310367,
"grad_norm": 3.8125,
"grad_norm_var": 0.5173500061035157,
"learning_rate": 5e-05,
"loss": 0.1537,
"loss/crossentropy": 2.6265015602111816,
"loss/dist_ce": 0.0,
"loss/hidden": 0.119140625,
"loss/idx": 0.0,
"loss/logits": 0.03460276871919632,
"step": 990
},
{
"epoch": 0.008167335602495529,
"grad_norm": 1.4921875,
"grad_norm_var": 0.5271881103515625,
"learning_rate": 5e-05,
"loss": 0.1395,
"loss/crossentropy": 2.9253175258636475,
"loss/dist_ce": 0.0,
"loss/hidden": 0.109375,
"loss/idx": 0.0,
"loss/logits": 0.030113544315099716,
"step": 991
},
{
"epoch": 0.00817557711168069,
"grad_norm": 1.3828125,
"grad_norm_var": 0.5665484110514323,
"learning_rate": 5e-05,
"loss": 0.1456,
"loss/crossentropy": 2.487765073776245,
"loss/dist_ce": 0.0,
"loss/hidden": 0.11328125,
"loss/idx": 0.0,
"loss/logits": 0.032366957515478134,
"step": 992
},
{
"epoch": 0.008183818620865852,
"grad_norm": 1.796875,
"grad_norm_var": 0.5669146219889323,
"learning_rate": 5e-05,
"loss": 0.1707,
"loss/crossentropy": 2.329315185546875,
"loss/dist_ce": 0.0,
"loss/hidden": 0.13671875,
"loss/idx": 0.0,
"loss/logits": 0.03394667059183121,
"step": 993
},
{
"epoch": 0.008192060130051014,
"grad_norm": 3.78125,
"grad_norm_var": 0.7332354227701823,
"learning_rate": 5e-05,
"loss": 0.1519,
"loss/crossentropy": 2.148042917251587,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1220703125,
"loss/idx": 0.0,
"loss/logits": 0.029824528843164444,
"step": 994
},
{
"epoch": 0.008200301639236176,
"grad_norm": 25.625,
"grad_norm_var": 34.854078928629555,
"learning_rate": 5e-05,
"loss": 0.334,
"loss/crossentropy": 1.8628984689712524,
"loss/dist_ce": 0.0,
"loss/hidden": 0.27734375,
"loss/idx": 0.0,
"loss/logits": 0.05670515447854996,
"step": 995
},
{
"epoch": 0.008208543148421338,
"grad_norm": 1.4375,
"grad_norm_var": 34.81780497233073,
"learning_rate": 5e-05,
"loss": 0.1181,
"loss/crossentropy": 2.3790695667266846,
"loss/dist_ce": 0.0,
"loss/hidden": 0.0947265625,
"loss/idx": 0.0,
"loss/logits": 0.02332988940179348,
"step": 996
},
{
"epoch": 0.0082167846576065,
"grad_norm": 1.2734375,
"grad_norm_var": 34.854811350504555,
"learning_rate": 5e-05,
"loss": 0.1176,
"loss/crossentropy": 1.283698320388794,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1005859375,
"loss/idx": 0.0,
"loss/logits": 0.016972240060567856,
"step": 997
},
{
"epoch": 0.008225026166791664,
"grad_norm": 2.640625,
"grad_norm_var": 34.82328465779622,
"learning_rate": 5e-05,
"loss": 0.1779,
"loss/crossentropy": 2.7694525718688965,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1337890625,
"loss/idx": 0.0,
"loss/logits": 0.04413297772407532,
"step": 998
},
{
"epoch": 0.008233267675976826,
"grad_norm": 1.8828125,
"grad_norm_var": 34.77723388671875,
"learning_rate": 5e-05,
"loss": 0.1636,
"loss/crossentropy": 2.530651330947876,
"loss/dist_ce": 0.0,
"loss/hidden": 0.1298828125,
"loss/idx": 0.0,
"loss/logits": 0.0337049663066864,
"step": 999
},
{
"epoch": 0.008241509185161987,
"grad_norm": 5.5625,
"grad_norm_var": 34.94845784505208,
"learning_rate": 5e-05,
"loss": 0.278,
"loss/crossentropy": 1.1534559726715088,
"loss/dist_ce": 0.0,
"loss/hidden": 0.2333984375,
"loss/idx": 0.0,
"loss/logits": 0.04459930956363678,
"step": 1000
}
],
"logging_steps": 1,
"max_steps": 100000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.8956539674624e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}