{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.425,
  "eval_steps": 500,
  "global_step": 33000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "gate_value": 0.0,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 0
    },
    {
      "grad_norm": 0.0005426404532045126,
      "learning_rate": 1.3499999999999998e-06,
      "loss": 0.5017,
      "step": 10
    },
    {
      "gate_value": 2.407148258498637e-06,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 10
    },
    {
      "grad_norm": 0.0014531903434544802,
      "learning_rate": 2.85e-06,
      "loss": 0.4952,
      "step": 20
    },
    {
      "gate_value": 2.730801497818902e-06,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 20
    },
    {
      "grad_norm": 0.0007584649138152599,
      "learning_rate": 4.35e-06,
      "loss": 0.5197,
      "step": 30
    },
    {
      "gate_value": -4.140019882470369e-09,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 30
    },
    {
      "grad_norm": 0.002548313234001398,
      "learning_rate": 5.85e-06,
      "loss": 0.5194,
      "step": 40
    },
    {
      "gate_value": -9.026175575854722e-07,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 40
    },
    {
      "grad_norm": 0.002914809389039874,
      "learning_rate": 7.35e-06,
      "loss": 0.5269,
      "step": 50
    },
    {
      "gate_value": -1.088102408175473e-06,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 50
    },
    {
      "grad_norm": 0.0004767653881572187,
      "learning_rate": 8.849999999999998e-06,
      "loss": 0.51,
      "step": 60
    },
    {
      "gate_value": 5.859649718331639e-06,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 60
    },
    {
      "grad_norm": 0.000854952319059521,
      "learning_rate": 1.035e-05,
      "loss": 0.5154,
      "step": 70
    },
    {
      "gate_value": 3.494698103168048e-05,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 70
    },
    {
      "grad_norm": 0.006400048267096281,
      "learning_rate": 1.1849999999999998e-05,
      "loss": 0.5026,
      "step": 80
    },
    {
      "gate_value": 9.951820538844913e-05,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 80
    },
    {
      "grad_norm": 0.0003364745352882892,
      "learning_rate": 1.3349999999999998e-05,
      "loss": 0.5083,
      "step": 90
    },
    {
      "gate_value": 0.00029271937091834843,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 90
    },
    {
      "grad_norm": 0.09524580836296082,
      "learning_rate": 1.485e-05,
      "loss": 0.5008,
      "step": 100
    },
    {
      "gate_value": 0.000530488439835608,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 100
    },
    {
      "grad_norm": 0.05546204373240471,
      "learning_rate": 1.6349999999999998e-05,
      "loss": 0.4951,
      "step": 110
    },
    {
      "gate_value": 0.0008035043138079345,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 110
    },
    {
      "grad_norm": 0.16272345185279846,
      "learning_rate": 1.7849999999999997e-05,
      "loss": 0.5003,
      "step": 120
    },
    {
      "gate_value": 0.0010982438689097762,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 120
    },
    {
      "grad_norm": 0.23696964979171753,
      "learning_rate": 1.935e-05,
      "loss": 0.4987,
      "step": 130
    },
    {
      "gate_value": 0.001406953320838511,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 130
    },
    {
      "grad_norm": 0.13120773434638977,
      "learning_rate": 2.085e-05,
      "loss": 0.5179,
      "step": 140
    },
    {
      "gate_value": 0.001694629667326808,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 140
    },
    {
      "grad_norm": 0.1184849962592125,
      "learning_rate": 2.2349999999999998e-05,
      "loss": 0.4964,
      "step": 150
    },
    {
      "gate_value": 0.002073641400784254,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 150
    },
    {
      "grad_norm": 0.04556508734822273,
      "learning_rate": 2.3849999999999997e-05,
      "loss": 0.5121,
      "step": 160
    },
    {
      "gate_value": 0.002421831712126732,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 160
    },
    {
      "grad_norm": 1.6241488456726074,
      "learning_rate": 2.535e-05,
      "loss": 0.5208,
      "step": 170
    },
    {
      "gate_value": 0.0027016534004360437,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 170
    },
    {
      "grad_norm": 0.16671186685562134,
      "learning_rate": 2.6849999999999995e-05,
      "loss": 0.5104,
      "step": 180
    },
    {
      "gate_value": 0.0029992451891303062,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 180
    },
    {
      "grad_norm": 1.049537181854248,
      "learning_rate": 2.8349999999999998e-05,
      "loss": 0.5182,
      "step": 190
    },
    {
      "gate_value": 0.003241482889279723,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 190
    },
    {
      "grad_norm": 0.29614609479904175,
      "learning_rate": 2.985e-05,
      "loss": 0.5094,
      "step": 200
    },
    {
      "gate_value": 0.0032604828011244535,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 200
    },
    {
      "grad_norm": 0.44253769516944885,
      "learning_rate": 3.1349999999999996e-05,
      "loss": 0.508,
      "step": 210
    },
    {
      "gate_value": 0.0032824440859258175,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 210
    },
    {
      "grad_norm": 0.010273229330778122,
      "learning_rate": 3.285e-05,
      "loss": 0.502,
      "step": 220
    },
    {
      "gate_value": 0.0034144630189985037,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 220
    },
    {
      "grad_norm": 0.9588198661804199,
      "learning_rate": 3.435e-05,
      "loss": 0.5176,
      "step": 230
    },
    {
      "gate_value": 0.0035216009709984064,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 230
    },
    {
      "grad_norm": 1.1617745161056519,
      "learning_rate": 3.585e-05,
      "loss": 0.5123,
      "step": 240
    },
    {
      "gate_value": 0.0036562951281666756,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 240
    },
    {
      "grad_norm": 1.0517369508743286,
      "learning_rate": 3.735e-05,
      "loss": 0.515,
      "step": 250
    },
    {
      "gate_value": 0.0038379342295229435,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 250
    },
    {
      "grad_norm": 0.42224568128585815,
      "learning_rate": 3.8849999999999996e-05,
      "loss": 0.5109,
      "step": 260
    },
    {
      "gate_value": 0.004033830948174,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 260
    },
    {
      "grad_norm": 1.4201754331588745,
      "learning_rate": 4.035e-05,
      "loss": 0.5135,
      "step": 270
    },
    {
      "gate_value": 0.004163349512964487,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 270
    },
    {
      "grad_norm": 0.2649329900741577,
      "learning_rate": 4.185e-05,
      "loss": 0.5231,
      "step": 280
    },
    {
      "gate_value": 0.0040314337238669395,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 280
    },
    {
      "grad_norm": 0.06176350265741348,
      "learning_rate": 4.334999999999999e-05,
      "loss": 0.5,
      "step": 290
    },
    {
      "gate_value": 0.004018639679998159,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 290
    },
    {
      "grad_norm": 2.028596878051758,
      "learning_rate": 4.484999999999999e-05,
      "loss": 0.5075,
      "step": 300
    },
    {
      "gate_value": 0.00426053861156106,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 300
    },
    {
      "grad_norm": 1.6869006156921387,
      "learning_rate": 4.6349999999999995e-05,
      "loss": 0.5225,
      "step": 310
    },
    {
      "gate_value": 0.004472165368497372,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 310
    },
    {
      "grad_norm": 0.09726118296384811,
      "learning_rate": 4.785e-05,
      "loss": 0.4986,
      "step": 320
    },
    {
      "gate_value": 0.004443428013473749,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 320
    },
    {
      "grad_norm": 1.6347602605819702,
      "learning_rate": 4.935e-05,
      "loss": 0.5104,
      "step": 330
    },
    {
      "gate_value": 0.004441737663000822,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 330
    },
    {
      "grad_norm": 0.5094766616821289,
      "learning_rate": 5.0849999999999996e-05,
      "loss": 0.5075,
      "step": 340
    },
    {
      "gate_value": 0.0046890913508832455,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 340
    },
    {
      "grad_norm": 0.13526400923728943,
      "learning_rate": 5.234999999999999e-05,
      "loss": 0.4973,
      "step": 350
    },
    {
      "gate_value": 0.0050169299356639385,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 350
    },
    {
      "grad_norm": 0.2546197175979614,
      "learning_rate": 5.3849999999999994e-05,
      "loss": 0.4989,
      "step": 360
    },
    {
      "gate_value": 0.005079896654933691,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 360
    },
    {
      "grad_norm": 0.4461250901222229,
      "learning_rate": 5.535e-05,
      "loss": 0.4993,
      "step": 370
    },
    {
      "gate_value": 0.005180824548006058,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 370
    },
    {
      "grad_norm": 0.11173674464225769,
      "learning_rate": 5.684999999999999e-05,
      "loss": 0.4963,
      "step": 380
    },
    {
      "gate_value": 0.005200548097491264,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 380
    },
    {
      "grad_norm": 0.050604645162820816,
      "learning_rate": 5.8349999999999995e-05,
      "loss": 0.5173,
      "step": 390
    },
    {
      "gate_value": 0.005443707574158907,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 390
    },
    {
      "grad_norm": 0.23654121160507202,
      "learning_rate": 5.985e-05,
      "loss": 0.5035,
      "step": 400
    },
    {
      "gate_value": 0.005672121420502663,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 400
    },
    {
      "grad_norm": 0.854729413986206,
      "learning_rate": 6.134999999999999e-05,
      "loss": 0.5289,
      "step": 410
    },
    {
      "gate_value": 0.005770199932157993,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 410
    },
    {
      "grad_norm": 0.31689170002937317,
      "learning_rate": 6.285e-05,
      "loss": 0.4897,
      "step": 420
    },
    {
      "gate_value": 0.005838444456458092,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 420
    },
    {
      "grad_norm": 0.13151150941848755,
      "learning_rate": 6.434999999999999e-05,
      "loss": 0.5071,
      "step": 430
    },
    {
      "gate_value": 0.006072549149394035,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 430
    },
    {
      "grad_norm": 0.3894720673561096,
      "learning_rate": 6.584999999999999e-05,
      "loss": 0.5082,
      "step": 440
    },
    {
      "gate_value": 0.006371940486133099,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 440
    },
    {
      "grad_norm": 1.0877070426940918,
      "learning_rate": 6.735e-05,
      "loss": 0.5012,
      "step": 450
    },
    {
      "gate_value": 0.006615795660763979,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 450
    },
    {
      "grad_norm": 1.3406215906143188,
      "learning_rate": 6.884999999999999e-05,
      "loss": 0.5193,
      "step": 460
    },
    {
      "gate_value": 0.006942338310182095,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 460
    },
    {
      "grad_norm": 1.0344425439834595,
      "learning_rate": 7.034999999999999e-05,
      "loss": 0.4969,
      "step": 470
    },
    {
      "gate_value": 0.007014387287199497,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 470
    },
    {
      "grad_norm": 0.24051937460899353,
      "learning_rate": 7.184999999999998e-05,
      "loss": 0.5019,
      "step": 480
    },
    {
      "gate_value": 0.007041481789201498,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 480
    },
    {
      "grad_norm": 0.24314218759536743,
      "learning_rate": 7.335e-05,
      "loss": 0.5165,
      "step": 490
    },
    {
      "gate_value": 0.007269471418112516,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 490
    },
    {
      "grad_norm": 0.8075150847434998,
      "learning_rate": 7.484999999999999e-05,
      "loss": 0.5024,
      "step": 500
    },
    {
      "gate_value": 0.00759664922952652,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 500
    },
    {
      "grad_norm": 1.4133530855178833,
      "learning_rate": 7.635e-05,
      "loss": 0.5059,
      "step": 510
    },
    {
      "gate_value": 0.007728520315140486,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 510
    },
    {
      "grad_norm": 0.4607851505279541,
      "learning_rate": 7.785e-05,
      "loss": 0.5165,
      "step": 520
    },
    {
      "gate_value": 0.008264156058430672,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 520
    },
    {
      "grad_norm": 0.3550325930118561,
      "learning_rate": 7.934999999999999e-05,
      "loss": 0.5042,
      "step": 530
    },
    {
      "gate_value": 0.008512669242918491,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 530
    },
    {
      "grad_norm": 0.1921839714050293,
      "learning_rate": 8.085e-05,
      "loss": 0.4936,
      "step": 540
    },
    {
      "gate_value": 0.008503591641783714,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 540
    },
    {
      "grad_norm": 0.5971819162368774,
      "learning_rate": 8.235e-05,
      "loss": 0.5087,
      "step": 550
    },
    {
      "gate_value": 0.008713321760296822,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 550
    },
    {
      "grad_norm": 0.0914035215973854,
      "learning_rate": 8.385e-05,
      "loss": 0.4861,
      "step": 560
    },
    {
      "gate_value": 0.009013736620545387,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 560
    },
    {
      "grad_norm": 0.11908543854951859,
      "learning_rate": 8.534999999999999e-05,
      "loss": 0.4941,
      "step": 570
    },
    {
      "gate_value": 0.009317001327872276,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 570
    },
    {
      "grad_norm": 0.2162892073392868,
      "learning_rate": 8.684999999999998e-05,
      "loss": 0.5106,
      "step": 580
    },
    {
      "gate_value": 0.009594669565558434,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 580
    },
    {
      "grad_norm": 0.7542089223861694,
      "learning_rate": 8.834999999999999e-05,
      "loss": 0.5217,
      "step": 590
    },
    {
      "gate_value": 0.009778150357306004,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 590
    },
    {
      "grad_norm": 0.09826211631298065,
      "learning_rate": 8.984999999999999e-05,
      "loss": 0.51,
      "step": 600
    },
    {
      "gate_value": 0.010008268058300018,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 600
    },
    {
      "grad_norm": 0.8503474593162537,
      "learning_rate": 9.134999999999998e-05,
      "loss": 0.4968,
      "step": 610
    },
    {
      "gate_value": 0.010128550231456757,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 610
    },
    {
      "grad_norm": 1.022375226020813,
      "learning_rate": 9.285e-05,
      "loss": 0.5022,
      "step": 620
    },
    {
      "gate_value": 0.010452290996909142,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 620
    },
    {
      "grad_norm": 1.4550477266311646,
      "learning_rate": 9.434999999999999e-05,
      "loss": 0.5124,
      "step": 630
    },
    {
      "gate_value": 0.01016128808259964,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 630
    },
    {
      "grad_norm": 0.3279017508029938,
      "learning_rate": 9.585e-05,
      "loss": 0.5014,
      "step": 640
    },
    {
      "gate_value": 0.010357659310102463,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 640
    },
    {
      "grad_norm": 0.12235884368419647,
      "learning_rate": 9.735e-05,
      "loss": 0.5094,
      "step": 650
    },
    {
      "gate_value": 0.010804083198308945,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 650
    },
    {
      "grad_norm": 0.11897007375955582,
      "learning_rate": 9.884999999999999e-05,
      "loss": 0.4895,
      "step": 660
    },
    {
      "gate_value": 0.011202951893210411,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 660
    },
    {
      "grad_norm": 0.2711750268936157,
      "learning_rate": 0.00010035,
      "loss": 0.4964,
      "step": 670
    },
    {
      "gate_value": 0.011760725639760494,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 670
    },
    {
      "grad_norm": 0.3249741494655609,
      "learning_rate": 0.00010185,
      "loss": 0.4867,
      "step": 680
    },
    {
      "gate_value": 0.012173856608569622,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 680
    },
    {
      "grad_norm": 0.30625203251838684,
      "learning_rate": 0.00010334999999999998,
      "loss": 0.5046,
      "step": 690
    },
    {
      "gate_value": 0.01261813659220934,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 690
    },
    {
      "grad_norm": 0.014923992566764355,
      "learning_rate": 0.00010484999999999999,
      "loss": 0.5079,
      "step": 700
    },
    {
      "gate_value": 0.012826438061892986,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 700
    },
    {
      "grad_norm": 0.14983655512332916,
      "learning_rate": 0.00010634999999999998,
      "loss": 0.5051,
      "step": 710
    },
    {
      "gate_value": 0.012988142669200897,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 710
    },
    {
      "grad_norm": 0.5405359864234924,
      "learning_rate": 0.00010784999999999999,
      "loss": 0.4939,
      "step": 720
    },
    {
      "gate_value": 0.013321064412593842,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 720
    },
    {
      "grad_norm": 0.16480578482151031,
      "learning_rate": 0.00010934999999999999,
      "loss": 0.487,
      "step": 730
    },
    {
      "gate_value": 0.013345051556825638,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 730
    },
    {
      "grad_norm": 0.0591035857796669,
      "learning_rate": 0.00011084999999999998,
      "loss": 0.4962,
      "step": 740
    },
    {
      "gate_value": 0.013593264855444431,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 740
    },
    {
      "grad_norm": 0.20157171785831451,
      "learning_rate": 0.00011235,
      "loss": 0.5034,
      "step": 750
    },
    {
      "gate_value": 0.014044249430298805,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 750
    },
    {
      "grad_norm": 0.8499253392219543,
      "learning_rate": 0.00011384999999999999,
      "loss": 0.4846,
      "step": 760
    },
    {
      "gate_value": 0.014514867216348648,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 760
    },
    {
      "grad_norm": 0.3363804817199707,
      "learning_rate": 0.00011535,
      "loss": 0.492,
      "step": 770
    },
    {
      "gate_value": 0.014975903555750847,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 770
    },
    {
      "grad_norm": 0.4271162152290344,
      "learning_rate": 0.00011685,
      "loss": 0.5139,
      "step": 780
    },
    {
      "gate_value": 0.015347362495958805,
      "icl_sequence_length": 58,
      "num_contexts": 3,
      "step": 780
    },
    {
      "grad_norm": 0.07361973077058792,
      "learning_rate": 0.00011834999999999999,
      "loss": 0.5002,
      "step": 790
    },
    {
      "gate_value": 0.0156552717089653,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 790
    },
    {
      "grad_norm": 0.042372770607471466,
      "learning_rate": 0.00011985,
      "loss": 0.4942,
      "step": 800
    },
    {
      "gate_value": 0.01595964841544628,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 800
    },
    {
      "grad_norm": 0.3930019438266754,
      "learning_rate": 0.00012135,
      "loss": 0.5074,
      "step": 810
    },
    {
      "gate_value": 0.016275372356176376,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 810
    },
    {
      "grad_norm": 0.2732306718826294,
      "learning_rate": 0.00012284999999999998,
      "loss": 0.5001,
      "step": 820
    },
    {
      "gate_value": 0.01652705669403076,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 820
    },
    {
      "grad_norm": 0.06455976516008377,
      "learning_rate": 0.00012435,
      "loss": 0.5188,
      "step": 830
    },
    {
      "gate_value": 0.016465678811073303,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 830
    },
    {
      "grad_norm": 0.19495975971221924,
      "learning_rate": 0.00012585,
      "loss": 0.4919,
      "step": 840
    },
    {
      "gate_value": 0.016236618161201477,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 840
    },
    {
      "grad_norm": 0.2919803261756897,
      "learning_rate": 0.00012734999999999998,
      "loss": 0.5025,
      "step": 850
    },
    {
      "gate_value": 0.016103582456707954,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 850
    },
    {
      "grad_norm": 0.06609172374010086,
      "learning_rate": 0.00012885,
      "loss": 0.4796,
      "step": 860
    },
    {
      "gate_value": 0.016391385346651077,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 860
    },
    {
      "grad_norm": 0.08503394573926926,
      "learning_rate": 0.00013035,
      "loss": 0.486,
      "step": 870
    },
    {
      "gate_value": 0.016892239451408386,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 870
    },
    {
      "grad_norm": 0.0417107455432415,
      "learning_rate": 0.00013184999999999998,
      "loss": 0.5064,
      "step": 880
    },
    {
      "gate_value": 0.01728055067360401,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 880
    },
    {
      "grad_norm": 0.15999619662761688,
      "learning_rate": 0.00013335,
      "loss": 0.4913,
      "step": 890
    },
    {
      "gate_value": 0.017700038850307465,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 890
    },
    {
      "grad_norm": 0.3381291627883911,
      "learning_rate": 0.00013485,
      "loss": 0.478,
      "step": 900
    },
    {
      "gate_value": 0.017996180802583694,
      "icl_sequence_length": 96,
      "num_contexts": 3,
      "step": 900
    },
    {
      "grad_norm": 0.294606477022171,
      "learning_rate": 0.00013634999999999998,
      "loss": 0.5028,
      "step": 910
    },
    {
      "gate_value": 0.018202368170022964,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 910
    },
    {
      "grad_norm": 0.34997233748435974,
      "learning_rate": 0.00013785,
      "loss": 0.4892,
      "step": 920
    },
    {
      "gate_value": 0.01846320927143097,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 920
    },
    {
      "grad_norm": 0.045470427721738815,
      "learning_rate": 0.00013935,
      "loss": 0.4955,
      "step": 930
    },
    {
      "gate_value": 0.019063180312514305,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 930
    },
    {
      "grad_norm": 0.5815165042877197,
      "learning_rate": 0.00014084999999999998,
      "loss": 0.5002,
      "step": 940
    },
    {
      "gate_value": 0.019558217376470566,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 940
    },
    {
      "grad_norm": 0.2606048583984375,
      "learning_rate": 0.00014235,
      "loss": 0.4999,
      "step": 950
    },
    {
      "gate_value": 0.019863948225975037,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 950
    },
    {
      "grad_norm": 0.17119236290454865,
      "learning_rate": 0.00014384999999999997,
      "loss": 0.4776,
      "step": 960
    },
    {
      "gate_value": 0.019959047436714172,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 960
    },
    {
      "grad_norm": 0.34382596611976624,
      "learning_rate": 0.00014534999999999998,
      "loss": 0.4833,
      "step": 970
    },
    {
      "gate_value": 0.0202496275305748,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 970
    },
    {
      "grad_norm": 0.35615915060043335,
      "learning_rate": 0.00014685,
      "loss": 0.4879,
      "step": 980
    },
    {
      "gate_value": 0.020738650113344193,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 980
    },
    {
      "grad_norm": 1.0342373847961426,
      "learning_rate": 0.00014834999999999997,
      "loss": 0.4922,
      "step": 990
    },
    {
      "gate_value": 0.02130601368844509,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 990
    },
    {
      "grad_norm": 0.5985687375068665,
      "learning_rate": 0.00014984999999999998,
      "loss": 0.4829,
      "step": 1000
    },
    {
      "gate_value": 0.021172812208533287,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 1000
    },
    {
      "grad_norm": 0.6002101302146912,
      "learning_rate": 0.00015134999999999997,
      "loss": 0.4958,
      "step": 1010
    },
    {
      "gate_value": 0.021218104287981987,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 1010
    },
    {
      "grad_norm": 0.041248299181461334,
      "learning_rate": 0.00015284999999999997,
      "loss": 0.4881,
      "step": 1020
    },
    {
      "gate_value": 0.021419478580355644,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 1020
    },
    {
      "grad_norm": 0.024791941046714783,
      "learning_rate": 0.00015434999999999998,
      "loss": 0.4768,
      "step": 1030
    },
    {
      "gate_value": 0.021730070933699608,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 1030
    },
    {
      "grad_norm": 0.3756122887134552,
      "learning_rate": 0.00015584999999999997,
      "loss": 0.4765,
      "step": 1040
    },
    {
      "gate_value": 0.02219421975314617,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 1040
    },
    {
      "grad_norm": 0.10375858843326569,
      "learning_rate": 0.00015734999999999998,
      "loss": 0.4935,
      "step": 1050
    },
    {
      "gate_value": 0.02247510477900505,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 1050
    },
    {
      "grad_norm": 0.1254405975341797,
      "learning_rate": 0.00015884999999999999,
      "loss": 0.4967,
      "step": 1060
    },
    {
      "gate_value": 0.02249990962445736,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 1060
    },
    {
      "grad_norm": 0.41738972067832947,
      "learning_rate": 0.00016034999999999997,
      "loss": 0.4802,
      "step": 1070
    },
    {
      "gate_value": 0.02269868366420269,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 1070
    },
    {
      "grad_norm": 0.28267791867256165,
      "learning_rate": 0.00016184999999999998,
      "loss": 0.4799,
      "step": 1080
    },
    {
      "gate_value": 0.023243827745318413,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 1080
    },
    {
      "grad_norm": 0.07439376413822174,
      "learning_rate": 0.00016334999999999999,
      "loss": 0.4946,
      "step": 1090
    },
    {
      "gate_value": 0.023553457111120224,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 1090
    },
    {
      "grad_norm": 0.08923624455928802,
      "learning_rate": 0.00016485,
      "loss": 0.5008,
      "step": 1100
    },
    {
      "gate_value": 0.02370520494878292,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 1100
    },
    {
      "grad_norm": 0.04910886287689209,
      "learning_rate": 0.00016634999999999998,
      "loss": 0.5038,
      "step": 1110
    },
    {
      "gate_value": 0.02379109151661396,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 1110
    },
    {
      "grad_norm": 0.2725479304790497,
      "learning_rate": 0.00016785,
      "loss": 0.48,
      "step": 1120
    },
    {
      "gate_value": 0.024141671136021614,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 1120
    },
    {
      "grad_norm": 0.025722775608301163,
      "learning_rate": 0.00016935,
      "loss": 0.4886,
      "step": 1130
    },
    {
      "gate_value": 0.024660132825374603,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 1130
    },
    {
      "grad_norm": 0.2205311357975006,
      "learning_rate": 0.00017084999999999998,
      "loss": 0.4879,
      "step": 1140
    },
    {
      "gate_value": 0.0249018631875515,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 1140
    },
    {
      "grad_norm": 0.24804756045341492,
      "learning_rate": 0.00017235,
      "loss": 0.4851,
      "step": 1150
    },
    {
      "gate_value": 0.024769756942987442,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 1150
    },
    {
      "grad_norm": 0.029389042407274246,
      "learning_rate": 0.00017385,
      "loss": 0.4914,
      "step": 1160
    },
    {
      "gate_value": 0.0248092133551836,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 1160
    },
    {
      "grad_norm": 0.4203813076019287,
      "learning_rate": 0.00017534999999999998,
      "loss": 0.4618,
      "step": 1170
    },
    {
      "gate_value": 0.025401754304766655,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 1170
    },
    {
      "grad_norm": 0.30038025975227356,
      "learning_rate": 0.00017685,
      "loss": 0.4842,
      "step": 1180
    },
    {
      "gate_value": 0.02615329623222351,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 1180
    },
    {
      "grad_norm": 0.05151379108428955,
      "learning_rate": 0.00017835,
      "loss": 0.4929,
      "step": 1190
    },
    {
      "gate_value": 0.02670441009104252,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 1190
    },
    {
      "grad_norm": 0.03685954958200455,
      "learning_rate": 0.00017984999999999998,
      "loss": 0.4851,
      "step": 1200
    },
    {
      "gate_value": 0.026601877063512802,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 1200
    },
    {
      "grad_norm": 0.15710294246673584,
      "learning_rate": 0.00018135,
      "loss": 0.4759,
      "step": 1210
    },
    {
      "gate_value": 0.02667396143078804,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 1210
    },
    {
      "grad_norm": 0.3143344819545746,
      "learning_rate": 0.00018285,
      "loss": 0.5068,
      "step": 1220
    },
    {
      "gate_value": 0.02670447900891304,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 1220
    },
    {
      "grad_norm": 0.1960684210062027,
      "learning_rate": 0.00018435,
      "loss": 0.4893,
      "step": 1230
    },
    {
      "gate_value": 0.02659144438803196,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 1230
    },
    {
      "grad_norm": 0.05600379407405853,
      "learning_rate": 0.00018585,
      "loss": 0.4886,
      "step": 1240
    },
    {
      "gate_value": 0.026692554354667664,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 1240
    },
    {
      "grad_norm": 0.4063480794429779,
      "learning_rate": 0.00018735,
      "loss": 0.4997,
      "step": 1250
    },
    {
      "gate_value": 0.027202336117625237,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 1250
    },
    {
      "grad_norm": 0.12925776839256287,
      "learning_rate": 0.00018884999999999996,
      "loss": 0.486,
      "step": 1260
    },
    {
      "gate_value": 0.027684736996889114,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 1260
    },
    {
      "grad_norm": 0.1763681322336197,
      "learning_rate": 0.00019034999999999996,
      "loss": 0.4925,
      "step": 1270
    },
    {
      "gate_value": 0.028168512508273125,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 1270
    },
    {
      "grad_norm": 0.2327224165201187,
      "learning_rate": 0.00019184999999999997,
      "loss": 0.4672,
      "step": 1280
    },
    {
      "gate_value": 0.028568653389811516,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 1280
    },
    {
      "grad_norm": 0.0987858697772026,
      "learning_rate": 0.00019334999999999998,
      "loss": 0.4948,
      "step": 1290
    },
    {
      "gate_value": 0.028939686715602875,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 1290
    },
    {
      "grad_norm": 0.4070354998111725,
      "learning_rate": 0.00019484999999999997,
      "loss": 0.4959,
      "step": 1300
    },
    {
      "gate_value": 0.028898224234580994,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 1300
    },
    {
      "grad_norm": 0.14461100101470947,
      "learning_rate": 0.00019634999999999998,
      "loss": 0.4818,
      "step": 1310
    },
    {
      "gate_value": 0.029195407405495644,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 1310
    },
    {
      "grad_norm": 0.0757102370262146,
      "learning_rate": 0.00019784999999999998,
      "loss": 0.4879,
      "step": 1320
    },
    {
      "gate_value": 0.029616594314575195,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 1320
    },
    {
      "grad_norm": 0.07737571746110916,
      "learning_rate": 0.00019934999999999997,
      "loss": 0.4633,
      "step": 1330
    },
    {
      "gate_value": 0.03013782575726509,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 1330
    },
    {
      "grad_norm": 0.08557803928852081,
      "learning_rate": 0.00020084999999999998,
      "loss": 0.4849,
      "step": 1340
    },
    {
      "gate_value": 0.030645808205008507,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 1340
    },
    {
      "grad_norm": 0.062334273010492325,
      "learning_rate": 0.00020234999999999999,
      "loss": 0.4856,
      "step": 1350
    },
    {
      "gate_value": 0.030954955145716667,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 1350
    },
    {
      "grad_norm": 0.10366081446409225,
      "learning_rate": 0.00020384999999999997,
      "loss": 0.4857,
      "step": 1360
    },
    {
      "gate_value": 0.030923420563340187,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 1360
    },
    {
      "grad_norm": 0.045627713203430176,
      "learning_rate": 0.00020534999999999998,
      "loss": 0.4701,
      "step": 1370
    },
    {
      "gate_value": 0.030991991981863976,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 1370
    },
    {
      "grad_norm": 0.2241339385509491,
      "learning_rate": 0.00020684999999999999,
      "loss": 0.4736,
      "step": 1380
    },
    {
      "gate_value": 0.03133467212319374,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 1380
    },
    {
      "grad_norm": 0.29695796966552734,
      "learning_rate": 0.00020835,
      "loss": 0.4808,
      "step": 1390
    },
    {
      "gate_value": 0.03150676190853119,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 1390
    },
    {
      "grad_norm": 0.28195545077323914,
      "learning_rate": 0.00020984999999999998,
      "loss": 0.496,
      "step": 1400
    },
    {
      "gate_value": 0.03169822692871094,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 1400
    },
    {
      "grad_norm": 0.2775692045688629,
      "learning_rate": 0.00021135,
      "loss": 0.4751,
      "step": 1410
    },
    {
      "gate_value": 0.031678296625614166,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 1410
    },
    {
      "grad_norm": 0.2424466758966446,
      "learning_rate": 0.00021285,
      "loss": 0.467,
      "step": 1420
    },
    {
      "gate_value": 0.032020535320043564,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 1420
    },
    {
      "grad_norm": 0.22923263907432556,
      "learning_rate": 0.00021434999999999998,
      "loss": 0.4701,
      "step": 1430
    },
    {
      "gate_value": 0.03269083425402641,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 1430
    },
    {
      "grad_norm": 0.34625235199928284,
      "learning_rate": 0.00021585,
      "loss": 0.471,
      "step": 1440
    },
    {
      "gate_value": 0.033144090324640274,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 1440
    },
    {
      "grad_norm": 0.11075719445943832,
      "learning_rate": 0.00021735,
      "loss": 0.4731,
      "step": 1450
    },
    {
      "gate_value": 0.033079419285058975,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 1450
    },
    {
      "grad_norm": 0.12235695123672485,
      "learning_rate": 0.00021884999999999998,
      "loss": 0.4775,
      "step": 1460
    },
    {
      "gate_value": 0.032961998134851456,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 1460
    },
    {
      "grad_norm": 0.023144006729125977,
      "learning_rate": 0.00022035,
      "loss": 0.4742,
      "step": 1470
    },
    {
      "gate_value": 0.0333966389298439,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 1470
    },
    {
      "grad_norm": 0.09035952389240265,
      "learning_rate": 0.00022185,
      "loss": 0.4869,
      "step": 1480
    },
    {
      "gate_value": 0.033850379288196564,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 1480
    },
    {
      "grad_norm": 0.028102407231926918,
      "learning_rate": 0.00022335,
      "loss": 0.459,
      "step": 1490
    },
    {
      "gate_value": 0.03416411206126213,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 1490
    },
    {
      "grad_norm": 0.02450348250567913,
      "learning_rate": 0.00022485,
      "loss": 0.4591,
      "step": 1500
    },
    {
      "gate_value": 0.034449104219675064,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 1500
    },
    {
      "grad_norm": 0.05895009636878967,
      "learning_rate": 0.00022634999999999997,
      "loss": 0.4898,
      "step": 1510
    },
    {
      "gate_value": 0.034758225083351135,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 1510
    },
    {
      "grad_norm": 0.0686354786157608,
      "learning_rate": 0.00022784999999999995,
      "loss": 0.4675,
      "step": 1520
    },
    {
      "gate_value": 0.03516167402267456,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 1520
    },
    {
      "grad_norm": 0.028159357607364655,
      "learning_rate": 0.00022934999999999996,
      "loss": 0.4851,
      "step": 1530
    },
    {
      "gate_value": 0.03552531823515892,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 1530
    },
    {
      "grad_norm": 0.13609439134597778,
      "learning_rate": 0.00023084999999999997,
      "loss": 0.4766,
      "step": 1540
    },
    {
      "gate_value": 0.03593320772051811,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 1540
    },
    {
      "grad_norm": 0.15504056215286255,
      "learning_rate": 0.00023234999999999998,
      "loss": 0.4619,
      "step": 1550
    },
    {
      "gate_value": 0.03641683608293533,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 1550
    },
    {
      "grad_norm": 0.040739111602306366,
      "learning_rate": 0.00023384999999999997,
      "loss": 0.4636,
      "step": 1560
    },
    {
      "gate_value": 0.03673742339015007,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 1560
    },
    {
      "grad_norm": 0.14744669198989868,
      "learning_rate": 0.00023534999999999997,
      "loss": 0.4955,
      "step": 1570
    },
    {
      "gate_value": 0.036518871784210205,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 1570
    },
    {
      "grad_norm": 0.20779630541801453,
      "learning_rate": 0.00023684999999999998,
      "loss": 0.4931,
      "step": 1580
    },
    {
      "gate_value": 0.03666940703988075,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 1580
    },
    {
      "grad_norm": 0.02847031131386757,
      "learning_rate": 0.00023834999999999997,
      "loss": 0.4923,
      "step": 1590
    },
    {
      "gate_value": 0.03700125217437744,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 1590
    },
    {
      "grad_norm": 0.16238997876644135,
      "learning_rate": 0.00023984999999999998,
      "loss": 0.4749,
      "step": 1600
    },
    {
      "gate_value": 0.03693028539419174,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 1600
    },
    {
      "grad_norm": 0.04399807006120682,
      "learning_rate": 0.00024134999999999998,
      "loss": 0.4827,
      "step": 1610
    },
    {
      "gate_value": 0.03712141513824463,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 1610
    },
    {
      "grad_norm": 0.07293123006820679,
      "learning_rate": 0.00024284999999999997,
      "loss": 0.4892,
      "step": 1620
    },
    {
      "gate_value": 0.0372476652264595,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 1620
    },
    {
      "grad_norm": 0.14840328693389893,
      "learning_rate": 0.00024435,
      "loss": 0.4627,
      "step": 1630
    },
    {
      "gate_value": 0.037470173090696335,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 1630
    },
    {
      "grad_norm": 0.13052290678024292,
      "learning_rate": 0.00024585,
      "loss": 0.4734,
      "step": 1640
    },
    {
      "gate_value": 0.03787413239479065,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 1640
    },
    {
      "grad_norm": 0.04076918587088585,
      "learning_rate": 0.00024734999999999997,
      "loss": 0.485,
      "step": 1650
    },
    {
      "gate_value": 0.03815087303519249,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 1650
    },
    {
      "grad_norm": 0.2749229669570923,
      "learning_rate": 0.00024885,
      "loss": 0.457,
      "step": 1660
    },
    {
      "gate_value": 0.03850769251585007,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 1660
    },
    {
      "grad_norm": 0.2708996534347534,
      "learning_rate": 0.00025035,
      "loss": 0.4775,
      "step": 1670
    },
    {
      "gate_value": 0.03835766762495041,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 1670
    },
    {
      "grad_norm": 0.08414936065673828,
      "learning_rate": 0.00025184999999999997,
      "loss": 0.4748,
      "step": 1680
    },
    {
      "gate_value": 0.03850135579705238,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 1680
    },
    {
      "grad_norm": 0.04802856966853142,
      "learning_rate": 0.00025335,
      "loss": 0.4668,
      "step": 1690
    },
    {
      "gate_value": 0.03900991007685661,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 1690
    },
    {
      "grad_norm": 0.24531228840351105,
      "learning_rate": 0.00025485,
      "loss": 0.4846,
      "step": 1700
    },
    {
      "gate_value": 0.03973180428147316,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 1700
    },
    {
      "grad_norm": 0.08588869124650955,
      "learning_rate": 0.00025634999999999997,
      "loss": 0.4847,
      "step": 1710
    },
    {
      "gate_value": 0.0403984971344471,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 1710
    },
    {
      "grad_norm": 0.2342216819524765,
      "learning_rate": 0.00025785,
      "loss": 0.4727,
      "step": 1720
    },
    {
      "gate_value": 0.04085838794708252,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 1720
    },
    {
      "grad_norm": 0.06164858862757683,
      "learning_rate": 0.00025935,
      "loss": 0.483,
      "step": 1730
    },
    {
      "gate_value": 0.04160254821181297,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 1730
    },
    {
      "grad_norm": 0.10281935334205627,
      "learning_rate": 0.00026084999999999997,
      "loss": 0.4664,
      "step": 1740
    },
    {
      "gate_value": 0.04127310961484909,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 1740
    },
    {
      "grad_norm": 0.18388307094573975,
      "learning_rate": 0.00026235,
      "loss": 0.4757,
      "step": 1750
    },
    {
      "gate_value": 0.041262272745370865,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 1750
    },
    {
      "grad_norm": 0.08633152395486832,
      "learning_rate": 0.00026384999999999994,
      "loss": 0.4685,
      "step": 1760
    },
    {
      "gate_value": 0.041280150413513184,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 1760
    },
    {
      "grad_norm": 0.05839018523693085,
      "learning_rate": 0.00026534999999999997,
      "loss": 0.5055,
      "step": 1770
    },
    {
      "gate_value": 0.0408598892390728,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 1770
    },
    {
      "grad_norm": 0.1357850283384323,
      "learning_rate": 0.00026684999999999995,
      "loss": 0.4611,
      "step": 1780
    },
    {
      "gate_value": 0.04074737802147865,
      "icl_sequence_length": 58,
      "num_contexts": 3,
      "step": 1780
    },
    {
      "grad_norm": 0.26462557911872864,
      "learning_rate": 0.00026835,
      "loss": 0.4787,
      "step": 1790
    },
    {
      "gate_value": 0.04107224568724632,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 1790
    },
    {
      "grad_norm": 0.11214753240346909,
      "learning_rate": 0.00026984999999999997,
      "loss": 0.4654,
      "step": 1800
    },
    {
      "gate_value": 0.04110245779156685,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 1800
    },
    {
      "grad_norm": 0.095204658806324,
      "learning_rate": 0.00027134999999999995,
      "loss": 0.4927,
      "step": 1810
    },
    {
      "gate_value": 0.04148384928703308,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 1810
    },
    {
      "grad_norm": 0.22705064713954926,
      "learning_rate": 0.00027285,
      "loss": 0.4727,
      "step": 1820
    },
    {
      "gate_value": 0.04165264591574669,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 1820
    },
    {
      "grad_norm": 0.051133785396814346,
      "learning_rate": 0.00027435,
      "loss": 0.4743,
      "step": 1830
    },
    {
      "gate_value": 0.041887927800416946,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 1830
    },
    {
      "grad_norm": 0.024134185165166855,
      "learning_rate": 0.00027584999999999996,
      "loss": 0.4624,
      "step": 1840
    },
    {
      "gate_value": 0.04222255200147629,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 1840
    },
    {
      "grad_norm": 0.06055133417248726,
      "learning_rate": 0.00027735,
      "loss": 0.4866,
      "step": 1850
    },
    {
      "gate_value": 0.04211531952023506,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 1850
    },
    {
      "grad_norm": 0.10934657603502274,
      "learning_rate": 0.00027885,
      "loss": 0.4649,
      "step": 1860
    },
    {
      "gate_value": 0.042022984474897385,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 1860
    },
    {
      "grad_norm": 0.1160777285695076,
      "learning_rate": 0.00028034999999999996,
      "loss": 0.4423,
      "step": 1870
    },
    {
      "gate_value": 0.04227515682578087,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 1870
    },
    {
      "grad_norm": 0.12855137884616852,
      "learning_rate": 0.00028185,
      "loss": 0.4899,
      "step": 1880
    },
    {
      "gate_value": 0.04226217046380043,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 1880
    },
    {
      "grad_norm": 0.05965856835246086,
      "learning_rate": 0.00028335,
      "loss": 0.4737,
      "step": 1890
    },
    {
      "gate_value": 0.041885219514369965,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 1890
    },
    {
      "grad_norm": 0.19230695068836212,
      "learning_rate": 0.00028484999999999996,
      "loss": 0.4779,
      "step": 1900
    },
    {
      "gate_value": 0.04192354902625084,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 1900
    },
    {
      "grad_norm": 0.05444691330194473,
      "learning_rate": 0.00028635,
      "loss": 0.4896,
      "step": 1910
    },
    {
      "gate_value": 0.04169577360153198,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 1910
    },
    {
      "grad_norm": 0.030353045091032982,
      "learning_rate": 0.00028785,
      "loss": 0.4935,
      "step": 1920
    },
    {
      "gate_value": 0.04207930713891983,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 1920
    },
    {
      "grad_norm": 0.15619803965091705,
      "learning_rate": 0.00028934999999999996,
      "loss": 0.4779,
      "step": 1930
    },
    {
      "gate_value": 0.04228321462869644,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 1930
    },
    {
      "grad_norm": 0.11080749332904816,
      "learning_rate": 0.00029085,
      "loss": 0.4513,
      "step": 1940
    },
    {
      "gate_value": 0.0428403876721859,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 1940
    },
    {
      "grad_norm": 0.1745726615190506,
      "learning_rate": 0.00029235,
      "loss": 0.4838,
      "step": 1950
    },
    {
      "gate_value": 0.04332621768116951,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 1950
    },
    {
      "grad_norm": 0.11912817507982254,
      "learning_rate": 0.00029384999999999996,
      "loss": 0.4669,
      "step": 1960
    },
    {
      "gate_value": 0.0436142161488533,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 1960
    },
    {
      "grad_norm": 0.05294900760054588,
      "learning_rate": 0.00029535,
      "loss": 0.4674,
      "step": 1970
    },
    {
      "gate_value": 0.0439755953848362,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 1970
    },
    {
      "grad_norm": 0.03383156657218933,
      "learning_rate": 0.00029685,
      "loss": 0.4676,
      "step": 1980
    },
    {
      "gate_value": 0.0445464663207531,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 1980
    },
    {
      "grad_norm": 0.14340122044086456,
      "learning_rate": 0.00029835,
      "loss": 0.484,
      "step": 1990
    },
    {
      "gate_value": 0.044701505452394485,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 1990
    },
    {
      "grad_norm": 0.1341182142496109,
      "learning_rate": 0.00029985,
      "loss": 0.4739,
      "step": 2000
    },
    {
      "gate_value": 0.044620223343372345,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 2000
    },
    {
      "grad_norm": 0.15212862193584442,
      "learning_rate": 0.00029999995847794736,
      "loss": 0.4748,
      "step": 2010
    },
    {
      "gate_value": 0.0448896661400795,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 2010
    },
    {
      "grad_norm": 0.1537584662437439,
      "learning_rate": 0.0002999998149449555,
      "loss": 0.4752,
      "step": 2020
    },
    {
      "gate_value": 0.045422472059726715,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 2020
    },
    {
      "grad_norm": 0.07360043376684189,
      "learning_rate": 0.0002999995688885045,
      "loss": 0.4605,
      "step": 2030
    },
    {
      "gate_value": 0.04550457373261452,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 2030
    },
    {
      "grad_norm": 0.030054304748773575,
      "learning_rate": 0.0002999992203087627,
      "loss": 0.4835,
      "step": 2040
    },
    {
      "gate_value": 0.04544057324528694,
      "icl_sequence_length": 56,
      "num_contexts": 3,
      "step": 2040
    },
    {
      "grad_norm": 0.03757965564727783,
      "learning_rate": 0.00029999876920596807,
      "loss": 0.4776,
      "step": 2050
    },
    {
      "gate_value": 0.04557052254676819,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 2050
    },
    {
      "grad_norm": 0.2675529420375824,
      "learning_rate": 0.0002999982155804292,
      "loss": 0.4698,
      "step": 2060
    },
    {
      "gate_value": 0.046622686088085175,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 2060
    },
    {
      "grad_norm": 0.26907795667648315,
      "learning_rate": 0.0002999975594325243,
      "loss": 0.4853,
      "step": 2070
    },
    {
      "gate_value": 0.04718781262636185,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 2070
    },
    {
      "grad_norm": 0.07856488227844238,
      "learning_rate": 0.00029999680076270204,
      "loss": 0.4859,
      "step": 2080
    },
    {
      "gate_value": 0.04703153297305107,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 2080
    },
    {
      "grad_norm": 0.12983615696430206,
      "learning_rate": 0.00029999593957148073,
      "loss": 0.4663,
      "step": 2090
    },
    {
      "gate_value": 0.046463314443826675,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 2090
    },
    {
      "grad_norm": 0.08406257629394531,
      "learning_rate": 0.00029999497585944917,
      "loss": 0.4668,
      "step": 2100
    },
    {
      "gate_value": 0.046239107847213745,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 2100
    },
    {
      "grad_norm": 0.0722755640745163,
      "learning_rate": 0.0002999939096272659,
      "loss": 0.4527,
      "step": 2110
    },
    {
      "gate_value": 0.04589474946260452,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 2110
    },
    {
      "grad_norm": 0.18733340501785278,
      "learning_rate": 0.0002999927408756598,
      "loss": 0.4601,
      "step": 2120
    },
    {
      "gate_value": 0.04604862630367279,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 2120
    },
    {
      "grad_norm": 0.032498251646757126,
      "learning_rate": 0.0002999914696054297,
      "loss": 0.4585,
      "step": 2130
    },
    {
      "gate_value": 0.047017332166433334,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 2130
    },
    {
      "grad_norm": 0.04576157405972481,
      "learning_rate": 0.0002999900958174444,
      "loss": 0.476,
      "step": 2140
    },
    {
      "gate_value": 0.047887593507766724,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 2140
    },
    {
      "grad_norm": 0.034247320145368576,
      "learning_rate": 0.00029998861951264296,
      "loss": 0.4756,
      "step": 2150
    },
    {
      "gate_value": 0.04841304570436478,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 2150
    },
    {
      "grad_norm": 0.022174621000885963,
      "learning_rate": 0.00029998704069203436,
      "loss": 0.4666,
      "step": 2160
    },
    {
      "gate_value": 0.048587359488010406,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 2160
    },
    {
      "grad_norm": 0.0923171117901802,
      "learning_rate": 0.0002999853593566978,
      "loss": 0.4671,
      "step": 2170
    },
    {
      "gate_value": 0.04884525388479233,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 2170
    },
    {
      "grad_norm": 0.026745932176709175,
      "learning_rate": 0.00029998357550778236,
      "loss": 0.4598,
      "step": 2180
    },
    {
      "gate_value": 0.04901081696152687,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 2180
    },
    {
      "grad_norm": 0.029031245037913322,
      "learning_rate": 0.00029998168914650733,
      "loss": 0.4664,
      "step": 2190
    },
    {
      "gate_value": 0.049416057765483856,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 2190
    },
    {
      "grad_norm": 0.11745218932628632,
      "learning_rate": 0.000299979700274162,
      "loss": 0.4619,
      "step": 2200
    },
    {
      "gate_value": 0.04974166676402092,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 2200
    },
    {
      "grad_norm": 0.18687647581100464,
      "learning_rate": 0.0002999776088921058,
      "loss": 0.4771,
      "step": 2210
    },
    {
      "gate_value": 0.04972897097468376,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 2210
    },
    {
      "grad_norm": 0.2829289436340332,
      "learning_rate": 0.00029997541500176804,
      "loss": 0.4785,
      "step": 2220
    },
    {
      "gate_value": 0.04986254498362541,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 2220
    },
    {
      "grad_norm": 0.12327824532985687,
      "learning_rate": 0.0002999731186046484,
      "loss": 0.4658,
      "step": 2230
    },
    {
      "gate_value": 0.05015156418085098,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 2230
    },
    {
      "grad_norm": 0.07140158116817474,
      "learning_rate": 0.00029997071970231623,
      "loss": 0.4648,
      "step": 2240
    },
    {
      "gate_value": 0.04988813400268555,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 2240
    },
    {
      "grad_norm": 0.10163920372724533,
      "learning_rate": 0.0002999682182964114,
      "loss": 0.4697,
      "step": 2250
    },
    {
      "gate_value": 0.04962354525923729,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 2250
    },
    {
      "grad_norm": 0.2764798402786255,
      "learning_rate": 0.00029996561438864344,
      "loss": 0.4698,
      "step": 2260
    },
    {
      "gate_value": 0.04965106397867203,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 2260
    },
    {
      "grad_norm": 0.024615973234176636,
      "learning_rate": 0.00029996290798079214,
      "loss": 0.4581,
      "step": 2270
    },
    {
      "gate_value": 0.049827586859464645,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 2270
    },
    {
      "grad_norm": 0.1376759111881256,
      "learning_rate": 0.0002999600990747073,
      "loss": 0.4773,
      "step": 2280
    },
    {
      "gate_value": 0.04991050437092781,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 2280
    },
    {
      "grad_norm": 0.04156076908111572,
      "learning_rate": 0.0002999571876723088,
      "loss": 0.4807,
      "step": 2290
    },
    {
      "gate_value": 0.049750544130802155,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 2290
    },
    {
      "grad_norm": 0.10670457035303116,
      "learning_rate": 0.00029995417377558654,
      "loss": 0.469,
      "step": 2300
    },
    {
      "gate_value": 0.0495314821600914,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 2300
    },
    {
      "grad_norm": 0.1581500917673111,
      "learning_rate": 0.0002999510573866005,
      "loss": 0.4652,
      "step": 2310
    },
    {
      "gate_value": 0.0497625507414341,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 2310
    },
    {
      "grad_norm": 0.0921480730175972,
      "learning_rate": 0.00029994783850748063,
      "loss": 0.4724,
      "step": 2320
    },
    {
      "gate_value": 0.0499161072075367,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 2320
    },
    {
      "grad_norm": 0.07795640826225281,
      "learning_rate": 0.00029994451714042707,
      "loss": 0.4849,
      "step": 2330
    },
    {
      "gate_value": 0.05015277490019798,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 2330
    },
    {
      "grad_norm": 0.03227244317531586,
      "learning_rate": 0.00029994109328770993,
      "loss": 0.4723,
      "step": 2340
    },
    {
      "gate_value": 0.050364185124635696,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 2340
    },
    {
      "grad_norm": 0.43648087978363037,
      "learning_rate": 0.00029993756695166943,
      "loss": 0.4874,
      "step": 2350
    },
    {
      "gate_value": 0.05060106888413429,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 2350
    },
    {
      "grad_norm": 0.10248932242393494,
      "learning_rate": 0.00029993393813471575,
      "loss": 0.4489,
      "step": 2360
    },
    {
      "gate_value": 0.05064888298511505,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 2360
    },
    {
      "grad_norm": 0.20501983165740967,
      "learning_rate": 0.0002999302068393291,
      "loss": 0.4513,
      "step": 2370
    },
    {
      "gate_value": 0.05121821165084839,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 2370
    },
    {
      "grad_norm": 0.22547470033168793,
      "learning_rate": 0.0002999263730680599,
      "loss": 0.4572,
      "step": 2380
    },
    {
      "gate_value": 0.052095528692007065,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 2380
    },
    {
      "grad_norm": 0.12664879858493805,
      "learning_rate": 0.0002999224368235284,
      "loss": 0.4546,
      "step": 2390
    },
    {
      "gate_value": 0.051816221326589584,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 2390
    },
    {
      "grad_norm": 0.02900230884552002,
      "learning_rate": 0.000299918398108425,
      "loss": 0.4522,
      "step": 2400
    },
    {
      "gate_value": 0.05174678564071655,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 2400
    },
    {
      "grad_norm": 0.06299610435962677,
      "learning_rate": 0.00029991425692551014,
      "loss": 0.4706,
      "step": 2410
    },
    {
      "gate_value": 0.05185849592089653,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 2410
    },
    {
      "grad_norm": 0.20941582322120667,
      "learning_rate": 0.00029991001327761427,
      "loss": 0.4686,
      "step": 2420
    },
    {
      "gate_value": 0.05203567072749138,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 2420
    },
    {
      "grad_norm": 0.08857905119657516,
      "learning_rate": 0.00029990566716763797,
      "loss": 0.4835,
      "step": 2430
    },
    {
      "gate_value": 0.05252448096871376,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 2430
    },
    {
      "grad_norm": 0.023652268573641777,
      "learning_rate": 0.0002999012185985516,
      "loss": 0.48,
      "step": 2440
    },
    {
      "gate_value": 0.05304626375436783,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 2440
    },
    {
      "grad_norm": 0.06863145530223846,
      "learning_rate": 0.0002998966675733958,
      "loss": 0.4819,
      "step": 2450
    },
    {
      "gate_value": 0.05308978632092476,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 2450
    },
    {
      "grad_norm": 0.09551398456096649,
      "learning_rate": 0.0002998920140952812,
      "loss": 0.4741,
      "step": 2460
    },
    {
      "gate_value": 0.05287903547286987,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 2460
    },
    {
      "grad_norm": 0.17554223537445068,
      "learning_rate": 0.00029988725816738833,
      "loss": 0.4555,
      "step": 2470
    },
    {
      "gate_value": 0.053737007081508636,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 2470
    },
    {
      "grad_norm": 0.13025638461112976,
      "learning_rate": 0.00029988239979296784,
      "loss": 0.4517,
      "step": 2480
    },
    {
      "gate_value": 0.05436123162508011,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 2480
    },
    {
      "grad_norm": 0.07448191940784454,
      "learning_rate": 0.00029987743897534044,
      "loss": 0.4611,
      "step": 2490
    },
    {
      "gate_value": 0.054754164069890976,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 2490
    },
    {
      "grad_norm": 0.353579044342041,
      "learning_rate": 0.00029987237571789675,
      "loss": 0.472,
      "step": 2500
    },
    {
      "gate_value": 0.05532870441675186,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 2500
    },
    {
      "grad_norm": 0.10554645210504532,
      "learning_rate": 0.0002998672100240975,
      "loss": 0.4759,
      "step": 2510
    },
    {
      "gate_value": 0.055256184190511703,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 2510
    },
    {
      "grad_norm": 0.08449801802635193,
      "learning_rate": 0.00029986194189747333,
      "loss": 0.4543,
      "step": 2520
    },
    {
      "gate_value": 0.054673366248607635,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 2520
    },
    {
      "grad_norm": 0.06385741382837296,
      "learning_rate": 0.000299856571341625,
      "loss": 0.4588,
      "step": 2530
    },
    {
      "gate_value": 0.05435393378138542,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 2530
    },
    {
      "grad_norm": 0.12640225887298584,
      "learning_rate": 0.00029985109836022314,
      "loss": 0.4553,
      "step": 2540
    },
    {
      "gate_value": 0.05447719991207123,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 2540
    },
    {
      "grad_norm": 0.04395339637994766,
      "learning_rate": 0.00029984552295700867,
      "loss": 0.4685,
      "step": 2550
    },
    {
      "gate_value": 0.05475514754652977,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 2550
    },
    {
      "grad_norm": 0.06887280941009521,
      "learning_rate": 0.0002998398451357921,
      "loss": 0.4424,
      "step": 2560
    },
    {
      "gate_value": 0.055225860327482224,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 2560
    },
    {
      "grad_norm": 0.2743040919303894,
      "learning_rate": 0.00029983406490045444,
      "loss": 0.458,
      "step": 2570
    },
    {
      "gate_value": 0.05573255196213722,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 2570
    },
    {
      "grad_norm": 0.1339769959449768,
      "learning_rate": 0.0002998281822549462,
      "loss": 0.4706,
      "step": 2580
    },
    {
      "gate_value": 0.056551918387413025,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 2580
    },
    {
      "grad_norm": 0.09543494880199432,
      "learning_rate": 0.00029982219720328814,
      "loss": 0.456,
      "step": 2590
    },
    {
      "gate_value": 0.05700315535068512,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 2590
    },
    {
      "grad_norm": 0.17091640830039978,
      "learning_rate": 0.0002998161097495711,
      "loss": 0.4662,
      "step": 2600
    },
    {
      "gate_value": 0.05705432966351509,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 2600
    },
    {
      "grad_norm": 0.25809577107429504,
      "learning_rate": 0.00029980991989795566,
      "loss": 0.4694,
      "step": 2610
    },
    {
      "gate_value": 0.057157181203365326,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 2610
    },
    {
      "grad_norm": 0.06581462919712067,
      "learning_rate": 0.00029980362765267264,
      "loss": 0.4877,
      "step": 2620
    },
    {
      "gate_value": 0.05712047219276428,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 2620
    },
    {
      "grad_norm": 0.13200043141841888,
      "learning_rate": 0.00029979723301802266,
      "loss": 0.4728,
      "step": 2630
    },
    {
      "gate_value": 0.057480890303850174,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 2630
    },
    {
      "grad_norm": 0.1044127494096756,
      "learning_rate": 0.0002997907359983764,
      "loss": 0.4721,
      "step": 2640
    },
    {
      "gate_value": 0.057946741580963135,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 2640
    },
    {
      "grad_norm": 0.036862898617982864,
      "learning_rate": 0.00029978413659817455,
      "loss": 0.4665,
      "step": 2650
    },
    {
      "gate_value": 0.05749443545937538,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 2650
    },
    {
      "grad_norm": 0.16739346086978912,
      "learning_rate": 0.00029977743482192774,
      "loss": 0.465,
      "step": 2660
    },
    {
      "gate_value": 0.0574820339679718,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 2660
    },
    {
      "grad_norm": 0.06671813130378723,
      "learning_rate": 0.0002997706306742165,
      "loss": 0.471,
      "step": 2670
    },
    {
      "gate_value": 0.0580253079533577,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 2670
    },
    {
      "grad_norm": 0.06606713682413101,
      "learning_rate": 0.0002997637241596915,
      "loss": 0.4842,
      "step": 2680
    },
    {
      "gate_value": 0.058306340128183365,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 2680
    },
    {
      "grad_norm": 0.1716826856136322,
      "learning_rate": 0.0002997567152830732,
      "loss": 0.4661,
      "step": 2690
    },
    {
      "gate_value": 0.058256857097148895,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 2690
    },
    {
      "grad_norm": 0.028431877493858337,
      "learning_rate": 0.0002997496040491521,
      "loss": 0.4693,
      "step": 2700
    },
    {
      "gate_value": 0.05827565863728523,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 2700
    },
    {
      "grad_norm": 0.09539343416690826,
      "learning_rate": 0.0002997423904627887,
      "loss": 0.456,
      "step": 2710
    },
    {
      "gate_value": 0.05831799656152725,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 2710
    },
    {
      "grad_norm": 0.03917115554213524,
      "learning_rate": 0.0002997350745289134,
      "loss": 0.471,
      "step": 2720
    },
    {
      "gate_value": 0.058655787259340286,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 2720
    },
    {
      "grad_norm": 0.17695604264736176,
      "learning_rate": 0.0002997276562525266,
      "loss": 0.4678,
      "step": 2730
    },
    {
      "gate_value": 0.059178613126277924,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 2730
    },
    {
      "grad_norm": 0.07515741884708405,
      "learning_rate": 0.00029972013563869863,
      "loss": 0.475,
      "step": 2740
    },
    {
      "gate_value": 0.059215329587459564,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 2740
    },
    {
      "grad_norm": 0.07590952515602112,
      "learning_rate": 0.00029971251269256965,
      "loss": 0.4602,
      "step": 2750
    },
    {
      "gate_value": 0.05963375046849251,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 2750
    },
    {
      "grad_norm": 0.030797870829701424,
      "learning_rate": 0.00029970478741934997,
      "loss": 0.448,
      "step": 2760
    },
    {
      "gate_value": 0.060271523892879486,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 2760
    },
    {
      "grad_norm": 0.03239508345723152,
      "learning_rate": 0.00029969695982431975,
      "loss": 0.4738,
      "step": 2770
    },
    {
      "gate_value": 0.06078481674194336,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 2770
    },
    {
      "grad_norm": 0.05549526959657669,
      "learning_rate": 0.000299689029912829,
      "loss": 0.4734,
      "step": 2780
    },
    {
      "gate_value": 0.06084667146205902,
      "icl_sequence_length": 58,
      "num_contexts": 3,
      "step": 2780
    },
    {
      "grad_norm": 0.03209852799773216,
      "learning_rate": 0.00029968099769029787,
      "loss": 0.4521,
      "step": 2790
    },
    {
      "gate_value": 0.06139722093939781,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 2790
    },
    {
      "grad_norm": 0.050206031650304794,
      "learning_rate": 0.00029967286316221614,
      "loss": 0.4769,
      "step": 2800
    },
    {
      "gate_value": 0.06171596422791481,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 2800
    },
    {
      "grad_norm": 0.4463041126728058,
      "learning_rate": 0.00029966462633414383,
      "loss": 0.463,
      "step": 2810
    },
    {
      "gate_value": 0.061615847051143646,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 2810
    },
    {
      "grad_norm": 0.04375645890831947,
      "learning_rate": 0.0002996562872117106,
      "loss": 0.4443,
      "step": 2820
    },
    {
      "gate_value": 0.061395056545734406,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 2820
    },
    {
      "grad_norm": 0.08162671327590942,
      "learning_rate": 0.00029964784580061634,
      "loss": 0.4716,
      "step": 2830
    },
    {
      "gate_value": 0.06112854182720184,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 2830
    },
    {
      "grad_norm": 0.029063262045383453,
      "learning_rate": 0.0002996393021066305,
      "loss": 0.4406,
      "step": 2840
    },
    {
      "gate_value": 0.061373159289360046,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 2840
    },
    {
      "grad_norm": 0.2159145176410675,
      "learning_rate": 0.0002996306561355927,
      "loss": 0.4558,
      "step": 2850
    },
    {
      "gate_value": 0.061609722673892975,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 2850
    },
    {
      "grad_norm": 0.1503428816795349,
      "learning_rate": 0.00029962190789341233,
      "loss": 0.4751,
      "step": 2860
    },
    {
      "gate_value": 0.06192683055996895,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 2860
    },
    {
      "grad_norm": 0.03672568127512932,
      "learning_rate": 0.00029961305738606883,
      "loss": 0.4424,
      "step": 2870
    },
    {
      "gate_value": 0.06256543844938278,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 2870
    },
    {
      "grad_norm": 0.1288597583770752,
      "learning_rate": 0.00029960410461961134,
      "loss": 0.4569,
      "step": 2880
    },
    {
      "gate_value": 0.06346543878316879,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 2880
    },
    {
      "grad_norm": 0.14279378950595856,
      "learning_rate": 0.00029959504960015904,
      "loss": 0.4591,
      "step": 2890
    },
    {
      "gate_value": 0.06372940540313721,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 2890
    },
    {
      "grad_norm": 0.09644819051027298,
      "learning_rate": 0.0002995858923339009,
      "loss": 0.4624,
      "step": 2900
    },
    {
      "gate_value": 0.0635669156908989,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 2900
    },
    {
      "grad_norm": 0.0700480043888092,
      "learning_rate": 0.00029957663282709587,
      "loss": 0.44,
      "step": 2910
    },
    {
      "gate_value": 0.06419818103313446,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 2910
    },
    {
      "grad_norm": 0.14946900308132172,
      "learning_rate": 0.00029956727108607274,
      "loss": 0.4672,
      "step": 2920
    },
    {
      "gate_value": 0.06462504714727402,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 2920
    },
    {
      "grad_norm": 0.0538860447704792,
      "learning_rate": 0.0002995578071172302,
      "loss": 0.4541,
      "step": 2930
    },
    {
      "gate_value": 0.06455416232347488,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 2930
    },
    {
      "grad_norm": 0.03887515142560005,
      "learning_rate": 0.0002995482409270367,
      "loss": 0.4544,
      "step": 2940
    },
    {
      "gate_value": 0.06476987153291702,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 2940
    },
    {
      "grad_norm": 0.031727902591228485,
      "learning_rate": 0.00029953857252203067,
      "loss": 0.4748,
      "step": 2950
    },
    {
      "gate_value": 0.06503763049840927,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 2950
    },
    {
      "grad_norm": 0.04176465794444084,
      "learning_rate": 0.00029952880190882035,
      "loss": 0.463,
      "step": 2960
    },
    {
      "gate_value": 0.0653427243232727,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 2960
    },
    {
      "grad_norm": 0.026758193969726562,
      "learning_rate": 0.0002995189290940839,
      "loss": 0.4666,
      "step": 2970
    },
    {
      "gate_value": 0.06526095420122147,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 2970
    },
    {
      "grad_norm": 0.15899816155433655,
      "learning_rate": 0.0002995089540845694,
      "loss": 0.4504,
      "step": 2980
    },
    {
      "gate_value": 0.06506571173667908,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 2980
    },
    {
      "grad_norm": 0.2439725548028946,
      "learning_rate": 0.0002994988768870945,
      "loss": 0.4547,
      "step": 2990
    },
    {
      "gate_value": 0.06512710452079773,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 2990
    },
    {
      "grad_norm": 0.18706493079662323,
      "learning_rate": 0.00029948869750854695,
      "loss": 0.4626,
      "step": 3000
    },
    {
      "gate_value": 0.06544939428567886,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 3000
    },
    {
      "grad_norm": 0.16449908912181854,
      "learning_rate": 0.0002994784159558842,
      "loss": 0.4509,
      "step": 3010
    },
    {
      "gate_value": 0.06566469371318817,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 3010
    },
    {
      "grad_norm": 0.07561691105365753,
      "learning_rate": 0.00029946803223613374,
      "loss": 0.4484,
      "step": 3020
    },
    {
      "gate_value": 0.06594062596559525,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 3020
    },
    {
      "grad_norm": 0.10772091150283813,
      "learning_rate": 0.0002994575463563925,
      "loss": 0.4524,
      "step": 3030
    },
    {
      "gate_value": 0.06642261147499084,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 3030
    },
    {
      "grad_norm": 0.08066357672214508,
      "learning_rate": 0.00029944695832382777,
      "loss": 0.4494,
      "step": 3040
    },
    {
      "gate_value": 0.06684261560440063,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 3040
    },
    {
      "grad_norm": 0.0352531373500824,
      "learning_rate": 0.00029943626814567617,
      "loss": 0.4564,
      "step": 3050
    },
    {
      "gate_value": 0.06724800169467926,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 3050
    },
    {
      "grad_norm": 0.05193459987640381,
      "learning_rate": 0.0002994254758292444,
      "loss": 0.4642,
      "step": 3060
    },
    {
      "gate_value": 0.06739187985658646,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 3060
    },
    {
      "grad_norm": 0.05352374166250229,
      "learning_rate": 0.0002994145813819089,
      "loss": 0.4611,
      "step": 3070
    },
    {
      "gate_value": 0.06732220947742462,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 3070
    },
    {
      "grad_norm": 0.03663728013634682,
      "learning_rate": 0.0002994035848111159,
      "loss": 0.459,
      "step": 3080
    },
    {
      "gate_value": 0.06713177263736725,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 3080
    },
    {
      "grad_norm": 0.08161330968141556,
      "learning_rate": 0.00029939248612438147,
      "loss": 0.4586,
      "step": 3090
    },
    {
      "gate_value": 0.06736379861831665,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 3090
    },
    {
      "grad_norm": 0.1583651602268219,
      "learning_rate": 0.0002993812853292915,
      "loss": 0.4577,
      "step": 3100
    },
    {
      "gate_value": 0.06740887463092804,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 3100
    },
    {
      "grad_norm": 0.07588821649551392,
      "learning_rate": 0.00029936998243350153,
      "loss": 0.4599,
      "step": 3110
    },
    {
      "gate_value": 0.06748247146606445,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 3110
    },
    {
      "grad_norm": 0.04477818310260773,
      "learning_rate": 0.00029935857744473705,
      "loss": 0.438,
      "step": 3120
    },
    {
      "gate_value": 0.06831636279821396,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 3120
    },
    {
      "grad_norm": 0.11088748276233673,
      "learning_rate": 0.0002993470703707933,
      "loss": 0.4511,
      "step": 3130
    },
    {
      "gate_value": 0.06878136098384857,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 3130
    },
    {
      "grad_norm": 0.07485643774271011,
      "learning_rate": 0.0002993354612195352,
      "loss": 0.4567,
      "step": 3140
    },
    {
      "gate_value": 0.06921064108610153,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 3140
    },
    {
      "grad_norm": 0.04625440016388893,
      "learning_rate": 0.0002993237499988975,
      "loss": 0.45,
      "step": 3150
    },
    {
      "gate_value": 0.06880256533622742,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 3150
    },
    {
      "grad_norm": 0.037098273634910583,
      "learning_rate": 0.0002993119367168847,
      "loss": 0.4516,
      "step": 3160
    },
    {
      "gate_value": 0.06890203058719635,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 3160
    },
    {
      "grad_norm": 0.1960594356060028,
      "learning_rate": 0.0002993000213815711,
      "loss": 0.4566,
      "step": 3170
    },
    {
      "gate_value": 0.06906116008758545,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 3170
    },
    {
      "grad_norm": 0.0907864198088646,
      "learning_rate": 0.0002992880040011007,
      "loss": 0.4524,
      "step": 3180
    },
    {
      "gate_value": 0.06909805536270142,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 3180
    },
    {
      "grad_norm": 0.037198636680841446,
      "learning_rate": 0.0002992758845836873,
      "loss": 0.4641,
      "step": 3190
    },
    {
      "gate_value": 0.06939682364463806,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 3190
    },
    {
      "grad_norm": 0.24401098489761353,
      "learning_rate": 0.00029926366313761424,
      "loss": 0.4631,
      "step": 3200
    },
    {
      "gate_value": 0.06982459127902985,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 3200
    },
    {
      "grad_norm": 0.0669318288564682,
      "learning_rate": 0.000299251339671235,
      "loss": 0.4619,
      "step": 3210
    },
    {
      "gate_value": 0.07002398371696472,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 3210
    },
    {
      "grad_norm": 0.13897764682769775,
      "learning_rate": 0.0002992389141929724,
      "loss": 0.4531,
      "step": 3220
    },
    {
      "gate_value": 0.07005146890878677,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 3220
    },
    {
      "grad_norm": 0.04050607606768608,
      "learning_rate": 0.00029922638671131926,
      "loss": 0.4563,
      "step": 3230
    },
    {
      "gate_value": 0.0697348564863205,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 3230
    },
    {
      "grad_norm": 0.11345645785331726,
      "learning_rate": 0.0002992137572348379,
      "loss": 0.4592,
      "step": 3240
    },
    {
      "gate_value": 0.06972472369670868,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 3240
    },
    {
      "grad_norm": 0.11163768172264099,
      "learning_rate": 0.00029920102577216047,
      "loss": 0.4337,
      "step": 3250
    },
    {
      "gate_value": 0.0699225589632988,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 3250
    },
    {
      "grad_norm": 0.09180065244436264,
      "learning_rate": 0.0002991881923319888,
      "loss": 0.4614,
      "step": 3260
    },
    {
      "gate_value": 0.06991465389728546,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 3260
    },
    {
      "grad_norm": 0.16114754974842072,
      "learning_rate": 0.00029917525692309445,
      "loss": 0.4514,
      "step": 3270
    },
    {
      "gate_value": 0.06968920677900314,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 3270
    },
    {
      "grad_norm": 0.08679977059364319,
      "learning_rate": 0.0002991622195543186,
      "loss": 0.4711,
      "step": 3280
    },
    {
      "gate_value": 0.069790318608284,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 3280
    },
    {
      "grad_norm": 0.04381372407078743,
      "learning_rate": 0.0002991490802345722,
      "loss": 0.4695,
      "step": 3290
    },
    {
      "gate_value": 0.07006606459617615,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 3290
    },
    {
      "grad_norm": 0.09389933198690414,
      "learning_rate": 0.0002991358389728359,
      "loss": 0.4612,
      "step": 3300
    },
    {
      "gate_value": 0.07035694271326065,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 3300
    },
    {
      "grad_norm": 0.045784782618284225,
      "learning_rate": 0.00029912249577815987,
      "loss": 0.4487,
      "step": 3310
    },
    {
      "gate_value": 0.07085049897432327,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 3310
    },
    {
      "grad_norm": 0.06604770570993423,
      "learning_rate": 0.0002991090506596641,
      "loss": 0.4448,
      "step": 3320
    },
    {
      "gate_value": 0.07110884040594101,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 3320
    },
    {
      "grad_norm": 0.15612971782684326,
      "learning_rate": 0.0002990955036265383,
      "loss": 0.449,
      "step": 3330
    },
    {
      "gate_value": 0.07128405570983887,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 3330
    },
    {
      "grad_norm": 0.13052192330360413,
      "learning_rate": 0.0002990818546880416,
      "loss": 0.4533,
      "step": 3340
    },
    {
      "gate_value": 0.07144956290721893,
      "icl_sequence_length": 58,
      "num_contexts": 3,
      "step": 3340
    },
    {
      "grad_norm": 0.10136935114860535,
      "learning_rate": 0.000299068103853503,
      "loss": 0.4516,
      "step": 3350
    },
    {
      "gate_value": 0.07161328196525574,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 3350
    },
    {
      "grad_norm": 0.042142871767282486,
      "learning_rate": 0.00029905425113232103,
      "loss": 0.4645,
      "step": 3360
    },
    {
      "gate_value": 0.07148417830467224,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 3360
    },
    {
      "grad_norm": 0.1631345897912979,
      "learning_rate": 0.0002990402965339639,
      "loss": 0.4448,
      "step": 3370
    },
    {
      "gate_value": 0.0716477707028389,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 3370
    },
    {
      "grad_norm": 0.11523299664258957,
      "learning_rate": 0.0002990262400679695,
      "loss": 0.4547,
      "step": 3380
    },
    {
      "gate_value": 0.07228045165538788,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 3380
    },
    {
      "grad_norm": 0.08556123077869415,
      "learning_rate": 0.0002990120817439452,
      "loss": 0.4426,
      "step": 3390
    },
    {
      "gate_value": 0.0726013332605362,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 3390
    },
    {
      "grad_norm": 0.04559708759188652,
      "learning_rate": 0.00029899782157156817,
      "loss": 0.4481,
      "step": 3400
    },
    {
      "gate_value": 0.07277483493089676,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 3400
    },
    {
      "grad_norm": 0.11405778676271439,
      "learning_rate": 0.000298983459560585,
      "loss": 0.4429,
      "step": 3410
    },
    {
      "gate_value": 0.07276701927185059,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 3410
    },
    {
      "grad_norm": 0.05555117875337601,
      "learning_rate": 0.00029896899572081216,
      "loss": 0.4561,
      "step": 3420
    },
    {
      "gate_value": 0.0728112980723381,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 3420
    },
    {
      "grad_norm": 0.03393098711967468,
      "learning_rate": 0.00029895443006213536,
      "loss": 0.4507,
      "step": 3430
    },
    {
      "gate_value": 0.07333412766456604,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 3430
    },
    {
      "grad_norm": 0.08307201415300369,
      "learning_rate": 0.0002989397625945102,
      "loss": 0.448,
      "step": 3440
    },
    {
      "gate_value": 0.07309713959693909,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 3440
    },
    {
      "grad_norm": 0.04834305867552757,
      "learning_rate": 0.00029892499332796166,
      "loss": 0.4551,
      "step": 3450
    },
    {
      "gate_value": 0.0728369876742363,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 3450
    },
    {
      "grad_norm": 0.03444100171327591,
      "learning_rate": 0.00029891012227258447,
      "loss": 0.462,
      "step": 3460
    },
    {
      "gate_value": 0.07332317531108856,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 3460
    },
    {
      "grad_norm": 0.03249925747513771,
      "learning_rate": 0.00029889514943854284,
      "loss": 0.4605,
      "step": 3470
    },
    {
      "gate_value": 0.07408445328474045,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 3470
    },
    {
      "grad_norm": 0.1658170372247696,
      "learning_rate": 0.0002988800748360706,
      "loss": 0.4629,
      "step": 3480
    },
    {
      "gate_value": 0.0744827389717102,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 3480
    },
    {
      "grad_norm": 0.10207600891590118,
      "learning_rate": 0.00029886489847547114,
      "loss": 0.4501,
      "step": 3490
    },
    {
      "gate_value": 0.07433953136205673,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 3490
    },
    {
      "grad_norm": 0.11472195386886597,
      "learning_rate": 0.00029884962036711717,
      "loss": 0.444,
      "step": 3500
    },
    {
      "gate_value": 0.07421142607927322,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 3500
    },
    {
      "grad_norm": 0.06376820802688599,
      "learning_rate": 0.00029883424052145127,
      "loss": 0.4551,
      "step": 3510
    },
    {
      "gate_value": 0.07423210889101028,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 3510
    },
    {
      "grad_norm": 0.09487133473157883,
      "learning_rate": 0.00029881875894898543,
      "loss": 0.4733,
      "step": 3520
    },
    {
      "gate_value": 0.07424761354923248,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 3520
    },
    {
      "grad_norm": 0.04656476154923439,
      "learning_rate": 0.00029880317566030113,
      "loss": 0.4543,
      "step": 3530
    },
    {
      "gate_value": 0.07421663403511047,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 3530
    },
    {
      "grad_norm": 0.09370381385087967,
      "learning_rate": 0.00029878749066604936,
      "loss": 0.447,
      "step": 3540
    },
    {
      "gate_value": 0.07410692423582077,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 3540
    },
    {
      "grad_norm": 0.16475284099578857,
      "learning_rate": 0.0002987717039769507,
      "loss": 0.4584,
      "step": 3550
    },
    {
      "gate_value": 0.0745421051979065,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 3550
    },
    {
      "grad_norm": 0.16828690469264984,
      "learning_rate": 0.00029875581560379527,
      "loss": 0.4671,
      "step": 3560
    },
    {
      "gate_value": 0.07483482360839844,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 3560
    },
    {
      "grad_norm": 0.14615994691848755,
      "learning_rate": 0.0002987398255574425,
      "loss": 0.4543,
      "step": 3570
    },
    {
      "gate_value": 0.0748705267906189,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 3570
    },
    {
      "grad_norm": 0.0632835254073143,
      "learning_rate": 0.00029872373384882153,
      "loss": 0.4583,
      "step": 3580
    },
    {
      "gate_value": 0.07477834075689316,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 3580
    },
    {
      "grad_norm": 0.08285976946353912,
      "learning_rate": 0.0002987075404889308,
      "loss": 0.4417,
      "step": 3590
    },
    {
      "gate_value": 0.07480471581220627,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 3590
    },
    {
      "grad_norm": 0.033416613936424255,
      "learning_rate": 0.00029869124548883837,
      "loss": 0.4526,
      "step": 3600
    },
    {
      "gate_value": 0.07461623102426529,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 3600
    },
    {
      "grad_norm": 0.032648663967847824,
      "learning_rate": 0.0002986748488596818,
      "loss": 0.4543,
      "step": 3610
    },
    {
      "gate_value": 0.075102798640728,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 3610
    },
    {
      "grad_norm": 0.036768630146980286,
      "learning_rate": 0.0002986583506126679,
      "loss": 0.4596,
      "step": 3620
    },
    {
      "gate_value": 0.07561081647872925,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 3620
    },
    {
      "grad_norm": 0.08868744969367981,
      "learning_rate": 0.0002986417507590731,
      "loss": 0.4374,
      "step": 3630
    },
    {
      "gate_value": 0.07600181549787521,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 3630
    },
    {
      "grad_norm": 0.042598139494657516,
      "learning_rate": 0.0002986250493102433,
      "loss": 0.4493,
      "step": 3640
    },
    {
      "gate_value": 0.0765034481883049,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 3640
    },
    {
      "grad_norm": 0.07039535045623779,
      "learning_rate": 0.00029860824627759376,
      "loss": 0.4465,
      "step": 3650
    },
    {
      "gate_value": 0.07682037353515625,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 3650
    },
    {
      "grad_norm": 0.06170298904180527,
      "learning_rate": 0.00029859134167260917,
      "loss": 0.4435,
      "step": 3660
    },
    {
      "gate_value": 0.07736477255821228,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 3660
    },
    {
      "grad_norm": 0.07327844947576523,
      "learning_rate": 0.0002985743355068437,
      "loss": 0.4603,
      "step": 3670
    },
    {
      "gate_value": 0.0773945227265358,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 3670
    },
    {
      "grad_norm": 0.0467827171087265,
      "learning_rate": 0.0002985572277919208,
      "loss": 0.4543,
      "step": 3680
    },
    {
      "gate_value": 0.07693289965391159,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 3680
    },
    {
      "grad_norm": 0.10239314287900925,
      "learning_rate": 0.0002985400185395336,
      "loss": 0.452,
      "step": 3690
    },
    {
      "gate_value": 0.07675682753324509,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 3690
    },
    {
      "grad_norm": 0.032947756350040436,
      "learning_rate": 0.00029852270776144435,
      "loss": 0.4479,
      "step": 3700
    },
    {
      "gate_value": 0.0769607424736023,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 3700
    },
    {
      "grad_norm": 0.05921289697289467,
      "learning_rate": 0.00029850529546948483,
      "loss": 0.4503,
      "step": 3710
    },
    {
      "gate_value": 0.07716976851224899,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 3710
    },
    {
      "grad_norm": 0.1941101998090744,
      "learning_rate": 0.0002984877816755562,
      "loss": 0.438,
      "step": 3720
    },
    {
      "gate_value": 0.07710455358028412,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 3720
    },
    {
      "grad_norm": 0.13056175410747528,
      "learning_rate": 0.0002984701663916289,
      "loss": 0.4458,
      "step": 3730
    },
    {
      "gate_value": 0.077248215675354,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 3730
    },
    {
      "grad_norm": 0.22387555241584778,
      "learning_rate": 0.0002984524496297429,
      "loss": 0.4501,
      "step": 3740
    },
    {
      "gate_value": 0.07783285528421402,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 3740
    },
    {
      "grad_norm": 0.11868324875831604,
      "learning_rate": 0.0002984346314020074,
      "loss": 0.4433,
      "step": 3750
    },
    {
      "gate_value": 0.07809463143348694,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 3750
    },
    {
      "grad_norm": 0.1095101460814476,
      "learning_rate": 0.000298416711720601,
      "loss": 0.4435,
      "step": 3760
    },
    {
      "gate_value": 0.07806791365146637,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 3760
    },
    {
      "grad_norm": 0.0775548443198204,
      "learning_rate": 0.0002983986905977716,
      "loss": 0.4514,
      "step": 3770
    },
    {
      "gate_value": 0.07827196270227432,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 3770
    },
    {
      "grad_norm": 0.04364806041121483,
      "learning_rate": 0.00029838056804583644,
      "loss": 0.4645,
      "step": 3780
    },
    {
      "gate_value": 0.07875936478376389,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 3780
    },
    {
      "grad_norm": 0.04026861488819122,
      "learning_rate": 0.00029836234407718226,
      "loss": 0.4511,
      "step": 3790
    },
    {
      "gate_value": 0.07864929735660553,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 3790
    },
    {
      "grad_norm": 0.10821370035409927,
      "learning_rate": 0.00029834401870426484,
      "loss": 0.4273,
      "step": 3800
    },
    {
      "gate_value": 0.07822828739881516,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 3800
    },
    {
      "grad_norm": 0.056853052228689194,
      "learning_rate": 0.00029832559193960947,
      "loss": 0.4466,
      "step": 3810
    },
    {
      "gate_value": 0.07868239283561707,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 3810
    },
    {
      "grad_norm": 0.12730729579925537,
      "learning_rate": 0.0002983070637958106,
      "loss": 0.4359,
      "step": 3820
    },
    {
      "gate_value": 0.07887009531259537,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 3820
    },
    {
      "grad_norm": 0.061061352491378784,
      "learning_rate": 0.00029828843428553203,
      "loss": 0.4637,
      "step": 3830
    },
    {
      "gate_value": 0.07961258292198181,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 3830
    },
    {
      "grad_norm": 0.03473556786775589,
      "learning_rate": 0.000298269703421507,
      "loss": 0.4454,
      "step": 3840
    },
    {
      "gate_value": 0.07969710230827332,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 3840
    },
    {
      "grad_norm": 0.11743411421775818,
      "learning_rate": 0.0002982508712165377,
      "loss": 0.4597,
      "step": 3850
    },
    {
      "gate_value": 0.07997772842645645,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 3850
    },
    {
      "grad_norm": 0.12020798772573471,
      "learning_rate": 0.0002982319376834959,
      "loss": 0.4585,
      "step": 3860
    },
    {
      "gate_value": 0.08042223006486893,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 3860
    },
    {
      "grad_norm": 0.14048640429973602,
      "learning_rate": 0.0002982129028353224,
      "loss": 0.4577,
      "step": 3870
    },
    {
      "gate_value": 0.08117994666099548,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 3870
    },
    {
      "grad_norm": 0.08375728875398636,
      "learning_rate": 0.0002981937666850274,
      "loss": 0.4393,
      "step": 3880
    },
    {
      "gate_value": 0.08134108036756516,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 3880
    },
    {
      "grad_norm": 0.15470552444458008,
      "learning_rate": 0.00029817452924569025,
      "loss": 0.4463,
      "step": 3890
    },
    {
      "gate_value": 0.08139047771692276,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 3890
    },
    {
      "grad_norm": 0.05757463350892067,
      "learning_rate": 0.00029815519053045955,
      "loss": 0.4409,
      "step": 3900
    },
    {
      "gate_value": 0.08170637488365173,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 3900
    },
    {
      "grad_norm": 0.10937239229679108,
      "learning_rate": 0.0002981357505525532,
      "loss": 0.4497,
      "step": 3910
    },
    {
      "gate_value": 0.08137400448322296,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 3910
    },
    {
      "grad_norm": 0.07456956058740616,
      "learning_rate": 0.0002981162093252581,
      "loss": 0.4574,
      "step": 3920
    },
    {
      "gate_value": 0.0812961757183075,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 3920
    },
    {
      "grad_norm": 0.14056850969791412,
      "learning_rate": 0.00029809656686193063,
      "loss": 0.4477,
      "step": 3930
    },
    {
      "gate_value": 0.08145935088396072,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 3930
    },
    {
      "grad_norm": 0.13515639305114746,
      "learning_rate": 0.0002980768231759961,
      "loss": 0.4441,
      "step": 3940
    },
    {
      "gate_value": 0.08180077373981476,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 3940
    },
    {
      "grad_norm": 0.07286737859249115,
      "learning_rate": 0.00029805697828094935,
      "loss": 0.4426,
      "step": 3950
    },
    {
      "gate_value": 0.08208008110523224,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 3950
    },
    {
      "grad_norm": 0.02850443497300148,
      "learning_rate": 0.00029803703219035397,
      "loss": 0.4327,
      "step": 3960
    },
    {
      "gate_value": 0.0816754624247551,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 3960
    },
    {
      "grad_norm": 0.02943161129951477,
      "learning_rate": 0.00029801698491784294,
      "loss": 0.439,
      "step": 3970
    },
    {
      "gate_value": 0.08196882903575897,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 3970
    },
    {
      "grad_norm": 0.09917061030864716,
      "learning_rate": 0.0002979968364771185,
      "loss": 0.4544,
      "step": 3980
    },
    {
      "gate_value": 0.08232785761356354,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 3980
    },
    {
      "grad_norm": 0.09027360379695892,
      "learning_rate": 0.0002979765868819518,
      "loss": 0.4497,
      "step": 3990
    },
    {
      "gate_value": 0.08220331370830536,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 3990
    },
    {
      "grad_norm": 0.03193968906998634,
      "learning_rate": 0.0002979562361461834,
      "loss": 0.449,
      "step": 4000
    },
    {
      "gate_value": 0.08218622207641602,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 4000
    },
    {
      "grad_norm": 0.08486006408929825,
      "learning_rate": 0.00029793578428372264,
      "loss": 0.4432,
      "step": 4010
    },
    {
      "gate_value": 0.08235253393650055,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 4010
    },
    {
      "grad_norm": 0.1432875394821167,
      "learning_rate": 0.00029791523130854827,
      "loss": 0.4585,
      "step": 4020
    },
    {
      "gate_value": 0.0827346071600914,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 4020
    },
    {
      "grad_norm": 0.03996672108769417,
      "learning_rate": 0.00029789457723470816,
      "loss": 0.4513,
      "step": 4030
    },
    {
      "gate_value": 0.0829230546951294,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 4030
    },
    {
      "grad_norm": 0.025110652670264244,
      "learning_rate": 0.000297873822076319,
      "loss": 0.4506,
      "step": 4040
    },
    {
      "gate_value": 0.08226954191923141,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 4040
    },
    {
      "grad_norm": 0.10905573517084122,
      "learning_rate": 0.00029785296584756684,
      "loss": 0.4371,
      "step": 4050
    },
    {
      "gate_value": 0.08197779953479767,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 4050
    },
    {
      "grad_norm": 0.11699944734573364,
      "learning_rate": 0.00029783200856270663,
      "loss": 0.4536,
      "step": 4060
    },
    {
      "gate_value": 0.0821978896856308,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 4060
    },
    {
      "grad_norm": 0.02820429392158985,
      "learning_rate": 0.0002978109502360626,
      "loss": 0.4617,
      "step": 4070
    },
    {
      "gate_value": 0.0826956108212471,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 4070
    },
    {
      "grad_norm": 0.07080373913049698,
      "learning_rate": 0.00029778979088202785,
      "loss": 0.4366,
      "step": 4080
    },
    {
      "gate_value": 0.08400825411081314,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 4080
    },
    {
      "grad_norm": 0.046099573373794556,
      "learning_rate": 0.0002977685305150646,
      "loss": 0.4533,
      "step": 4090
    },
    {
      "gate_value": 0.0844913125038147,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 4090
    },
    {
      "grad_norm": 0.03396543487906456,
      "learning_rate": 0.0002977471691497041,
      "loss": 0.4376,
      "step": 4100
    },
    {
      "gate_value": 0.08502555638551712,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 4100
    },
    {
      "grad_norm": 0.14182616770267487,
      "learning_rate": 0.0002977257068005467,
      "loss": 0.4663,
      "step": 4110
    },
    {
      "gate_value": 0.08497641980648041,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 4110
    },
    {
      "grad_norm": 0.03911354020237923,
      "learning_rate": 0.00029770414348226164,
      "loss": 0.4472,
      "step": 4120
    },
    {
      "gate_value": 0.08530503511428833,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 4120
    },
    {
      "grad_norm": 0.12039171159267426,
      "learning_rate": 0.0002976824792095873,
      "loss": 0.444,
      "step": 4130
    },
    {
      "gate_value": 0.08518508821725845,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 4130
    },
    {
      "grad_norm": 0.03984901309013367,
      "learning_rate": 0.000297660713997331,
      "loss": 0.4426,
      "step": 4140
    },
    {
      "gate_value": 0.0854906365275383,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 4140
    },
    {
      "grad_norm": 0.04110978543758392,
      "learning_rate": 0.000297638847860369,
      "loss": 0.44,
      "step": 4150
    },
    {
      "gate_value": 0.08582597225904465,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 4150
    },
    {
      "grad_norm": 0.03553116321563721,
      "learning_rate": 0.00029761688081364663,
      "loss": 0.4409,
      "step": 4160
    },
    {
      "gate_value": 0.08649842441082001,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 4160
    },
    {
      "grad_norm": 0.0862579196691513,
      "learning_rate": 0.0002975948128721782,
      "loss": 0.4589,
      "step": 4170
    },
    {
      "gate_value": 0.08685018122196198,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 4170
    },
    {
      "grad_norm": 0.048685476183891296,
      "learning_rate": 0.0002975726440510469,
      "loss": 0.4296,
      "step": 4180
    },
    {
      "gate_value": 0.08682183921337128,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 4180
    },
    {
      "grad_norm": 0.07612019032239914,
      "learning_rate": 0.0002975503743654049,
      "loss": 0.4376,
      "step": 4190
    },
    {
      "gate_value": 0.0863993689417839,
      "icl_sequence_length": 96,
      "num_contexts": 3,
      "step": 4190
    },
    {
      "grad_norm": 0.07147730886936188,
      "learning_rate": 0.00029752800383047335,
      "loss": 0.4444,
      "step": 4200
    },
    {
      "gate_value": 0.08639863133430481,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 4200
    },
    {
      "grad_norm": 0.12619084119796753,
      "learning_rate": 0.0002975055324615423,
      "loss": 0.432,
      "step": 4210
    },
    {
      "gate_value": 0.08714673668146133,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 4210
    },
    {
      "grad_norm": 0.028788521885871887,
      "learning_rate": 0.00029748296027397065,
      "loss": 0.4538,
      "step": 4220
    },
    {
      "gate_value": 0.08739547431468964,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 4220
    },
    {
      "grad_norm": 0.11368781328201294,
      "learning_rate": 0.0002974602872831864,
      "loss": 0.4531,
      "step": 4230
    },
    {
      "gate_value": 0.08731156587600708,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 4230
    },
    {
      "grad_norm": 0.11662064492702484,
      "learning_rate": 0.00029743751350468626,
      "loss": 0.4487,
      "step": 4240
    },
    {
      "gate_value": 0.08763561397790909,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 4240
    },
    {
      "grad_norm": 0.14818547666072845,
      "learning_rate": 0.0002974146389540358,
      "loss": 0.4344,
      "step": 4250
    },
    {
      "gate_value": 0.08787613362073898,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 4250
    },
    {
      "grad_norm": 0.035743940621614456,
      "learning_rate": 0.0002973916636468698,
      "loss": 0.4496,
      "step": 4260
    },
    {
      "gate_value": 0.08736476302146912,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 4260
    },
    {
      "grad_norm": 0.04561284929513931,
      "learning_rate": 0.00029736858759889137,
      "loss": 0.4432,
      "step": 4270
    },
    {
      "gate_value": 0.08761058002710342,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 4270
    },
    {
      "grad_norm": 0.06140894815325737,
      "learning_rate": 0.000297345410825873,
      "loss": 0.4497,
      "step": 4280
    },
    {
      "gate_value": 0.08794830739498138,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 4280
    },
    {
      "grad_norm": 0.12474972754716873,
      "learning_rate": 0.0002973221333436557,
      "loss": 0.4489,
      "step": 4290
    },
    {
      "gate_value": 0.08841531723737717,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 4290
    },
    {
      "grad_norm": 0.0846974104642868,
      "learning_rate": 0.00029729875516814946,
      "loss": 0.4488,
      "step": 4300
    },
    {
      "gate_value": 0.08881258219480515,
      "icl_sequence_length": 58,
      "num_contexts": 3,
      "step": 4300
    },
    {
      "grad_norm": 0.07251457870006561,
      "learning_rate": 0.000297275276315333,
      "loss": 0.424,
      "step": 4310
    },
    {
      "gate_value": 0.088979572057724,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 4310
    },
    {
      "grad_norm": 0.10169383883476257,
      "learning_rate": 0.00029725169680125385,
      "loss": 0.4563,
      "step": 4320
    },
    {
      "gate_value": 0.0891449972987175,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 4320
    },
    {
      "grad_norm": 0.048810429871082306,
      "learning_rate": 0.00029722801664202843,
      "loss": 0.433,
      "step": 4330
    },
    {
      "gate_value": 0.09007365256547928,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 4330
    },
    {
      "grad_norm": 0.09916849434375763,
      "learning_rate": 0.00029720423585384196,
      "loss": 0.4407,
      "step": 4340
    },
    {
      "gate_value": 0.0902184247970581,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 4340
    },
    {
      "grad_norm": 0.07443194836378098,
      "learning_rate": 0.00029718035445294835,
      "loss": 0.4396,
      "step": 4350
    },
    {
      "gate_value": 0.09020433574914932,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 4350
    },
    {
      "grad_norm": 0.06431616097688675,
      "learning_rate": 0.0002971563724556703,
      "loss": 0.4333,
      "step": 4360
    },
    {
      "gate_value": 0.0905480906367302,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 4360
    },
    {
      "grad_norm": 0.07431776076555252,
      "learning_rate": 0.0002971322898783992,
      "loss": 0.4274,
      "step": 4370
    },
    {
      "gate_value": 0.09058386087417603,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 4370
    },
    {
      "grad_norm": 0.07152583450078964,
      "learning_rate": 0.0002971081067375954,
      "loss": 0.4202,
      "step": 4380
    },
    {
      "gate_value": 0.09066424518823624,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 4380
    },
    {
      "grad_norm": 0.10134628415107727,
      "learning_rate": 0.0002970838230497878,
      "loss": 0.436,
      "step": 4390
    },
    {
      "gate_value": 0.09120754152536392,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 4390
    },
    {
      "grad_norm": 0.03166228160262108,
      "learning_rate": 0.000297059438831574,
      "loss": 0.454,
      "step": 4400
    },
    {
      "gate_value": 0.09206748008728027,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 4400
    },
    {
      "grad_norm": 0.1006300076842308,
      "learning_rate": 0.0002970349540996205,
      "loss": 0.4661,
      "step": 4410
    },
    {
      "gate_value": 0.09231452643871307,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 4410
    },
    {
      "grad_norm": 0.039432503283023834,
      "learning_rate": 0.0002970103688706623,
      "loss": 0.4409,
      "step": 4420
    },
    {
      "gate_value": 0.0917162299156189,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 4420
    },
    {
      "grad_norm": 0.13075602054595947,
      "learning_rate": 0.00029698568316150327,
      "loss": 0.4256,
      "step": 4430
    },
    {
      "gate_value": 0.09100466966629028,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 4430
    },
    {
      "grad_norm": 0.0706099420785904,
      "learning_rate": 0.00029696089698901575,
      "loss": 0.4452,
      "step": 4440
    },
    {
      "gate_value": 0.0907866507768631,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 4440
    },
    {
      "grad_norm": 0.04131682217121124,
      "learning_rate": 0.0002969360103701409,
      "loss": 0.4473,
      "step": 4450
    },
    {
      "gate_value": 0.09110541641712189,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 4450
    },
    {
      "grad_norm": 0.08504730463027954,
      "learning_rate": 0.0002969110233218885,
      "loss": 0.4435,
      "step": 4460
    },
    {
      "gate_value": 0.09132962673902512,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 4460
    },
    {
      "grad_norm": 0.1154944896697998,
      "learning_rate": 0.00029688593586133687,
      "loss": 0.4311,
      "step": 4470
    },
    {
      "gate_value": 0.09130672365427017,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 4470
    },
    {
      "grad_norm": 0.04087964445352554,
      "learning_rate": 0.0002968607480056332,
      "loss": 0.4496,
      "step": 4480
    },
    {
      "gate_value": 0.09184221923351288,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 4480
    },
    {
      "grad_norm": 0.06298123300075531,
      "learning_rate": 0.00029683545977199306,
      "loss": 0.4331,
      "step": 4490
    },
    {
      "gate_value": 0.09272352606058121,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 4490
    },
    {
      "grad_norm": 0.12888146936893463,
      "learning_rate": 0.0002968100711777008,
      "loss": 0.4632,
      "step": 4500
    },
    {
      "gate_value": 0.0925886407494545,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 4500
    },
    {
      "grad_norm": 0.04741204157471657,
      "learning_rate": 0.0002967845822401091,
      "loss": 0.4255,
      "step": 4510
    },
    {
      "gate_value": 0.0922568216919899,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 4510
    },
    {
      "grad_norm": 0.0282837375998497,
      "learning_rate": 0.00029675899297663965,
      "loss": 0.4332,
      "step": 4520
    },
    {
      "gate_value": 0.09246893227100372,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 4520
    },
    {
      "grad_norm": 0.044491998851299286,
      "learning_rate": 0.00029673330340478234,
      "loss": 0.44,
      "step": 4530
    },
    {
      "gate_value": 0.09226234257221222,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 4530
    },
    {
      "grad_norm": 0.0732334777712822,
      "learning_rate": 0.0002967075135420957,
      "loss": 0.4077,
      "step": 4540
    },
    {
      "gate_value": 0.09291594475507736,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 4540
    },
    {
      "grad_norm": 0.16014868021011353,
      "learning_rate": 0.00029668162340620695,
      "loss": 0.4489,
      "step": 4550
    },
    {
      "gate_value": 0.09284153580665588,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 4550
    },
    {
      "grad_norm": 0.19823066890239716,
      "learning_rate": 0.00029665563301481174,
      "loss": 0.4326,
      "step": 4560
    },
    {
      "gate_value": 0.09242615848779678,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 4560
    },
    {
      "grad_norm": 0.1002328023314476,
      "learning_rate": 0.00029662954238567427,
      "loss": 0.4508,
      "step": 4570
    },
    {
      "gate_value": 0.09263605624437332,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 4570
    },
    {
      "grad_norm": 0.03767078369855881,
      "learning_rate": 0.00029660335153662717,
      "loss": 0.4481,
      "step": 4580
    },
    {
      "gate_value": 0.09308427572250366,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 4580
    },
    {
      "grad_norm": 0.055940892547369,
      "learning_rate": 0.0002965770604855717,
      "loss": 0.4429,
      "step": 4590
    },
    {
      "gate_value": 0.09287799894809723,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 4590
    },
    {
      "grad_norm": 0.06824788451194763,
      "learning_rate": 0.00029655066925047754,
      "loss": 0.4463,
      "step": 4600
    },
    {
      "gate_value": 0.09253347665071487,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 4600
    },
    {
      "grad_norm": 0.11449204385280609,
      "learning_rate": 0.0002965241778493828,
      "loss": 0.4275,
      "step": 4610
    },
    {
      "gate_value": 0.09268172085285187,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 4610
    },
    {
      "grad_norm": 0.13200627267360687,
      "learning_rate": 0.0002964975863003942,
      "loss": 0.4334,
      "step": 4620
    },
    {
      "gate_value": 0.09307952225208282,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 4620
    },
    {
      "grad_norm": 0.219753697514534,
      "learning_rate": 0.0002964708946216867,
      "loss": 0.457,
      "step": 4630
    },
    {
      "gate_value": 0.09279076755046844,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 4630
    },
    {
      "grad_norm": 0.07256842404603958,
      "learning_rate": 0.00029644410283150393,
      "loss": 0.4342,
      "step": 4640
    },
    {
      "gate_value": 0.09255042672157288,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 4640
    },
    {
      "grad_norm": 0.03685823827981949,
      "learning_rate": 0.00029641721094815764,
      "loss": 0.4533,
      "step": 4650
    },
    {
      "gate_value": 0.09305860102176666,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 4650
    },
    {
      "grad_norm": 0.04822506383061409,
      "learning_rate": 0.0002963902189900284,
      "loss": 0.4521,
      "step": 4660
    },
    {
      "gate_value": 0.093348428606987,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 4660
    },
    {
      "grad_norm": 0.044017307460308075,
      "learning_rate": 0.00029636312697556484,
      "loss": 0.4566,
      "step": 4670
    },
    {
      "gate_value": 0.09334082901477814,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 4670
    },
    {
      "grad_norm": 0.07033849507570267,
      "learning_rate": 0.0002963359349232841,
      "loss": 0.4526,
      "step": 4680
    },
    {
      "gate_value": 0.09388935565948486,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 4680
    },
    {
      "grad_norm": 0.037495627999305725,
      "learning_rate": 0.00029630864285177166,
      "loss": 0.43,
      "step": 4690
    },
    {
      "gate_value": 0.09439925849437714,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 4690
    },
    {
      "grad_norm": 0.040242332965135574,
      "learning_rate": 0.0002962812507796815,
      "loss": 0.4209,
      "step": 4700
    },
    {
      "gate_value": 0.09456100314855576,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 4700
    },
    {
      "grad_norm": 0.12894514203071594,
      "learning_rate": 0.0002962537587257358,
      "loss": 0.4321,
      "step": 4710
    },
    {
      "gate_value": 0.09533104300498962,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 4710
    },
    {
      "grad_norm": 0.08391169458627701,
      "learning_rate": 0.0002962261667087251,
      "loss": 0.4371,
      "step": 4720
    },
    {
      "gate_value": 0.09611310064792633,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 4720
    },
    {
      "grad_norm": 0.08664949983358383,
      "learning_rate": 0.00029619847474750825,
      "loss": 0.4519,
      "step": 4730
    },
    {
      "gate_value": 0.09609542787075043,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 4730
    },
    {
      "grad_norm": 0.08422094583511353,
      "learning_rate": 0.0002961706828610125,
      "loss": 0.441,
      "step": 4740
    },
    {
      "gate_value": 0.09666085988283157,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 4740
    },
    {
      "grad_norm": 0.10419321805238724,
      "learning_rate": 0.00029614279106823327,
      "loss": 0.4484,
      "step": 4750
    },
    {
      "gate_value": 0.09734027087688446,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 4750
    },
    {
      "grad_norm": 0.07651486992835999,
      "learning_rate": 0.0002961147993882344,
      "loss": 0.4208,
      "step": 4760
    },
    {
      "gate_value": 0.09778045862913132,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 4760
    },
    {
      "grad_norm": 0.030980784446001053,
      "learning_rate": 0.0002960867078401479,
      "loss": 0.4397,
      "step": 4770
    },
    {
      "gate_value": 0.09714986383914948,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 4770
    },
    {
      "grad_norm": 0.14434650540351868,
      "learning_rate": 0.0002960585164431742,
      "loss": 0.4332,
      "step": 4780
    },
    {
      "gate_value": 0.09717854857444763,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 4780
    },
    {
      "grad_norm": 0.042256616055965424,
      "learning_rate": 0.00029603022521658174,
      "loss": 0.4431,
      "step": 4790
    },
    {
      "gate_value": 0.09656146913766861,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 4790
    },
    {
      "grad_norm": 0.06562954187393188,
      "learning_rate": 0.0002960018341797073,
      "loss": 0.4455,
      "step": 4800
    },
    {
      "gate_value": 0.09680372476577759,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 4800
    },
    {
      "grad_norm": 0.0575077123939991,
      "learning_rate": 0.0002959733433519559,
      "loss": 0.4349,
      "step": 4810
    },
    {
      "gate_value": 0.09677128493785858,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 4810
    },
    {
      "grad_norm": 0.04724402353167534,
      "learning_rate": 0.0002959447527528008,
      "loss": 0.4317,
      "step": 4820
    },
    {
      "gate_value": 0.0971720814704895,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 4820
    },
    {
      "grad_norm": 0.08383426070213318,
      "learning_rate": 0.00029591606240178336,
      "loss": 0.4491,
      "step": 4830
    },
    {
      "gate_value": 0.09781701862812042,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 4830
    },
    {
      "grad_norm": 0.045405976474285126,
      "learning_rate": 0.00029588727231851317,
      "loss": 0.4264,
      "step": 4840
    },
    {
      "gate_value": 0.09881711006164551,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 4840
    },
    {
      "grad_norm": 0.04647388681769371,
      "learning_rate": 0.00029585838252266797,
      "loss": 0.4422,
      "step": 4850
    },
    {
      "gate_value": 0.09951039403676987,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 4850
    },
    {
      "grad_norm": 0.049380771815776825,
      "learning_rate": 0.0002958293930339937,
      "loss": 0.4484,
      "step": 4860
    },
    {
      "gate_value": 0.09975112974643707,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 4860
    },
    {
      "grad_norm": 0.11173932999372482,
      "learning_rate": 0.00029580030387230436,
      "loss": 0.429,
      "step": 4870
    },
    {
      "gate_value": 0.10008491575717926,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 4870
    },
    {
      "grad_norm": 0.038873374462127686,
      "learning_rate": 0.00029577111505748216,
      "loss": 0.4521,
      "step": 4880
    },
    {
      "gate_value": 0.10003317147493362,
      "icl_sequence_length": 58,
      "num_contexts": 3,
      "step": 4880
    },
    {
      "grad_norm": 0.036797404289245605,
      "learning_rate": 0.00029574182660947735,
      "loss": 0.4587,
      "step": 4890
    },
    {
      "gate_value": 0.10025697946548462,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 4890
    },
    {
      "grad_norm": 0.04378237947821617,
      "learning_rate": 0.00029571243854830835,
      "loss": 0.4413,
      "step": 4900
    },
    {
      "gate_value": 0.10004623234272003,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 4900
    },
    {
      "grad_norm": 0.059293054044246674,
      "learning_rate": 0.00029568295089406154,
      "loss": 0.4358,
      "step": 4910
    },
    {
      "gate_value": 0.10010574758052826,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 4910
    },
    {
      "grad_norm": 0.05134032666683197,
      "learning_rate": 0.00029565336366689146,
      "loss": 0.4418,
      "step": 4920
    },
    {
      "gate_value": 0.10006356984376907,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 4920
    },
    {
      "grad_norm": 0.06267619878053665,
      "learning_rate": 0.00029562367688702084,
      "loss": 0.4357,
      "step": 4930
    },
    {
      "gate_value": 0.09998317062854767,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 4930
    },
    {
      "grad_norm": 0.07684914022684097,
      "learning_rate": 0.0002955938905747402,
      "loss": 0.4332,
      "step": 4940
    },
    {
      "gate_value": 0.10052120685577393,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 4940
    },
    {
      "grad_norm": 0.10818302631378174,
      "learning_rate": 0.00029556400475040813,
      "loss": 0.4445,
      "step": 4950
    },
    {
      "gate_value": 0.10102952271699905,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 4950
    },
    {
      "grad_norm": 0.03710390627384186,
      "learning_rate": 0.0002955340194344515,
      "loss": 0.4484,
      "step": 4960
    },
    {
      "gate_value": 0.10203094780445099,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 4960
    },
    {
      "grad_norm": 0.07414698600769043,
      "learning_rate": 0.00029550393464736484,
      "loss": 0.4415,
      "step": 4970
    },
    {
      "gate_value": 0.10241258144378662,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 4970
    },
    {
      "grad_norm": 0.04050971195101738,
      "learning_rate": 0.0002954737504097109,
      "loss": 0.4406,
      "step": 4980
    },
    {
      "gate_value": 0.10247789323329926,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 4980
    },
    {
      "grad_norm": 0.10987893491983414,
      "learning_rate": 0.00029544346674212026,
      "loss": 0.4274,
      "step": 4990
    },
    {
      "gate_value": 0.10240554064512253,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 4990
    },
    {
      "grad_norm": 0.05541801080107689,
      "learning_rate": 0.0002954130836652916,
      "loss": 0.4386,
      "step": 5000
    },
    {
      "gate_value": 0.1022152230143547,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 5000
    },
    {
      "grad_norm": 0.03097173199057579,
      "learning_rate": 0.00029538260119999133,
      "loss": 0.4319,
      "step": 5010
    },
    {
      "gate_value": 0.10129179060459137,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 5010
    },
    {
      "grad_norm": 0.161233589053154,
      "learning_rate": 0.0002953520193670541,
      "loss": 0.4525,
      "step": 5020
    },
    {
      "gate_value": 0.10048361122608185,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 5020
    },
    {
      "grad_norm": 0.06552668660879135,
      "learning_rate": 0.0002953213381873822,
      "loss": 0.4534,
      "step": 5030
    },
    {
      "gate_value": 0.10061473399400711,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 5030
    },
    {
      "grad_norm": 0.08582403510808945,
      "learning_rate": 0.0002952905576819459,
      "loss": 0.4358,
      "step": 5040
    },
    {
      "gate_value": 0.10137443244457245,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 5040
    },
    {
      "grad_norm": 0.046412449330091476,
      "learning_rate": 0.00029525967787178347,
      "loss": 0.4432,
      "step": 5050
    },
    {
      "gate_value": 0.10161323845386505,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 5050
    },
    {
      "grad_norm": 0.10222003608942032,
      "learning_rate": 0.00029522869877800093,
      "loss": 0.442,
      "step": 5060
    },
    {
      "gate_value": 0.1019650399684906,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 5060
    },
    {
      "grad_norm": 0.06821785122156143,
      "learning_rate": 0.00029519762042177225,
      "loss": 0.4388,
      "step": 5070
    },
    {
      "gate_value": 0.10277073085308075,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 5070
    },
    {
      "grad_norm": 0.050177618861198425,
      "learning_rate": 0.0002951664428243391,
      "loss": 0.4466,
      "step": 5080
    },
    {
      "gate_value": 0.1031922847032547,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 5080
    },
    {
      "grad_norm": 0.04561970382928848,
      "learning_rate": 0.00029513516600701106,
      "loss": 0.4523,
      "step": 5090
    },
    {
      "gate_value": 0.10391707718372345,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 5090
    },
    {
      "grad_norm": 0.04120245203375816,
      "learning_rate": 0.0002951037899911657,
      "loss": 0.4512,
      "step": 5100
    },
    {
      "gate_value": 0.10400953143835068,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 5100
    },
    {
      "grad_norm": 0.03396369889378548,
      "learning_rate": 0.0002950723147982481,
      "loss": 0.4368,
      "step": 5110
    },
    {
      "gate_value": 0.10385365039110184,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 5110
    },
    {
      "grad_norm": 0.041298843920230865,
      "learning_rate": 0.0002950407404497712,
      "loss": 0.428,
      "step": 5120
    },
    {
      "gate_value": 0.1042807549238205,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 5120
    },
    {
      "grad_norm": 0.05203133821487427,
      "learning_rate": 0.00029500906696731596,
      "loss": 0.4465,
      "step": 5130
    },
    {
      "gate_value": 0.1041208803653717,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 5130
    },
    {
      "grad_norm": 0.08747325837612152,
      "learning_rate": 0.0002949772943725307,
      "loss": 0.4433,
      "step": 5140
    },
    {
      "gate_value": 0.1040065661072731,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 5140
    },
    {
      "grad_norm": 0.13491439819335938,
      "learning_rate": 0.00029494542268713184,
      "loss": 0.4409,
      "step": 5150
    },
    {
      "gate_value": 0.10431725531816483,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 5150
    },
    {
      "grad_norm": 0.11412998288869858,
      "learning_rate": 0.00029491345193290337,
      "loss": 0.4416,
      "step": 5160
    },
    {
      "gate_value": 0.10433990508317947,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 5160
    },
    {
      "grad_norm": 0.05630851164460182,
      "learning_rate": 0.00029488138213169693,
      "loss": 0.4299,
      "step": 5170
    },
    {
      "gate_value": 0.10372508317232132,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 5170
    },
    {
      "grad_norm": 0.13014192879199982,
      "learning_rate": 0.00029484921330543193,
      "loss": 0.4541,
      "step": 5180
    },
    {
      "gate_value": 0.10315015912055969,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 5180
    },
    {
      "grad_norm": 0.10586561262607574,
      "learning_rate": 0.0002948169454760955,
      "loss": 0.4307,
      "step": 5190
    },
    {
      "gate_value": 0.10308712720870972,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 5190
    },
    {
      "grad_norm": 0.06097865477204323,
      "learning_rate": 0.00029478457866574236,
      "loss": 0.4269,
      "step": 5200
    },
    {
      "gate_value": 0.10377305746078491,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 5200
    },
    {
      "grad_norm": 0.07691788673400879,
      "learning_rate": 0.000294752112896495,
      "loss": 0.4395,
      "step": 5210
    },
    {
      "gate_value": 0.10425736010074615,
      "icl_sequence_length": 96,
      "num_contexts": 3,
      "step": 5210
    },
    {
      "grad_norm": 0.10720237344503403,
      "learning_rate": 0.00029471954819054334,
      "loss": 0.4359,
      "step": 5220
    },
    {
      "gate_value": 0.10501399636268616,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 5220
    },
    {
      "grad_norm": 0.034056954085826874,
      "learning_rate": 0.0002946868845701451,
      "loss": 0.4315,
      "step": 5230
    },
    {
      "gate_value": 0.10612098127603531,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 5230
    },
    {
      "grad_norm": 0.07588791847229004,
      "learning_rate": 0.00029465412205762566,
      "loss": 0.4372,
      "step": 5240
    },
    {
      "gate_value": 0.10589048266410828,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 5240
    },
    {
      "grad_norm": 0.06615625321865082,
      "learning_rate": 0.0002946212606753777,
      "loss": 0.4227,
      "step": 5250
    },
    {
      "gate_value": 0.10498031228780746,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 5250
    },
    {
      "grad_norm": 0.08545836061239243,
      "learning_rate": 0.00029458830044586185,
      "loss": 0.4582,
      "step": 5260
    },
    {
      "gate_value": 0.10491736233234406,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 5260
    },
    {
      "grad_norm": 0.03687456622719765,
      "learning_rate": 0.000294555241391606,
      "loss": 0.451,
      "step": 5270
    },
    {
      "gate_value": 0.10536529868841171,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 5270
    },
    {
      "grad_norm": 0.051188874989748,
      "learning_rate": 0.00029452208353520574,
      "loss": 0.4175,
      "step": 5280
    },
    {
      "gate_value": 0.10574186593294144,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 5280
    },
    {
      "grad_norm": 0.13268040120601654,
      "learning_rate": 0.0002944888268993241,
      "loss": 0.428,
      "step": 5290
    },
    {
      "gate_value": 0.10658949613571167,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 5290
    },
    {
      "grad_norm": 0.04050949588418007,
      "learning_rate": 0.00029445547150669176,
      "loss": 0.4307,
      "step": 5300
    },
    {
      "gate_value": 0.10709504038095474,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 5300
    },
    {
      "grad_norm": 0.0831289142370224,
      "learning_rate": 0.0002944220173801068,
      "loss": 0.4574,
      "step": 5310
    },
    {
      "gate_value": 0.10686801373958588,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 5310
    },
    {
      "grad_norm": 0.10428035259246826,
      "learning_rate": 0.00029438846454243477,
      "loss": 0.4388,
      "step": 5320
    },
    {
      "gate_value": 0.10688181966543198,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 5320
    },
    {
      "grad_norm": 0.08142989128828049,
      "learning_rate": 0.00029435481301660866,
      "loss": 0.4444,
      "step": 5330
    },
    {
      "gate_value": 0.10656416416168213,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 5330
    },
    {
      "grad_norm": 0.13853947818279266,
      "learning_rate": 0.0002943210628256291,
      "loss": 0.4314,
      "step": 5340
    },
    {
      "gate_value": 0.10676518827676773,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 5340
    },
    {
      "grad_norm": 0.0938250795006752,
      "learning_rate": 0.00029428721399256397,
      "loss": 0.4205,
      "step": 5350
    },
    {
      "gate_value": 0.10675190389156342,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 5350
    },
    {
      "grad_norm": 0.11818115413188934,
      "learning_rate": 0.0002942532665405486,
      "loss": 0.4318,
      "step": 5360
    },
    {
      "gate_value": 0.10718467086553574,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 5360
    },
    {
      "grad_norm": 0.12013489753007889,
      "learning_rate": 0.0002942192204927858,
      "loss": 0.4316,
      "step": 5370
    },
    {
      "gate_value": 0.10709141939878464,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 5370
    },
    {
      "grad_norm": 0.08047773689031601,
      "learning_rate": 0.0002941850758725457,
      "loss": 0.4303,
      "step": 5380
    },
    {
      "gate_value": 0.10777509957551956,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 5380
    },
    {
      "grad_norm": 0.11209923028945923,
      "learning_rate": 0.0002941508327031658,
      "loss": 0.4221,
      "step": 5390
    },
    {
      "gate_value": 0.1086001768708229,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 5390
    },
    {
      "grad_norm": 0.08195088803768158,
      "learning_rate": 0.00029411649100805103,
      "loss": 0.4627,
      "step": 5400
    },
    {
      "gate_value": 0.1096612736582756,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 5400
    },
    {
      "grad_norm": 0.05236297845840454,
      "learning_rate": 0.0002940820508106735,
      "loss": 0.4388,
      "step": 5410
    },
    {
      "gate_value": 0.10971608757972717,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 5410
    },
    {
      "grad_norm": 0.0675286278128624,
      "learning_rate": 0.00029404751213457295,
      "loss": 0.4144,
      "step": 5420
    },
    {
      "gate_value": 0.1095128282904625,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 5420
    },
    {
      "grad_norm": 0.05094461515545845,
      "learning_rate": 0.00029401287500335614,
      "loss": 0.4636,
      "step": 5430
    },
    {
      "gate_value": 0.10925433784723282,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 5430
    },
    {
      "grad_norm": 0.03384535387158394,
      "learning_rate": 0.00029397813944069724,
      "loss": 0.4379,
      "step": 5440
    },
    {
      "gate_value": 0.1095501184463501,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 5440
    },
    {
      "grad_norm": 0.055499665439128876,
      "learning_rate": 0.0002939433054703376,
      "loss": 0.4255,
      "step": 5450
    },
    {
      "gate_value": 0.1098218709230423,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 5450
    },
    {
      "grad_norm": 0.07142087072134018,
      "learning_rate": 0.00029390837311608605,
      "loss": 0.4477,
      "step": 5460
    },
    {
      "gate_value": 0.10990960896015167,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 5460
    },
    {
      "grad_norm": 0.08653406798839569,
      "learning_rate": 0.0002938733424018184,
      "loss": 0.4494,
      "step": 5470
    },
    {
      "gate_value": 0.10960792005062103,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 5470
    },
    {
      "grad_norm": 0.10651569813489914,
      "learning_rate": 0.00029383821335147786,
      "loss": 0.4468,
      "step": 5480
    },
    {
      "gate_value": 0.10989756137132645,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 5480
    },
    {
      "grad_norm": 0.036602914333343506,
      "learning_rate": 0.00029380298598907485,
      "loss": 0.4284,
      "step": 5490
    },
    {
      "gate_value": 0.10953273624181747,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 5490
    },
    {
      "grad_norm": 0.04981496185064316,
      "learning_rate": 0.00029376766033868684,
      "loss": 0.4464,
      "step": 5500
    },
    {
      "gate_value": 0.10933338105678558,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 5500
    },
    {
      "grad_norm": 0.03607625514268875,
      "learning_rate": 0.0002937322364244587,
      "loss": 0.4399,
      "step": 5510
    },
    {
      "gate_value": 0.11001531779766083,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 5510
    },
    {
      "grad_norm": 0.048102978616952896,
      "learning_rate": 0.0002936967142706022,
      "loss": 0.4351,
      "step": 5520
    },
    {
      "gate_value": 0.1107465997338295,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 5520
    },
    {
      "grad_norm": 0.07479461282491684,
      "learning_rate": 0.00029366109390139655,
      "loss": 0.4458,
      "step": 5530
    },
    {
      "gate_value": 0.11124894767999649,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 5530
    },
    {
      "grad_norm": 0.07189257442951202,
      "learning_rate": 0.00029362537534118787,
      "loss": 0.4433,
      "step": 5540
    },
    {
      "gate_value": 0.11063691973686218,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 5540
    },
    {
      "grad_norm": 0.047667935490608215,
      "learning_rate": 0.00029358955861438936,
      "loss": 0.4475,
      "step": 5550
    },
    {
      "gate_value": 0.11002617329359055,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 5550
    },
    {
      "grad_norm": 0.0924759954214096,
      "learning_rate": 0.00029355364374548156,
      "loss": 0.4446,
      "step": 5560
    },
    {
      "gate_value": 0.10976750403642654,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 5560
    },
    {
      "grad_norm": 0.06293515115976334,
      "learning_rate": 0.0002935176307590119,
      "loss": 0.4346,
      "step": 5570
    },
    {
      "gate_value": 0.10967667400836945,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 5570
    },
    {
      "grad_norm": 0.06401614844799042,
      "learning_rate": 0.0002934815196795949,
      "loss": 0.4189,
      "step": 5580
    },
    {
      "gate_value": 0.1099473237991333,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 5580
    },
    {
      "grad_norm": 0.09545626491308212,
      "learning_rate": 0.0002934453105319121,
      "loss": 0.4354,
      "step": 5590
    },
    {
      "gate_value": 0.11013666540384293,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 5590
    },
    {
      "grad_norm": 0.04125199839472771,
      "learning_rate": 0.0002934090033407122,
      "loss": 0.4525,
      "step": 5600
    },
    {
      "gate_value": 0.1106431782245636,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 5600
    },
    {
      "grad_norm": 0.04976991191506386,
      "learning_rate": 0.0002933725981308108,
      "loss": 0.436,
      "step": 5610
    },
    {
      "gate_value": 0.11142835021018982,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 5610
    },
    {
      "grad_norm": 0.08596864342689514,
      "learning_rate": 0.0002933360949270905,
      "loss": 0.4333,
      "step": 5620
    },
    {
      "gate_value": 0.11180674284696579,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 5620
    },
    {
      "grad_norm": 0.06628672033548355,
      "learning_rate": 0.0002932994937545009,
      "loss": 0.4294,
      "step": 5630
    },
    {
      "gate_value": 0.11271046847105026,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 5630
    },
    {
      "grad_norm": 0.046628884971141815,
      "learning_rate": 0.0002932627946380585,
      "loss": 0.4305,
      "step": 5640
    },
    {
      "gate_value": 0.11313065141439438,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 5640
    },
    {
      "grad_norm": 0.04577811062335968,
      "learning_rate": 0.0002932259976028469,
      "loss": 0.438,
      "step": 5650
    },
    {
      "gate_value": 0.11235907673835754,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 5650
    },
    {
      "grad_norm": 0.09892956912517548,
      "learning_rate": 0.0002931891026740165,
      "loss": 0.4443,
      "step": 5660
    },
    {
      "gate_value": 0.1117844209074974,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 5660
    },
    {
      "grad_norm": 0.04517688229680061,
      "learning_rate": 0.00029315210987678457,
      "loss": 0.4508,
      "step": 5670
    },
    {
      "gate_value": 0.11167077720165253,
      "icl_sequence_length": 96,
      "num_contexts": 3,
      "step": 5670
    },
    {
      "grad_norm": 0.08779658377170563,
      "learning_rate": 0.0002931150192364354,
      "loss": 0.4537,
      "step": 5680
    },
    {
      "gate_value": 0.11213234066963196,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 5680
    },
    {
      "grad_norm": 0.13340982794761658,
      "learning_rate": 0.00029307783077832004,
      "loss": 0.4249,
      "step": 5690
    },
    {
      "gate_value": 0.11261526495218277,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 5690
    },
    {
      "grad_norm": 0.052928730845451355,
      "learning_rate": 0.0002930405445278565,
      "loss": 0.4211,
      "step": 5700
    },
    {
      "gate_value": 0.1126897856593132,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 5700
    },
    {
      "grad_norm": 0.06828558444976807,
      "learning_rate": 0.0002930031605105296,
      "loss": 0.4366,
      "step": 5710
    },
    {
      "gate_value": 0.11276299506425858,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 5710
    },
    {
      "grad_norm": 0.04455326870083809,
      "learning_rate": 0.0002929656787518909,
      "loss": 0.4332,
      "step": 5720
    },
    {
      "gate_value": 0.11292887479066849,
      "icl_sequence_length": 58,
      "num_contexts": 3,
      "step": 5720
    },
    {
      "grad_norm": 0.14097444713115692,
      "learning_rate": 0.00029292809927755886,
      "loss": 0.4324,
      "step": 5730
    },
    {
      "gate_value": 0.11304690688848495,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 5730
    },
    {
      "grad_norm": 0.03994961827993393,
      "learning_rate": 0.00029289042211321875,
      "loss": 0.4259,
      "step": 5740
    },
    {
      "gate_value": 0.11258487403392792,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 5740
    },
    {
      "grad_norm": 0.03657664358615875,
      "learning_rate": 0.0002928526472846224,
      "loss": 0.4258,
      "step": 5750
    },
    {
      "gate_value": 0.1123301312327385,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 5750
    },
    {
      "grad_norm": 0.07616100460290909,
      "learning_rate": 0.00029281477481758874,
      "loss": 0.4223,
      "step": 5760
    },
    {
      "gate_value": 0.11309382319450378,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 5760
    },
    {
      "grad_norm": 0.08438175171613693,
      "learning_rate": 0.0002927768047380031,
      "loss": 0.4414,
      "step": 5770
    },
    {
      "gate_value": 0.11341430991888046,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 5770
    },
    {
      "grad_norm": 0.03430478647351265,
      "learning_rate": 0.00029273873707181777,
      "loss": 0.4171,
      "step": 5780
    },
    {
      "gate_value": 0.11293953657150269,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 5780
    },
    {
      "grad_norm": 0.08205977082252502,
      "learning_rate": 0.0002927005718450516,
      "loss": 0.4264,
      "step": 5790
    },
    {
      "gate_value": 0.11254054307937622,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 5790
    },
    {
      "grad_norm": 0.07341016829013824,
      "learning_rate": 0.0002926623090837901,
      "loss": 0.4223,
      "step": 5800
    },
    {
      "gate_value": 0.11329421401023865,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 5800
    },
    {
      "grad_norm": 0.06489834934473038,
      "learning_rate": 0.00029262394881418563,
      "loss": 0.4621,
      "step": 5810
    },
    {
      "gate_value": 0.11428499221801758,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 5810
    },
    {
      "grad_norm": 0.1126764789223671,
      "learning_rate": 0.00029258549106245697,
      "loss": 0.4218,
      "step": 5820
    },
    {
      "gate_value": 0.11484973132610321,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 5820
    },
    {
      "grad_norm": 0.04321017116308212,
      "learning_rate": 0.0002925469358548897,
      "loss": 0.4216,
      "step": 5830
    },
    {
      "gate_value": 0.11467601358890533,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 5830
    },
    {
      "grad_norm": 0.12950527667999268,
      "learning_rate": 0.0002925082832178359,
      "loss": 0.4205,
      "step": 5840
    },
    {
      "gate_value": 0.11512420326471329,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 5840
    },
    {
      "grad_norm": 0.03990564122796059,
      "learning_rate": 0.0002924695331777142,
      "loss": 0.4191,
      "step": 5850
    },
    {
      "gate_value": 0.11604516208171844,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 5850
    },
    {
      "grad_norm": 0.03825264424085617,
      "learning_rate": 0.00029243068576101014,
      "loss": 0.4208,
      "step": 5860
    },
    {
      "gate_value": 0.1162797287106514,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 5860
    },
    {
      "grad_norm": 0.16146664321422577,
      "learning_rate": 0.0002923917409942753,
      "loss": 0.4407,
      "step": 5870
    },
    {
      "gate_value": 0.11613252013921738,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 5870
    },
    {
      "grad_norm": 0.06876587122678757,
      "learning_rate": 0.0002923526989041282,
      "loss": 0.4334,
      "step": 5880
    },
    {
      "gate_value": 0.1163325235247612,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 5880
    },
    {
      "grad_norm": 0.12074162811040878,
      "learning_rate": 0.0002923135595172537,
      "loss": 0.4231,
      "step": 5890
    },
    {
      "gate_value": 0.11657026410102844,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 5890
    },
    {
      "grad_norm": 0.04003237560391426,
      "learning_rate": 0.0002922743228604032,
      "loss": 0.4234,
      "step": 5900
    },
    {
      "gate_value": 0.11663009226322174,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 5900
    },
    {
      "grad_norm": 0.04541704058647156,
      "learning_rate": 0.0002922349889603946,
      "loss": 0.4436,
      "step": 5910
    },
    {
      "gate_value": 0.11671672016382217,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 5910
    },
    {
      "grad_norm": 0.06754721701145172,
      "learning_rate": 0.00029219555784411224,
      "loss": 0.4353,
      "step": 5920
    },
    {
      "gate_value": 0.1171480193734169,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 5920
    },
    {
      "grad_norm": 0.04595010727643967,
      "learning_rate": 0.0002921560295385069,
      "loss": 0.4295,
      "step": 5930
    },
    {
      "gate_value": 0.11806542426347733,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 5930
    },
    {
      "grad_norm": 0.04498855024576187,
      "learning_rate": 0.00029211640407059586,
      "loss": 0.437,
      "step": 5940
    },
    {
      "gate_value": 0.11824111640453339,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 5940
    },
    {
      "grad_norm": 0.07534932345151901,
      "learning_rate": 0.0002920766814674627,
      "loss": 0.4199,
      "step": 5950
    },
    {
      "gate_value": 0.11814270913600922,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 5950
    },
    {
      "grad_norm": 0.05595272779464722,
      "learning_rate": 0.00029203686175625747,
      "loss": 0.438,
      "step": 5960
    },
    {
      "gate_value": 0.11774881929159164,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 5960
    },
    {
      "grad_norm": 0.0870402529835701,
      "learning_rate": 0.0002919969449641965,
      "loss": 0.4298,
      "step": 5970
    },
    {
      "gate_value": 0.1175171509385109,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 5970
    },
    {
      "grad_norm": 0.03920748084783554,
      "learning_rate": 0.00029195693111856263,
      "loss": 0.4324,
      "step": 5980
    },
    {
      "gate_value": 0.11840686947107315,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 5980
    },
    {
      "grad_norm": 0.10713187605142593,
      "learning_rate": 0.00029191682024670495,
      "loss": 0.4407,
      "step": 5990
    },
    {
      "gate_value": 0.11843453347682953,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 5990
    },
    {
      "grad_norm": 0.1207190528512001,
      "learning_rate": 0.00029187661237603876,
      "loss": 0.423,
      "step": 6000
    },
    {
      "gate_value": 0.11844220012426376,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 6000
    },
    {
      "grad_norm": 0.049522001296281815,
      "learning_rate": 0.0002918363075340459,
      "loss": 0.4376,
      "step": 6010
    },
    {
      "gate_value": 0.11861016601324081,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 6010
    },
    {
      "grad_norm": 0.05734136328101158,
      "learning_rate": 0.00029179590574827426,
      "loss": 0.4061,
      "step": 6020
    },
    {
      "gate_value": 0.11852915585041046,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 6020
    },
    {
      "grad_norm": 0.09304312616586685,
      "learning_rate": 0.00029175540704633803,
      "loss": 0.4322,
      "step": 6030
    },
    {
      "gate_value": 0.11866278946399689,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 6030
    },
    {
      "grad_norm": 0.05417775362730026,
      "learning_rate": 0.00029171481145591786,
      "loss": 0.422,
      "step": 6040
    },
    {
      "gate_value": 0.11929836869239807,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 6040
    },
    {
      "grad_norm": 0.07325364649295807,
      "learning_rate": 0.00029167411900476027,
      "loss": 0.4098,
      "step": 6050
    },
    {
      "gate_value": 0.11958464235067368,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 6050
    },
    {
      "grad_norm": 0.10998177528381348,
      "learning_rate": 0.0002916333297206783,
      "loss": 0.424,
      "step": 6060
    },
    {
      "gate_value": 0.11943252384662628,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 6060
    },
    {
      "grad_norm": 0.07327587902545929,
      "learning_rate": 0.00029159244363155095,
      "loss": 0.4154,
      "step": 6070
    },
    {
      "gate_value": 0.11973727494478226,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 6070
    },
    {
      "grad_norm": 0.047259047627449036,
      "learning_rate": 0.0002915514607653235,
      "loss": 0.4261,
      "step": 6080
    },
    {
      "gate_value": 0.12109141051769257,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 6080
    },
    {
      "grad_norm": 0.051075082272291183,
      "learning_rate": 0.0002915103811500074,
      "loss": 0.4125,
      "step": 6090
    },
    {
      "gate_value": 0.12103540450334549,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 6090
    },
    {
      "grad_norm": 0.055391453206539154,
      "learning_rate": 0.00029146920481368016,
      "loss": 0.4215,
      "step": 6100
    },
    {
      "gate_value": 0.12145233154296875,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 6100
    },
    {
      "grad_norm": 0.07134224474430084,
      "learning_rate": 0.0002914279317844854,
      "loss": 0.4245,
      "step": 6110
    },
    {
      "gate_value": 0.12148108333349228,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 6110
    },
    {
      "grad_norm": 0.16808076202869415,
      "learning_rate": 0.0002913865620906328,
      "loss": 0.4159,
      "step": 6120
    },
    {
      "gate_value": 0.12126853317022324,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 6120
    },
    {
      "grad_norm": 0.22057221829891205,
      "learning_rate": 0.00029134509576039824,
      "loss": 0.4278,
      "step": 6130
    },
    {
      "gate_value": 0.12128766626119614,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 6130
    },
    {
      "grad_norm": 0.09868735820055008,
      "learning_rate": 0.0002913035328221236,
      "loss": 0.4194,
      "step": 6140
    },
    {
      "gate_value": 0.12114259600639343,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 6140
    },
    {
      "grad_norm": 0.08045390993356705,
      "learning_rate": 0.0002912618733042166,
      "loss": 0.4242,
      "step": 6150
    },
    {
      "gate_value": 0.12184974551200867,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 6150
    },
    {
      "grad_norm": 0.04861131310462952,
      "learning_rate": 0.00029122011723515124,
      "loss": 0.435,
      "step": 6160
    },
    {
      "gate_value": 0.1218714788556099,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 6160
    },
    {
      "grad_norm": 0.09223391115665436,
      "learning_rate": 0.00029117826464346736,
      "loss": 0.429,
      "step": 6170
    },
    {
      "gate_value": 0.1220831573009491,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 6170
    },
    {
      "grad_norm": 0.15670324862003326,
      "learning_rate": 0.00029113631555777083,
      "loss": 0.4133,
      "step": 6180
    },
    {
      "gate_value": 0.12261885404586792,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 6180
    },
    {
      "grad_norm": 0.053096942603588104,
      "learning_rate": 0.0002910942700067335,
      "loss": 0.4304,
      "step": 6190
    },
    {
      "gate_value": 0.12274584919214249,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 6190
    },
    {
      "grad_norm": 0.10016176104545593,
      "learning_rate": 0.000291052128019093,
      "loss": 0.4395,
      "step": 6200
    },
    {
      "gate_value": 0.12318729609251022,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 6200
    },
    {
      "grad_norm": 0.21103353798389435,
      "learning_rate": 0.0002910098896236531,
      "loss": 0.4286,
      "step": 6210
    },
    {
      "gate_value": 0.12366920709609985,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 6210
    },
    {
      "grad_norm": 0.08487042039632797,
      "learning_rate": 0.0002909675548492832,
      "loss": 0.4232,
      "step": 6220
    },
    {
      "gate_value": 0.12395383417606354,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 6220
    },
    {
      "grad_norm": 0.055605944246053696,
      "learning_rate": 0.0002909251237249189,
      "loss": 0.4357,
      "step": 6230
    },
    {
      "gate_value": 0.12388145923614502,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 6230
    },
    {
      "grad_norm": 0.09035728126764297,
      "learning_rate": 0.00029088259627956133,
      "loss": 0.4323,
      "step": 6240
    },
    {
      "gate_value": 0.12410308420658112,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 6240
    },
    {
      "grad_norm": 0.05201800540089607,
      "learning_rate": 0.00029083997254227765,
      "loss": 0.4266,
      "step": 6250
    },
    {
      "gate_value": 0.1233508288860321,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 6250
    },
    {
      "grad_norm": 0.061280108988285065,
      "learning_rate": 0.0002907972525422008,
      "loss": 0.4351,
      "step": 6260
    },
    {
      "gate_value": 0.12278716266155243,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 6260
    },
    {
      "grad_norm": 0.05171182006597519,
      "learning_rate": 0.00029075443630852945,
      "loss": 0.421,
      "step": 6270
    },
    {
      "gate_value": 0.12314655631780624,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 6270
    },
    {
      "grad_norm": 0.0474497452378273,
      "learning_rate": 0.00029071152387052815,
      "loss": 0.4292,
      "step": 6280
    },
    {
      "gate_value": 0.12386977672576904,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 6280
    },
    {
      "grad_norm": 0.06959555298089981,
      "learning_rate": 0.0002906685152575271,
      "loss": 0.4106,
      "step": 6290
    },
    {
      "gate_value": 0.12407379597425461,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 6290
    },
    {
      "grad_norm": 0.04566885530948639,
      "learning_rate": 0.00029062541049892227,
      "loss": 0.4241,
      "step": 6300
    },
    {
      "gate_value": 0.1249006986618042,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 6300
    },
    {
      "grad_norm": 0.052750058472156525,
      "learning_rate": 0.0002905822096241754,
      "loss": 0.4299,
      "step": 6310
    },
    {
      "gate_value": 0.12427221238613129,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 6310
    },
    {
      "grad_norm": 0.03595830127596855,
      "learning_rate": 0.0002905389126628139,
      "loss": 0.4322,
      "step": 6320
    },
    {
      "gate_value": 0.12405462563037872,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 6320
    },
    {
      "grad_norm": 0.03769196197390556,
      "learning_rate": 0.0002904955196444307,
      "loss": 0.4129,
      "step": 6330
    },
    {
      "gate_value": 0.12411148101091385,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 6330
    },
    {
      "grad_norm": 0.12260931730270386,
      "learning_rate": 0.0002904520305986847,
      "loss": 0.4202,
      "step": 6340
    },
    {
      "gate_value": 0.12482518702745438,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 6340
    },
    {
      "grad_norm": 0.09134802967309952,
      "learning_rate": 0.00029040844555530015,
      "loss": 0.4243,
      "step": 6350
    },
    {
      "gate_value": 0.12580612301826477,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 6350
    },
    {
      "grad_norm": 0.14725758135318756,
      "learning_rate": 0.00029036476454406704,
      "loss": 0.426,
      "step": 6360
    },
    {
      "gate_value": 0.12593281269073486,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 6360
    },
    {
      "grad_norm": 0.11491762846708298,
      "learning_rate": 0.0002903209875948409,
      "loss": 0.4294,
      "step": 6370
    },
    {
      "gate_value": 0.12502771615982056,
      "icl_sequence_length": 58,
      "num_contexts": 3,
      "step": 6370
    },
    {
      "grad_norm": 0.0558735616505146,
      "learning_rate": 0.0002902771147375429,
      "loss": 0.4401,
      "step": 6380
    },
    {
      "gate_value": 0.12525469064712524,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 6380
    },
    {
      "grad_norm": 0.07469120621681213,
      "learning_rate": 0.0002902331460021597,
      "loss": 0.4246,
      "step": 6390
    },
    {
      "gate_value": 0.12590870261192322,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 6390
    },
    {
      "grad_norm": 0.11152400076389313,
      "learning_rate": 0.00029018908141874354,
      "loss": 0.4326,
      "step": 6400
    },
    {
      "gate_value": 0.12628869712352753,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 6400
    },
    {
      "grad_norm": 0.1487109363079071,
      "learning_rate": 0.0002901449210174122,
      "loss": 0.4229,
      "step": 6410
    },
    {
      "gate_value": 0.12687274813652039,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 6410
    },
    {
      "grad_norm": 0.08015327900648117,
      "learning_rate": 0.00029010066482834874,
      "loss": 0.4251,
      "step": 6420
    },
    {
      "gate_value": 0.12663473188877106,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 6420
    },
    {
      "grad_norm": 0.06717225909233093,
      "learning_rate": 0.00029005631288180197,
      "loss": 0.4367,
      "step": 6430
    },
    {
      "gate_value": 0.12608718872070312,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 6430
    },
    {
      "grad_norm": 0.3730805516242981,
      "learning_rate": 0.000290011865208086,
      "loss": 0.4288,
      "step": 6440
    },
    {
      "gate_value": 0.1260022222995758,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 6440
    },
    {
      "grad_norm": 0.05443867668509483,
      "learning_rate": 0.0002899673218375804,
      "loss": 0.4078,
      "step": 6450
    },
    {
      "gate_value": 0.126522958278656,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 6450
    },
    {
      "grad_norm": 0.06876882910728455,
      "learning_rate": 0.00028992268280073015,
      "loss": 0.4211,
      "step": 6460
    },
    {
      "gate_value": 0.12633737921714783,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 6460
    },
    {
      "grad_norm": 0.035338129848241806,
      "learning_rate": 0.00028987794812804555,
      "loss": 0.4314,
      "step": 6470
    },
    {
      "gate_value": 0.12628173828125,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 6470
    },
    {
      "grad_norm": 0.04577523469924927,
      "learning_rate": 0.00028983311785010237,
      "loss": 0.4155,
      "step": 6480
    },
    {
      "gate_value": 0.12714733183383942,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 6480
    },
    {
      "grad_norm": 0.16886325180530548,
      "learning_rate": 0.0002897881919975417,
      "loss": 0.4331,
      "step": 6490
    },
    {
      "gate_value": 0.1283843070268631,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 6490
    },
    {
      "grad_norm": 0.04098428413271904,
      "learning_rate": 0.00028974317060106997,
      "loss": 0.4384,
      "step": 6500
    },
    {
      "gate_value": 0.12877629697322845,
      "icl_sequence_length": 58,
      "num_contexts": 3,
      "step": 6500
    },
    {
      "grad_norm": 0.04261775314807892,
      "learning_rate": 0.0002896980536914588,
      "loss": 0.4219,
      "step": 6510
    },
    {
      "gate_value": 0.12851965427398682,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 6510
    },
    {
      "grad_norm": 0.09274916350841522,
      "learning_rate": 0.0002896528412995452,
      "loss": 0.4393,
      "step": 6520
    },
    {
      "gate_value": 0.12854821979999542,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 6520
    },
    {
      "grad_norm": 0.04320213571190834,
      "learning_rate": 0.00028960753345623144,
      "loss": 0.4236,
      "step": 6530
    },
    {
      "gate_value": 0.1293092519044876,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 6530
    },
    {
      "grad_norm": 0.07464089244604111,
      "learning_rate": 0.000289562130192485,
      "loss": 0.4226,
      "step": 6540
    },
    {
      "gate_value": 0.12948086857795715,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 6540
    },
    {
      "grad_norm": 0.0393485464155674,
      "learning_rate": 0.0002895166315393385,
      "loss": 0.4291,
      "step": 6550
    },
    {
      "gate_value": 0.1301802545785904,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 6550
    },
    {
      "grad_norm": 0.062148161232471466,
      "learning_rate": 0.00028947103752788994,
      "loss": 0.4134,
      "step": 6560
    },
    {
      "gate_value": 0.13075850903987885,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 6560
    },
    {
      "grad_norm": 0.04033559188246727,
      "learning_rate": 0.00028942534818930237,
      "loss": 0.42,
      "step": 6570
    },
    {
      "gate_value": 0.13082143664360046,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 6570
    },
    {
      "grad_norm": 0.045840464532375336,
      "learning_rate": 0.00028937956355480404,
      "loss": 0.4349,
      "step": 6580
    },
    {
      "gate_value": 0.13069741427898407,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 6580
    },
    {
      "grad_norm": 0.059474021196365356,
      "learning_rate": 0.00028933368365568823,
      "loss": 0.4269,
      "step": 6590
    },
    {
      "gate_value": 0.13073182106018066,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 6590
    },
    {
      "grad_norm": 0.04966716840863228,
      "learning_rate": 0.0002892877085233135,
      "loss": 0.4253,
      "step": 6600
    },
    {
      "gate_value": 0.1308085173368454,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 6600
    },
    {
      "grad_norm": 0.04608655348420143,
      "learning_rate": 0.0002892416381891034,
      "loss": 0.4356,
      "step": 6610
    },
    {
      "gate_value": 0.13023337721824646,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 6610
    },
    {
      "grad_norm": 0.05291171744465828,
      "learning_rate": 0.0002891954726845466,
      "loss": 0.4167,
      "step": 6620
    },
    {
      "gate_value": 0.1300569474697113,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 6620
    },
    {
      "grad_norm": 0.08100378513336182,
      "learning_rate": 0.0002891492120411967,
      "loss": 0.4268,
      "step": 6630
    },
    {
      "gate_value": 0.13093367218971252,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 6630
    },
    {
      "grad_norm": 0.047419607639312744,
      "learning_rate": 0.00028910285629067255,
      "loss": 0.416,
      "step": 6640
    },
    {
      "gate_value": 0.13165777921676636,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 6640
    },
    {
      "grad_norm": 0.34848541021347046,
      "learning_rate": 0.0002890564054646577,
      "loss": 0.4324,
      "step": 6650
    },
    {
      "gate_value": 0.13223788142204285,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 6650
    },
    {
      "grad_norm": 0.05068269744515419,
      "learning_rate": 0.000289009859594901,
      "loss": 0.4246,
      "step": 6660
    },
    {
      "gate_value": 0.1314765065908432,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 6660
    },
    {
      "grad_norm": 0.12157876789569855,
      "learning_rate": 0.00028896321871321604,
      "loss": 0.4196,
      "step": 6670
    },
    {
      "gate_value": 0.13127507269382477,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 6670
    },
    {
      "grad_norm": 0.06361397355794907,
      "learning_rate": 0.0002889164828514814,
      "loss": 0.4387,
      "step": 6680
    },
    {
      "gate_value": 0.1312800794839859,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 6680
    },
    {
      "grad_norm": 0.08461131155490875,
      "learning_rate": 0.00028886965204164065,
      "loss": 0.4159,
      "step": 6690
    },
    {
      "gate_value": 0.13146339356899261,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 6690
    },
    {
      "grad_norm": 0.11955706030130386,
      "learning_rate": 0.0002888227263157022,
      "loss": 0.4184,
      "step": 6700
    },
    {
      "gate_value": 0.13245995342731476,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 6700
    },
    {
      "grad_norm": 0.2205382138490677,
      "learning_rate": 0.00028877570570573936,
      "loss": 0.4241,
      "step": 6710
    },
    {
      "gate_value": 0.13298162817955017,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 6710
    },
    {
      "grad_norm": 0.07158178836107254,
      "learning_rate": 0.0002887285902438902,
      "loss": 0.4351,
      "step": 6720
    },
    {
      "gate_value": 0.13305675983428955,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 6720
    },
    {
      "grad_norm": 0.08344139903783798,
      "learning_rate": 0.0002886813799623578,
      "loss": 0.4173,
      "step": 6730
    },
    {
      "gate_value": 0.132954940199852,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 6730
    },
    {
      "grad_norm": 0.061295922845602036,
      "learning_rate": 0.0002886340748934098,
      "loss": 0.4237,
      "step": 6740
    },
    {
      "gate_value": 0.13344383239746094,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 6740
    },
    {
      "grad_norm": 0.1391419768333435,
      "learning_rate": 0.0002885866750693789,
      "loss": 0.4175,
      "step": 6750
    },
    {
      "gate_value": 0.13365666568279266,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 6750
    },
    {
      "grad_norm": 0.08864652365446091,
      "learning_rate": 0.0002885391805226624,
      "loss": 0.422,
      "step": 6760
    },
    {
      "gate_value": 0.13373038172721863,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 6760
    },
    {
      "grad_norm": 0.04955292120575905,
      "learning_rate": 0.0002884915912857223,
      "loss": 0.3975,
      "step": 6770
    },
    {
      "gate_value": 0.13424073159694672,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 6770
    },
    {
      "grad_norm": 0.11102047562599182,
      "learning_rate": 0.0002884439073910855,
      "loss": 0.4205,
      "step": 6780
    },
    {
      "gate_value": 0.1342160701751709,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 6780
    },
    {
      "grad_norm": 0.09720852971076965,
      "learning_rate": 0.00028839612887134346,
      "loss": 0.4218,
      "step": 6790
    },
    {
      "gate_value": 0.13402903079986572,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 6790
    },
    {
      "grad_norm": 0.04908115044236183,
      "learning_rate": 0.0002883482557591523,
      "loss": 0.4236,
      "step": 6800
    },
    {
      "gate_value": 0.13415250182151794,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 6800
    },
    {
      "grad_norm": 0.072776198387146,
      "learning_rate": 0.00028830028808723285,
      "loss": 0.4377,
      "step": 6810
    },
    {
      "gate_value": 0.1337648630142212,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 6810
    },
    {
      "grad_norm": 0.04838380590081215,
      "learning_rate": 0.00028825222588837063,
      "loss": 0.4093,
      "step": 6820
    },
    {
      "gate_value": 0.13404372334480286,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 6820
    },
    {
      "grad_norm": 0.04791904240846634,
      "learning_rate": 0.0002882040691954157,
      "loss": 0.4106,
      "step": 6830
    },
    {
      "gate_value": 0.13330905139446259,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 6830
    },
    {
      "grad_norm": 0.13278383016586304,
      "learning_rate": 0.0002881558180412826,
      "loss": 0.4299,
      "step": 6840
    },
    {
      "gate_value": 0.13309334218502045,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 6840
    },
    {
      "grad_norm": 0.05918443202972412,
      "learning_rate": 0.0002881074724589506,
      "loss": 0.4255,
      "step": 6850
    },
    {
      "gate_value": 0.13352347910404205,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 6850
    },
    {
      "grad_norm": 0.08290646970272064,
      "learning_rate": 0.00028805903248146344,
      "loss": 0.4237,
      "step": 6860
    },
    {
      "gate_value": 0.1338224560022354,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 6860
    },
    {
      "grad_norm": 0.03945367410778999,
      "learning_rate": 0.00028801049814192945,
      "loss": 0.403,
      "step": 6870
    },
    {
      "gate_value": 0.13464081287384033,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 6870
    },
    {
      "grad_norm": 0.06238861754536629,
      "learning_rate": 0.0002879618694735213,
      "loss": 0.4148,
      "step": 6880
    },
    {
      "gate_value": 0.13544297218322754,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 6880
    },
    {
      "grad_norm": 0.06959348171949387,
      "learning_rate": 0.00028791314650947626,
      "loss": 0.4279,
      "step": 6890
    },
    {
      "gate_value": 0.1356019675731659,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 6890
    },
    {
      "grad_norm": 0.1363728791475296,
      "learning_rate": 0.00028786432928309605,
      "loss": 0.3996,
      "step": 6900
    },
    {
      "gate_value": 0.13566221296787262,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 6900
    },
    {
      "grad_norm": 0.1169843077659607,
      "learning_rate": 0.00028781541782774676,
      "loss": 0.4206,
      "step": 6910
    },
    {
      "gate_value": 0.13622631132602692,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 6910
    },
    {
      "grad_norm": 0.22626587748527527,
      "learning_rate": 0.0002877664121768589,
      "loss": 0.4174,
      "step": 6920
    },
    {
      "gate_value": 0.1364203840494156,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 6920
    },
    {
      "grad_norm": 0.05224921554327011,
      "learning_rate": 0.00028771731236392736,
      "loss": 0.4334,
      "step": 6930
    },
    {
      "gate_value": 0.13636702299118042,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 6930
    },
    {
      "grad_norm": 0.07503295689821243,
      "learning_rate": 0.00028766811842251147,
      "loss": 0.4216,
      "step": 6940
    },
    {
      "gate_value": 0.13606546819210052,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 6940
    },
    {
      "grad_norm": 0.09540563076734543,
      "learning_rate": 0.0002876188303862347,
      "loss": 0.4383,
      "step": 6950
    },
    {
      "gate_value": 0.13603177666664124,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 6950
    },
    {
      "grad_norm": 0.0879683643579483,
      "learning_rate": 0.00028756944828878505,
      "loss": 0.4354,
      "step": 6960
    },
    {
      "gate_value": 0.1359747350215912,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 6960
    },
    {
      "grad_norm": 0.12022677063941956,
      "learning_rate": 0.0002875199721639147,
      "loss": 0.4248,
      "step": 6970
    },
    {
      "gate_value": 0.13565360009670258,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 6970
    },
    {
      "grad_norm": 0.16428843140602112,
      "learning_rate": 0.0002874704020454401,
      "loss": 0.4288,
      "step": 6980
    },
    {
      "gate_value": 0.13588224351406097,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 6980
    },
    {
      "grad_norm": 0.099467933177948,
      "learning_rate": 0.000287420737967242,
      "loss": 0.4192,
      "step": 6990
    },
    {
      "gate_value": 0.13614501059055328,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 6990
    },
    {
      "grad_norm": 0.19383247196674347,
      "learning_rate": 0.00028737097996326533,
      "loss": 0.4215,
      "step": 7000
    },
    {
      "gate_value": 0.1362755447626114,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 7000
    },
    {
      "grad_norm": 0.056437231600284576,
      "learning_rate": 0.0002873211280675191,
      "loss": 0.4208,
      "step": 7010
    },
    {
      "gate_value": 0.13694988191127777,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 7010
    },
    {
      "grad_norm": 0.14202746748924255,
      "learning_rate": 0.0002872711823140768,
      "loss": 0.4161,
      "step": 7020
    },
    {
      "gate_value": 0.13738933205604553,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 7020
    },
    {
      "grad_norm": 0.07661598175764084,
      "learning_rate": 0.0002872211427370756,
      "loss": 0.4003,
      "step": 7030
    },
    {
      "gate_value": 0.13746806979179382,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 7030
    },
    {
      "grad_norm": 0.0649000033736229,
      "learning_rate": 0.00028717100937071744,
      "loss": 0.4194,
      "step": 7040
    },
    {
      "gate_value": 0.13786818087100983,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 7040
    },
    {
      "grad_norm": 0.09948837757110596,
      "learning_rate": 0.0002871207822492678,
      "loss": 0.4135,
      "step": 7050
    },
    {
      "gate_value": 0.13835233449935913,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 7050
    },
    {
      "grad_norm": 0.058601900935173035,
      "learning_rate": 0.0002870704614070564,
      "loss": 0.4143,
      "step": 7060
    },
    {
      "gate_value": 0.13775308430194855,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 7060
    },
    {
      "grad_norm": 0.06780196726322174,
      "learning_rate": 0.0002870200468784771,
      "loss": 0.4234,
      "step": 7070
    },
    {
      "gate_value": 0.13795600831508636,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 7070
    },
    {
      "grad_norm": 0.38374415040016174,
      "learning_rate": 0.00028696953869798784,
      "loss": 0.441,
      "step": 7080
    },
    {
      "gate_value": 0.13810260593891144,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 7080
    },
    {
      "grad_norm": 0.09250040352344513,
      "learning_rate": 0.00028691893690011044,
      "loss": 0.4311,
      "step": 7090
    },
    {
      "gate_value": 0.13826362788677216,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 7090
    },
    {
      "grad_norm": 0.07739558070898056,
      "learning_rate": 0.00028686824151943067,
      "loss": 0.409,
      "step": 7100
    },
    {
      "gate_value": 0.13859573006629944,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 7100
    },
    {
      "grad_norm": 0.055045321583747864,
      "learning_rate": 0.0002868174525905985,
      "loss": 0.4195,
      "step": 7110
    },
    {
      "gate_value": 0.13870957493782043,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 7110
    },
    {
      "grad_norm": 0.07911080121994019,
      "learning_rate": 0.0002867665701483275,
      "loss": 0.4032,
      "step": 7120
    },
    {
      "gate_value": 0.1389653980731964,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 7120
    },
    {
      "grad_norm": 0.10772552341222763,
      "learning_rate": 0.0002867155942273955,
      "loss": 0.4077,
      "step": 7130
    },
    {
      "gate_value": 0.1391928791999817,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 7130
    },
    {
      "grad_norm": 0.10482678562402725,
      "learning_rate": 0.00028666452486264397,
      "loss": 0.3997,
      "step": 7140
    },
    {
      "gate_value": 0.13888011872768402,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 7140
    },
    {
      "grad_norm": 0.1086043193936348,
      "learning_rate": 0.00028661336208897834,
      "loss": 0.4302,
      "step": 7150
    },
    {
      "gate_value": 0.1387721598148346,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 7150
    },
    {
      "grad_norm": 0.04895766079425812,
      "learning_rate": 0.00028656210594136795,
      "loss": 0.4257,
      "step": 7160
    },
    {
      "gate_value": 0.13895726203918457,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 7160
    },
    {
      "grad_norm": 0.11810770630836487,
      "learning_rate": 0.00028651075645484583,
      "loss": 0.434,
      "step": 7170
    },
    {
      "gate_value": 0.1390237957239151,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 7170
    },
    {
      "grad_norm": 0.05848957598209381,
      "learning_rate": 0.00028645931366450895,
      "loss": 0.4324,
      "step": 7180
    },
    {
      "gate_value": 0.1383313685655594,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 7180
    },
    {
      "grad_norm": 0.160471111536026,
      "learning_rate": 0.0002864077776055178,
      "loss": 0.4032,
      "step": 7190
    },
    {
      "gate_value": 0.13848836719989777,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 7190
    },
    {
      "grad_norm": 0.048046983778476715,
      "learning_rate": 0.000286356148313097,
      "loss": 0.4204,
      "step": 7200
    },
    {
      "gate_value": 0.13866883516311646,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 7200
    },
    {
      "grad_norm": 0.2954862117767334,
      "learning_rate": 0.0002863044258225346,
      "loss": 0.4167,
      "step": 7210
    },
    {
      "gate_value": 0.1384095996618271,
      "icl_sequence_length": 58,
      "num_contexts": 3,
      "step": 7210
    },
    {
      "grad_norm": 0.05166821926832199,
      "learning_rate": 0.0002862526101691824,
      "loss": 0.4328,
      "step": 7220
    },
    {
      "gate_value": 0.1390955001115799,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 7220
    },
    {
      "grad_norm": 0.05716368183493614,
      "learning_rate": 0.0002862007013884559,
      "loss": 0.4287,
      "step": 7230
    },
    {
      "gate_value": 0.13900712132453918,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 7230
    },
    {
      "grad_norm": 0.13214118778705597,
      "learning_rate": 0.0002861486995158343,
      "loss": 0.4218,
      "step": 7240
    },
    {
      "gate_value": 0.13993008434772491,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 7240
    },
    {
      "grad_norm": 0.052904047071933746,
      "learning_rate": 0.00028609660458686045,
      "loss": 0.4396,
      "step": 7250
    },
    {
      "gate_value": 0.14042124152183533,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 7250
    },
    {
      "grad_norm": 0.13447511196136475,
      "learning_rate": 0.00028604441663714064,
      "loss": 0.4158,
      "step": 7260
    },
    {
      "gate_value": 0.14117123186588287,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 7260
    },
    {
      "grad_norm": 0.06425243616104126,
      "learning_rate": 0.00028599213570234486,
      "loss": 0.4152,
      "step": 7270
    },
    {
      "gate_value": 0.14089760184288025,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 7270
    },
    {
      "grad_norm": 0.053561557084321976,
      "learning_rate": 0.0002859397618182067,
      "loss": 0.4311,
      "step": 7280
    },
    {
      "gate_value": 0.140683114528656,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 7280
    },
    {
      "grad_norm": 0.1397479921579361,
      "learning_rate": 0.0002858872950205231,
      "loss": 0.4313,
      "step": 7290
    },
    {
      "gate_value": 0.1405588686466217,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 7290
    },
    {
      "grad_norm": 0.04807320237159729,
      "learning_rate": 0.0002858347353451548,
      "loss": 0.4294,
      "step": 7300
    },
    {
      "gate_value": 0.14052005112171173,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 7300
    },
    {
      "grad_norm": 0.055733080953359604,
      "learning_rate": 0.0002857820828280257,
      "loss": 0.4211,
      "step": 7310
    },
    {
      "gate_value": 0.140594482421875,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 7310
    },
    {
      "grad_norm": 0.048516325652599335,
      "learning_rate": 0.00028572933750512327,
      "loss": 0.4204,
      "step": 7320
    },
    {
      "gate_value": 0.14103937149047852,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 7320
    },
    {
      "grad_norm": 0.05917530134320259,
      "learning_rate": 0.00028567649941249856,
      "loss": 0.4381,
      "step": 7330
    },
    {
      "gate_value": 0.14163275063037872,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 7330
    },
    {
      "grad_norm": 0.06968928873538971,
      "learning_rate": 0.00028562356858626584,
      "loss": 0.4161,
      "step": 7340
    },
    {
      "gate_value": 0.141965851187706,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 7340
    },
    {
      "grad_norm": 0.07425074279308319,
      "learning_rate": 0.0002855705450626028,
      "loss": 0.4276,
      "step": 7350
    },
    {
      "gate_value": 0.14220909774303436,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 7350
    },
    {
      "grad_norm": 0.057516731321811676,
      "learning_rate": 0.00028551742887775064,
      "loss": 0.4164,
      "step": 7360
    },
    {
      "gate_value": 0.14302530884742737,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 7360
    },
    {
      "grad_norm": 0.1238698661327362,
      "learning_rate": 0.0002854642200680137,
      "loss": 0.4192,
      "step": 7370
    },
    {
      "gate_value": 0.1436849981546402,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 7370
    },
    {
      "grad_norm": 0.056957390159368515,
      "learning_rate": 0.00028541091866975967,
      "loss": 0.4155,
      "step": 7380
    },
    {
      "gate_value": 0.14387303590774536,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 7380
    },
    {
      "grad_norm": 0.04767700284719467,
      "learning_rate": 0.0002853575247194195,
      "loss": 0.4141,
      "step": 7390
    },
    {
      "gate_value": 0.14421603083610535,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 7390
    },
    {
      "grad_norm": 0.0633358284831047,
      "learning_rate": 0.0002853040382534876,
      "loss": 0.4307,
      "step": 7400
    },
    {
      "gate_value": 0.14420627057552338,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 7400
    },
    {
      "grad_norm": 0.06384849548339844,
      "learning_rate": 0.0002852504593085214,
      "loss": 0.4199,
      "step": 7410
    },
    {
      "gate_value": 0.14376404881477356,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 7410
    },
    {
      "grad_norm": 0.05320499837398529,
      "learning_rate": 0.0002851967879211416,
      "loss": 0.421,
      "step": 7420
    },
    {
      "gate_value": 0.1437148004770279,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 7420
    },
    {
      "grad_norm": 0.038637008517980576,
      "learning_rate": 0.0002851430241280321,
      "loss": 0.4026,
      "step": 7430
    },
    {
      "gate_value": 0.1436859667301178,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 7430
    },
    {
      "grad_norm": 0.10469973087310791,
      "learning_rate": 0.0002850891679659399,
      "loss": 0.4215,
      "step": 7440
    },
    {
      "gate_value": 0.14389364421367645,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 7440
    },
    {
      "grad_norm": 0.055326368659734726,
      "learning_rate": 0.0002850352194716752,
      "loss": 0.4067,
      "step": 7450
    },
    {
      "gate_value": 0.1442284882068634,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 7450
    },
    {
      "grad_norm": 0.08267883211374283,
      "learning_rate": 0.00028498117868211133,
      "loss": 0.4256,
      "step": 7460
    },
    {
      "gate_value": 0.14436683058738708,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 7460
    },
    {
      "grad_norm": 0.0570523627102375,
      "learning_rate": 0.00028492704563418467,
      "loss": 0.4225,
      "step": 7470
    },
    {
      "gate_value": 0.14424049854278564,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 7470
    },
    {
      "grad_norm": 0.10873137414455414,
      "learning_rate": 0.00028487282036489454,
      "loss": 0.4094,
      "step": 7480
    },
    {
      "gate_value": 0.14400343596935272,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 7480
    },
    {
      "grad_norm": 0.06366758048534393,
      "learning_rate": 0.0002848185029113034,
      "loss": 0.414,
      "step": 7490
    },
    {
      "gate_value": 0.1446097046136856,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 7490
    },
    {
      "grad_norm": 0.07260235399007797,
      "learning_rate": 0.00028476409331053694,
      "loss": 0.4108,
      "step": 7500
    },
    {
      "gate_value": 0.14570096135139465,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 7500
    },
    {
      "grad_norm": 0.0475568063557148,
      "learning_rate": 0.00028470959159978334,
      "loss": 0.4275,
      "step": 7510
    },
    {
      "gate_value": 0.1455191820859909,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 7510
    },
    {
      "grad_norm": 0.06414615362882614,
      "learning_rate": 0.0002846549978162941,
      "loss": 0.4183,
      "step": 7520
    },
    {
      "gate_value": 0.14565055072307587,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 7520
    },
    {
      "grad_norm": 0.08015228062868118,
      "learning_rate": 0.0002846003119973837,
      "loss": 0.4263,
      "step": 7530
    },
    {
      "gate_value": 0.14574910700321198,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 7530
    },
    {
      "grad_norm": 0.04416521638631821,
      "learning_rate": 0.00028454553418042915,
      "loss": 0.4165,
      "step": 7540
    },
    {
      "gate_value": 0.14608663320541382,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 7540
    },
    {
      "grad_norm": 0.1554374396800995,
      "learning_rate": 0.00028449066440287065,
      "loss": 0.4153,
      "step": 7550
    },
    {
      "gate_value": 0.14639393985271454,
      "icl_sequence_length": 56,
      "num_contexts": 3,
      "step": 7550
    },
    {
      "grad_norm": 0.07977347075939178,
      "learning_rate": 0.0002844357027022113,
      "loss": 0.4235,
      "step": 7560
    },
    {
      "gate_value": 0.1465524137020111,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 7560
    },
    {
      "grad_norm": 0.09290315210819244,
      "learning_rate": 0.00028438064911601673,
      "loss": 0.4218,
      "step": 7570
    },
    {
      "gate_value": 0.14587721228599548,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 7570
    },
    {
      "grad_norm": 0.04635458439588547,
      "learning_rate": 0.00028432550368191566,
      "loss": 0.429,
      "step": 7580
    },
    {
      "gate_value": 0.1461162567138672,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 7580
    },
    {
      "grad_norm": 0.04780597984790802,
      "learning_rate": 0.0002842702664375994,
      "loss": 0.4112,
      "step": 7590
    },
    {
      "gate_value": 0.1462957113981247,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 7590
    },
    {
      "grad_norm": 0.08960922062397003,
      "learning_rate": 0.0002842149374208222,
      "loss": 0.4171,
      "step": 7600
    },
    {
      "gate_value": 0.14661051332950592,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 7600
    },
    {
      "grad_norm": 0.074715755879879,
      "learning_rate": 0.00028415951666940076,
      "loss": 0.4247,
      "step": 7610
    },
    {
      "gate_value": 0.14733080565929413,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 7610
    },
    {
      "grad_norm": 0.044907812029123306,
      "learning_rate": 0.00028410400422121477,
      "loss": 0.3986,
      "step": 7620
    },
    {
      "gate_value": 0.14742633700370789,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 7620
    },
    {
      "grad_norm": 0.09000971168279648,
      "learning_rate": 0.00028404840011420643,
      "loss": 0.4254,
      "step": 7630
    },
    {
      "gate_value": 0.14711321890354156,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 7630
    },
    {
      "grad_norm": 0.07416932284832001,
      "learning_rate": 0.00028399270438638055,
      "loss": 0.3919,
      "step": 7640
    },
    {
      "gate_value": 0.14702820777893066,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 7640
    },
    {
      "grad_norm": 0.11321912705898285,
      "learning_rate": 0.00028393691707580477,
      "loss": 0.4133,
      "step": 7650
    },
    {
      "gate_value": 0.14669635891914368,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 7650
    },
    {
      "grad_norm": 0.09754902869462967,
      "learning_rate": 0.00028388103822060907,
      "loss": 0.4216,
      "step": 7660
    },
    {
      "gate_value": 0.14687460660934448,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 7660
    },
    {
      "grad_norm": 0.07661180943250656,
      "learning_rate": 0.0002838250678589862,
      "loss": 0.412,
      "step": 7670
    },
    {
      "gate_value": 0.14798735082149506,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 7670
    },
    {
      "grad_norm": 0.13970467448234558,
      "learning_rate": 0.0002837690060291913,
      "loss": 0.4249,
      "step": 7680
    },
    {
      "gate_value": 0.14825834333896637,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 7680
    },
    {
      "grad_norm": 0.13393306732177734,
      "learning_rate": 0.0002837128527695422,
      "loss": 0.4272,
      "step": 7690
    },
    {
      "gate_value": 0.1480855494737625,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 7690
    },
    {
      "grad_norm": 0.04212678596377373,
      "learning_rate": 0.00028365660811841903,
      "loss": 0.4281,
      "step": 7700
    },
    {
      "gate_value": 0.147964209318161,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 7700
    },
    {
      "grad_norm": 0.046316344290971756,
      "learning_rate": 0.00028360027211426456,
      "loss": 0.3971,
      "step": 7710
    },
    {
      "gate_value": 0.14794059097766876,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 7710
    },
    {
      "grad_norm": 0.10596490651369095,
      "learning_rate": 0.00028354384479558384,
      "loss": 0.4446,
      "step": 7720
    },
    {
      "gate_value": 0.14815928041934967,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 7720
    },
    {
      "grad_norm": 0.0596347413957119,
      "learning_rate": 0.0002834873262009444,
      "loss": 0.4253,
      "step": 7730
    },
    {
      "gate_value": 0.14768116176128387,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 7730
    },
    {
      "grad_norm": 0.08078193664550781,
      "learning_rate": 0.0002834307163689763,
      "loss": 0.4267,
      "step": 7740
    },
    {
      "gate_value": 0.1484566181898117,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 7740
    },
    {
      "grad_norm": 0.11879950016736984,
      "learning_rate": 0.0002833740153383717,
      "loss": 0.418,
      "step": 7750
    },
    {
      "gate_value": 0.14842697978019714,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 7750
    },
    {
      "grad_norm": 0.06142803281545639,
      "learning_rate": 0.0002833172231478853,
      "loss": 0.4145,
      "step": 7760
    },
    {
      "gate_value": 0.1485043615102768,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 7760
    },
    {
      "grad_norm": 0.046991124749183655,
      "learning_rate": 0.0002832603398363339,
      "loss": 0.4237,
      "step": 7770
    },
    {
      "gate_value": 0.14781975746154785,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 7770
    },
    {
      "grad_norm": 0.08748679608106613,
      "learning_rate": 0.00028320336544259686,
      "loss": 0.4058,
      "step": 7780
    },
    {
      "gate_value": 0.14809411764144897,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 7780
    },
    {
      "grad_norm": 0.12719684839248657,
      "learning_rate": 0.0002831463000056156,
      "loss": 0.4112,
      "step": 7790
    },
    {
      "gate_value": 0.14806553721427917,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 7790
    },
    {
      "grad_norm": 0.08230043202638626,
      "learning_rate": 0.00028308914356439365,
      "loss": 0.41,
      "step": 7800
    },
    {
      "gate_value": 0.1477605700492859,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 7800
    },
    {
      "grad_norm": 0.053649622946977615,
      "learning_rate": 0.00028303189615799714,
      "loss": 0.4172,
      "step": 7810
    },
    {
      "gate_value": 0.148033007979393,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 7810
    },
    {
      "grad_norm": 0.12399522960186005,
      "learning_rate": 0.00028297455782555394,
      "loss": 0.3981,
      "step": 7820
    },
    {
      "gate_value": 0.148561492562294,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 7820
    },
    {
      "grad_norm": 0.07105452567338943,
      "learning_rate": 0.00028291712860625443,
      "loss": 0.4428,
      "step": 7830
    },
    {
      "gate_value": 0.14870916306972504,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 7830
    },
    {
      "grad_norm": 0.04318685829639435,
      "learning_rate": 0.00028285960853935085,
      "loss": 0.4221,
      "step": 7840
    },
    {
      "gate_value": 0.14865200221538544,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 7840
    },
    {
      "grad_norm": 0.0866786539554596,
      "learning_rate": 0.00028280199766415756,
      "loss": 0.4105,
      "step": 7850
    },
    {
      "gate_value": 0.14928407967090607,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 7850
    },
    {
      "grad_norm": 0.10774104297161102,
      "learning_rate": 0.00028274429602005117,
      "loss": 0.4345,
      "step": 7860
    },
    {
      "gate_value": 0.15012405812740326,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 7860
    },
    {
      "grad_norm": 0.08425111323595047,
      "learning_rate": 0.0002826865036464701,
      "loss": 0.4122,
      "step": 7870
    },
    {
      "gate_value": 0.15047207474708557,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 7870
    },
    {
      "grad_norm": 0.09827083349227905,
      "learning_rate": 0.00028262862058291496,
      "loss": 0.4299,
      "step": 7880
    },
    {
      "gate_value": 0.1503888964653015,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 7880
    },
    {
      "grad_norm": 0.12378929555416107,
      "learning_rate": 0.0002825706468689483,
      "loss": 0.4179,
      "step": 7890
    },
    {
      "gate_value": 0.15025150775909424,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 7890
    },
    {
      "grad_norm": 0.11086408793926239,
      "learning_rate": 0.00028251258254419453,
      "loss": 0.4237,
      "step": 7900
    },
    {
      "gate_value": 0.15031926333904266,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 7900
    },
    {
      "grad_norm": 0.06256461143493652,
      "learning_rate": 0.00028245442764834015,
      "loss": 0.4085,
      "step": 7910
    },
    {
      "gate_value": 0.15006960928440094,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 7910
    },
    {
      "grad_norm": 0.1430857628583908,
      "learning_rate": 0.0002823961822211334,
      "loss": 0.4089,
      "step": 7920
    },
    {
      "gate_value": 0.15034839510917664,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 7920
    },
    {
      "grad_norm": 0.0887841060757637,
      "learning_rate": 0.0002823378463023845,
      "loss": 0.4277,
      "step": 7930
    },
    {
      "gate_value": 0.151446133852005,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 7930
    },
    {
      "grad_norm": 0.06118936464190483,
      "learning_rate": 0.00028227941993196564,
      "loss": 0.4175,
      "step": 7940
    },
    {
      "gate_value": 0.1517830491065979,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 7940
    },
    {
      "grad_norm": 0.16387613117694855,
      "learning_rate": 0.0002822209031498105,
      "loss": 0.4054,
      "step": 7950
    },
    {
      "gate_value": 0.15202456712722778,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 7950
    },
    {
      "grad_norm": 0.046996645629405975,
      "learning_rate": 0.0002821622959959148,
      "loss": 0.4169,
      "step": 7960
    },
    {
      "gate_value": 0.15204818546772003,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 7960
    },
    {
      "grad_norm": 0.15682649612426758,
      "learning_rate": 0.00028210359851033604,
      "loss": 0.4187,
      "step": 7970
    },
    {
      "gate_value": 0.15276777744293213,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 7970
    },
    {
      "grad_norm": 0.15045514702796936,
      "learning_rate": 0.0002820448107331934,
      "loss": 0.409,
      "step": 7980
    },
    {
      "gate_value": 0.15333126485347748,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 7980
    },
    {
      "grad_norm": 0.06689872592687607,
      "learning_rate": 0.0002819859327046677,
      "loss": 0.4305,
      "step": 7990
    },
    {
      "gate_value": 0.1544475108385086,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 7990
    },
    {
      "grad_norm": 0.09526379406452179,
      "learning_rate": 0.0002819269644650015,
      "loss": 0.4142,
      "step": 8000
    },
    {
      "gate_value": 0.15457449853420258,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 8000
    },
    {
      "grad_norm": 0.10891684144735336,
      "learning_rate": 0.0002818679060544991,
      "loss": 0.4212,
      "step": 8010
    },
    {
      "gate_value": 0.15380233526229858,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 8010
    },
    {
      "grad_norm": 0.12320034205913544,
      "learning_rate": 0.0002818087575135264,
      "loss": 0.4263,
      "step": 8020
    },
    {
      "gate_value": 0.15353891253471375,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 8020
    },
    {
      "grad_norm": 0.09334403276443481,
      "learning_rate": 0.0002817495188825108,
      "loss": 0.405,
      "step": 8030
    },
    {
      "gate_value": 0.1534106582403183,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 8030
    },
    {
      "grad_norm": 0.0722557082772255,
      "learning_rate": 0.00028169019020194135,
      "loss": 0.4269,
      "step": 8040
    },
    {
      "gate_value": 0.15350815653800964,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 8040
    },
    {
      "grad_norm": 0.060502052307128906,
      "learning_rate": 0.00028163077151236864,
      "loss": 0.4046,
      "step": 8050
    },
    {
      "gate_value": 0.15341298282146454,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 8050
    },
    {
      "grad_norm": 0.1101309210062027,
      "learning_rate": 0.00028157126285440485,
      "loss": 0.419,
      "step": 8060
    },
    {
      "gate_value": 0.15298579633235931,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 8060
    },
    {
      "grad_norm": 0.07704024761915207,
      "learning_rate": 0.0002815116642687236,
      "loss": 0.4242,
      "step": 8070
    },
    {
      "gate_value": 0.15233850479125977,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 8070
    },
    {
      "grad_norm": 0.08202876150608063,
      "learning_rate": 0.0002814519757960598,
      "loss": 0.4034,
      "step": 8080
    },
    {
      "gate_value": 0.1520369052886963,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 8080
    },
    {
      "grad_norm": 0.07245134562253952,
      "learning_rate": 0.0002813921974772101,
      "loss": 0.4011,
      "step": 8090
    },
    {
      "gate_value": 0.1525239646434784,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 8090
    },
    {
      "grad_norm": 0.48970121145248413,
      "learning_rate": 0.00028133232935303234,
      "loss": 0.4046,
      "step": 8100
    },
    {
      "gate_value": 0.1524377316236496,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 8100
    },
    {
      "grad_norm": 0.8370404243469238,
      "learning_rate": 0.0002812723714644459,
      "loss": 0.4219,
      "step": 8110
    },
    {
      "gate_value": 0.15254035592079163,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 8110
    },
    {
      "grad_norm": 0.1069856658577919,
      "learning_rate": 0.0002812123238524314,
      "loss": 0.4224,
      "step": 8120
    },
    {
      "gate_value": 0.1529160439968109,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 8120
    },
    {
      "grad_norm": 0.10440300405025482,
      "learning_rate": 0.00028115218655803075,
      "loss": 0.4176,
      "step": 8130
    },
    {
      "gate_value": 0.1537504643201828,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 8130
    },
    {
      "grad_norm": 0.1068665012717247,
      "learning_rate": 0.0002810919596223474,
      "loss": 0.3981,
      "step": 8140
    },
    {
      "gate_value": 0.15441353619098663,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 8140
    },
    {
      "grad_norm": 0.21673692762851715,
      "learning_rate": 0.0002810316430865456,
      "loss": 0.4008,
      "step": 8150
    },
    {
      "gate_value": 0.15410266816616058,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 8150
    },
    {
      "grad_norm": 0.17246730625629425,
      "learning_rate": 0.0002809712369918514,
      "loss": 0.4105,
      "step": 8160
    },
    {
      "gate_value": 0.15456920862197876,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 8160
    },
    {
      "grad_norm": 0.22932332754135132,
      "learning_rate": 0.0002809107413795517,
      "loss": 0.4179,
      "step": 8170
    },
    {
      "gate_value": 0.15458108484745026,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 8170
    },
    {
      "grad_norm": 0.10797584801912308,
      "learning_rate": 0.0002808501562909947,
      "loss": 0.4197,
      "step": 8180
    },
    {
      "gate_value": 0.1545572131872177,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 8180
    },
    {
      "grad_norm": 0.19922660291194916,
      "learning_rate": 0.0002807894817675897,
      "loss": 0.4092,
      "step": 8190
    },
    {
      "gate_value": 0.15465989708900452,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 8190
    },
    {
      "grad_norm": 0.19957950711250305,
      "learning_rate": 0.00028072871785080717,
      "loss": 0.422,
      "step": 8200
    },
    {
      "gate_value": 0.15546606481075287,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 8200
    },
    {
      "grad_norm": 0.5608637928962708,
      "learning_rate": 0.00028066786458217865,
      "loss": 0.4062,
      "step": 8210
    },
    {
      "gate_value": 0.1552291363477707,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 8210
    },
    {
      "grad_norm": 0.07581547647714615,
      "learning_rate": 0.0002806069220032969,
      "loss": 0.4056,
      "step": 8220
    },
    {
      "gate_value": 0.15533456206321716,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 8220
    },
    {
      "grad_norm": 0.07695662975311279,
      "learning_rate": 0.0002805458901558154,
      "loss": 0.4266,
      "step": 8230
    },
    {
      "gate_value": 0.15545304119586945,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 8230
    },
    {
      "grad_norm": 0.13097840547561646,
      "learning_rate": 0.00028048476908144903,
      "loss": 0.4256,
      "step": 8240
    },
    {
      "gate_value": 0.15602801740169525,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 8240
    },
    {
      "grad_norm": 0.21157461404800415,
      "learning_rate": 0.00028042355882197336,
      "loss": 0.42,
      "step": 8250
    },
    {
      "gate_value": 0.15621256828308105,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 8250
    },
    {
      "grad_norm": 0.21507303416728973,
      "learning_rate": 0.0002803622594192251,
      "loss": 0.4267,
      "step": 8260
    },
    {
      "gate_value": 0.1561337411403656,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 8260
    },
    {
      "grad_norm": 0.08249134570360184,
      "learning_rate": 0.00028030087091510174,
      "loss": 0.4033,
      "step": 8270
    },
    {
      "gate_value": 0.15637017786502838,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 8270
    },
    {
      "grad_norm": 0.09764933586120605,
      "learning_rate": 0.0002802393933515618,
      "loss": 0.4019,
      "step": 8280
    },
    {
      "gate_value": 0.15595419704914093,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 8280
    },
    {
      "grad_norm": 0.31223252415657043,
      "learning_rate": 0.00028017782677062456,
      "loss": 0.4018,
      "step": 8290
    },
    {
      "gate_value": 0.1560358852148056,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 8290
    },
    {
      "grad_norm": 0.6203545331954956,
      "learning_rate": 0.0002801161712143702,
      "loss": 0.4075,
      "step": 8300
    },
    {
      "gate_value": 0.15645352005958557,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 8300
    },
    {
      "grad_norm": 0.19463717937469482,
      "learning_rate": 0.0002800544267249398,
      "loss": 0.4133,
      "step": 8310
    },
    {
      "gate_value": 0.1570998579263687,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 8310
    },
    {
      "grad_norm": 0.09994322806596756,
      "learning_rate": 0.00027999259334453503,
      "loss": 0.4043,
      "step": 8320
    },
    {
      "gate_value": 0.15745465457439423,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 8320
    },
    {
      "grad_norm": 0.07742167264223099,
      "learning_rate": 0.0002799306711154185,
      "loss": 0.4041,
      "step": 8330
    },
    {
      "gate_value": 0.15763244032859802,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 8330
    },
    {
      "grad_norm": 0.16606910526752472,
      "learning_rate": 0.0002798686600799134,
      "loss": 0.4279,
      "step": 8340
    },
    {
      "gate_value": 0.15786494314670563,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 8340
    },
    {
      "grad_norm": 0.10738826543092728,
      "learning_rate": 0.00027980656028040373,
      "loss": 0.4096,
      "step": 8350
    },
    {
      "gate_value": 0.15786395967006683,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 8350
    },
    {
      "grad_norm": 0.15681566298007965,
      "learning_rate": 0.0002797443717593341,
      "loss": 0.4092,
      "step": 8360
    },
    {
      "gate_value": 0.15748639404773712,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 8360
    },
    {
      "grad_norm": 0.19968372583389282,
      "learning_rate": 0.0002796820945592098,
      "loss": 0.4107,
      "step": 8370
    },
    {
      "gate_value": 0.1574399322271347,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 8370
    },
    {
      "grad_norm": 0.08654113113880157,
      "learning_rate": 0.00027961972872259675,
      "loss": 0.4032,
      "step": 8380
    },
    {
      "gate_value": 0.1580353081226349,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 8380
    },
    {
      "grad_norm": 0.17560425400733948,
      "learning_rate": 0.0002795572742921213,
      "loss": 0.4045,
      "step": 8390
    },
    {
      "gate_value": 0.15868164598941803,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 8390
    },
    {
      "grad_norm": 0.14503097534179688,
      "learning_rate": 0.0002794947313104705,
      "loss": 0.4134,
      "step": 8400
    },
    {
      "gate_value": 0.15872491896152496,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 8400
    },
    {
      "grad_norm": 0.5805841684341431,
      "learning_rate": 0.00027943209982039195,
      "loss": 0.4269,
      "step": 8410
    },
    {
      "gate_value": 0.15834017097949982,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 8410
    },
    {
      "grad_norm": 0.26208043098449707,
      "learning_rate": 0.0002793693798646937,
      "loss": 0.4062,
      "step": 8420
    },
    {
      "gate_value": 0.15807278454303741,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 8420
    },
    {
      "grad_norm": 0.13806024193763733,
      "learning_rate": 0.00027930657148624407,
      "loss": 0.4181,
      "step": 8430
    },
    {
      "gate_value": 0.1582920253276825,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 8430
    },
    {
      "grad_norm": 0.1110081896185875,
      "learning_rate": 0.0002792436747279722,
      "loss": 0.411,
      "step": 8440
    },
    {
      "gate_value": 0.159070685505867,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 8440
    },
    {
      "grad_norm": 0.16463018953800201,
      "learning_rate": 0.0002791806896328673,
      "loss": 0.4149,
      "step": 8450
    },
    {
      "gate_value": 0.15995164215564728,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 8450
    },
    {
      "grad_norm": 0.3234344720840454,
      "learning_rate": 0.0002791176162439792,
      "loss": 0.4171,
      "step": 8460
    },
    {
      "gate_value": 0.160800963640213,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 8460
    },
    {
      "grad_norm": 0.14832444489002228,
      "learning_rate": 0.0002790544546044179,
      "loss": 0.4228,
      "step": 8470
    },
    {
      "gate_value": 0.16142258048057556,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 8470
    },
    {
      "grad_norm": 0.14149171113967896,
      "learning_rate": 0.00027899120475735373,
      "loss": 0.4124,
      "step": 8480
    },
    {
      "gate_value": 0.16140305995941162,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 8480
    },
    {
      "grad_norm": 0.4263879656791687,
      "learning_rate": 0.00027892786674601745,
      "loss": 0.4111,
      "step": 8490
    },
    {
      "gate_value": 0.16139687597751617,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 8490
    },
    {
      "grad_norm": 0.23564587533473969,
      "learning_rate": 0.0002788644406137,
      "loss": 0.4178,
      "step": 8500
    },
    {
      "gate_value": 0.16113747656345367,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 8500
    },
    {
      "grad_norm": 0.10173361003398895,
      "learning_rate": 0.00027880092640375243,
      "loss": 0.4209,
      "step": 8510
    },
    {
      "gate_value": 0.16043519973754883,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 8510
    },
    {
      "grad_norm": 0.34818899631500244,
      "learning_rate": 0.00027873732415958626,
      "loss": 0.4267,
      "step": 8520
    },
    {
      "gate_value": 0.16010433435440063,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 8520
    },
    {
      "grad_norm": 0.12077205628156662,
      "learning_rate": 0.0002786736339246729,
      "loss": 0.4135,
      "step": 8530
    },
    {
      "gate_value": 0.16009613871574402,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 8530
    },
    {
      "grad_norm": 0.4177238345146179,
      "learning_rate": 0.0002786098557425441,
      "loss": 0.4262,
      "step": 8540
    },
    {
      "gate_value": 0.16055171191692352,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 8540
    },
    {
      "grad_norm": 0.12868152558803558,
      "learning_rate": 0.0002785459896567916,
      "loss": 0.3945,
      "step": 8550
    },
    {
      "gate_value": 0.16079725325107574,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 8550
    },
    {
      "grad_norm": 0.12423283606767654,
      "learning_rate": 0.0002784820357110673,
      "loss": 0.4073,
      "step": 8560
    },
    {
      "gate_value": 0.16120773553848267,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 8560
    },
    {
      "grad_norm": 0.16405817866325378,
      "learning_rate": 0.00027841799394908313,
      "loss": 0.4028,
      "step": 8570
    },
    {
      "gate_value": 0.16189523041248322,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 8570
    },
    {
      "grad_norm": 0.11654902249574661,
      "learning_rate": 0.0002783538644146109,
      "loss": 0.4131,
      "step": 8580
    },
    {
      "gate_value": 0.16290602087974548,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 8580
    },
    {
      "grad_norm": 0.11591009795665741,
      "learning_rate": 0.00027828964715148277,
      "loss": 0.4171,
      "step": 8590
    },
    {
      "gate_value": 0.16335241496562958,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 8590
    },
    {
      "grad_norm": 0.3086751699447632,
      "learning_rate": 0.0002782253422035905,
      "loss": 0.3931,
      "step": 8600
    },
    {
      "gate_value": 0.1638471484184265,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 8600
    },
    {
      "grad_norm": 0.4067281484603882,
      "learning_rate": 0.00027816094961488586,
      "loss": 0.4236,
      "step": 8610
    },
    {
      "gate_value": 0.16402751207351685,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 8610
    },
    {
      "grad_norm": 0.4020434021949768,
      "learning_rate": 0.00027809646942938065,
      "loss": 0.4152,
      "step": 8620
    },
    {
      "gate_value": 0.16374509036540985,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 8620
    },
    {
      "grad_norm": 0.11254820227622986,
      "learning_rate": 0.0002780319016911465,
      "loss": 0.3941,
      "step": 8630
    },
    {
      "gate_value": 0.16366678476333618,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 8630
    },
    {
      "grad_norm": 0.2048700600862503,
      "learning_rate": 0.00027796724644431483,
      "loss": 0.428,
      "step": 8640
    },
    {
      "gate_value": 0.1640690714120865,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 8640
    },
    {
      "grad_norm": 0.10903961211442947,
      "learning_rate": 0.0002779025037330768,
      "loss": 0.415,
      "step": 8650
    },
    {
      "gate_value": 0.1633603423833847,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 8650
    },
    {
      "grad_norm": 0.17407450079917908,
      "learning_rate": 0.00027783767360168356,
      "loss": 0.4091,
      "step": 8660
    },
    {
      "gate_value": 0.16377109289169312,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 8660
    },
    {
      "grad_norm": 0.18325786292552948,
      "learning_rate": 0.00027777275609444587,
      "loss": 0.4159,
      "step": 8670
    },
    {
      "gate_value": 0.16425953805446625,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 8670
    },
    {
      "grad_norm": 0.957232654094696,
      "learning_rate": 0.0002777077512557342,
      "loss": 0.4066,
      "step": 8680
    },
    {
      "gate_value": 0.1641382873058319,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 8680
    },
    {
      "grad_norm": 0.2491225153207779,
      "learning_rate": 0.0002776426591299787,
      "loss": 0.4108,
      "step": 8690
    },
    {
      "gate_value": 0.16421766579151154,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 8690
    },
    {
      "grad_norm": 3.7188527584075928,
      "learning_rate": 0.00027757747976166935,
      "loss": 0.4058,
      "step": 8700
    },
    {
      "gate_value": 0.16427725553512573,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 8700
    },
    {
      "grad_norm": 0.4891299307346344,
      "learning_rate": 0.00027751221319535557,
      "loss": 0.4075,
      "step": 8710
    },
    {
      "gate_value": 0.16449934244155884,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 8710
    },
    {
      "grad_norm": 0.20242206752300262,
      "learning_rate": 0.0002774468594756464,
      "loss": 0.4026,
      "step": 8720
    },
    {
      "gate_value": 0.1650473028421402,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 8720
    },
    {
      "grad_norm": 0.14760473370552063,
      "learning_rate": 0.0002773814186472106,
      "loss": 0.3994,
      "step": 8730
    },
    {
      "gate_value": 0.16507603228092194,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 8730
    },
    {
      "grad_norm": 0.14821743965148926,
      "learning_rate": 0.00027731589075477624,
      "loss": 0.4118,
      "step": 8740
    },
    {
      "gate_value": 0.1656453013420105,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 8740
    },
    {
      "grad_norm": 0.7819263935089111,
      "learning_rate": 0.00027725027584313104,
      "loss": 0.399,
      "step": 8750
    },
    {
      "gate_value": 0.1657910943031311,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 8750
    },
    {
      "grad_norm": 0.29498928785324097,
      "learning_rate": 0.0002771845739571222,
      "loss": 0.4329,
      "step": 8760
    },
    {
      "gate_value": 0.16638685762882233,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 8760
    },
    {
      "grad_norm": 0.2815771996974945,
      "learning_rate": 0.0002771187851416564,
      "loss": 0.4159,
      "step": 8770
    },
    {
      "gate_value": 0.1663895696401596,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 8770
    },
    {
      "grad_norm": 0.11321847885847092,
      "learning_rate": 0.0002770529094416996,
      "loss": 0.4172,
      "step": 8780
    },
    {
      "gate_value": 0.16677428781986237,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 8780
    },
    {
      "grad_norm": 0.1718282252550125,
      "learning_rate": 0.0002769869469022772,
      "loss": 0.4133,
      "step": 8790
    },
    {
      "gate_value": 0.16660121083259583,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 8790
    },
    {
      "grad_norm": 0.3674944043159485,
      "learning_rate": 0.000276920897568474,
      "loss": 0.4151,
      "step": 8800
    },
    {
      "gate_value": 0.16616857051849365,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 8800
    },
    {
      "grad_norm": 0.262675940990448,
      "learning_rate": 0.00027685476148543416,
      "loss": 0.4092,
      "step": 8810
    },
    {
      "gate_value": 0.16660258173942566,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 8810
    },
    {
      "grad_norm": 0.4975556433200836,
      "learning_rate": 0.00027678853869836096,
      "loss": 0.4073,
      "step": 8820
    },
    {
      "gate_value": 0.16717791557312012,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 8820
    },
    {
      "grad_norm": 0.2040899395942688,
      "learning_rate": 0.0002767222292525171,
      "loss": 0.4238,
      "step": 8830
    },
    {
      "gate_value": 0.16736888885498047,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 8830
    },
    {
      "grad_norm": 0.1558050960302353,
      "learning_rate": 0.00027665583319322454,
      "loss": 0.4052,
      "step": 8840
    },
    {
      "gate_value": 0.1675145924091339,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 8840
    },
    {
      "grad_norm": 0.6124147176742554,
      "learning_rate": 0.0002765893505658642,
      "loss": 0.4167,
      "step": 8850
    },
    {
      "gate_value": 0.16753420233726501,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 8850
    },
    {
      "grad_norm": 0.25529617071151733,
      "learning_rate": 0.00027652278141587647,
      "loss": 0.4245,
      "step": 8860
    },
    {
      "gate_value": 0.16686958074569702,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 8860
    },
    {
      "grad_norm": 0.11387065052986145,
      "learning_rate": 0.00027645612578876066,
      "loss": 0.4092,
      "step": 8870
    },
    {
      "gate_value": 0.16669636964797974,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 8870
    },
    {
      "grad_norm": 0.0918634682893753,
      "learning_rate": 0.00027638938373007526,
      "loss": 0.4073,
      "step": 8880
    },
    {
      "gate_value": 0.1666368544101715,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 8880
    },
    {
      "grad_norm": 0.12662643194198608,
      "learning_rate": 0.00027632255528543787,
      "loss": 0.4127,
      "step": 8890
    },
    {
      "gate_value": 0.1672160029411316,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 8890
    },
    {
      "grad_norm": 0.10079085826873779,
      "learning_rate": 0.00027625564050052517,
      "loss": 0.4111,
      "step": 8900
    },
    {
      "gate_value": 0.16768445074558258,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 8900
    },
    {
      "grad_norm": 0.10930916666984558,
      "learning_rate": 0.0002761886394210726,
      "loss": 0.4033,
      "step": 8910
    },
    {
      "gate_value": 0.16912949085235596,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 8910
    },
    {
      "grad_norm": 0.12985733151435852,
      "learning_rate": 0.00027612155209287494,
      "loss": 0.3969,
      "step": 8920
    },
    {
      "gate_value": 0.16979645192623138,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 8920
    },
    {
      "grad_norm": 0.09886956959962845,
      "learning_rate": 0.0002760543785617857,
      "loss": 0.4102,
      "step": 8930
    },
    {
      "gate_value": 0.1697724461555481,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 8930
    },
    {
      "grad_norm": 0.3233407735824585,
      "learning_rate": 0.0002759871188737173,
      "loss": 0.4139,
      "step": 8940
    },
    {
      "gate_value": 0.17055948078632355,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 8940
    },
    {
      "grad_norm": 0.15284964442253113,
      "learning_rate": 0.0002759197730746411,
      "loss": 0.4091,
      "step": 8950
    },
    {
      "gate_value": 0.1704828441143036,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 8950
    },
    {
      "grad_norm": 1.4669501781463623,
      "learning_rate": 0.0002758523412105874,
      "loss": 0.4177,
      "step": 8960
    },
    {
      "gate_value": 0.1702096164226532,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 8960
    },
    {
      "grad_norm": 0.29607024788856506,
      "learning_rate": 0.00027578482332764516,
      "loss": 0.3793,
      "step": 8970
    },
    {
      "gate_value": 0.1705409288406372,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 8970
    },
    {
      "grad_norm": 1.1997206211090088,
      "learning_rate": 0.0002757172194719623,
      "loss": 0.4155,
      "step": 8980
    },
    {
      "gate_value": 0.17043624818325043,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 8980
    },
    {
      "grad_norm": 1.4481717348098755,
      "learning_rate": 0.00027564952968974534,
      "loss": 0.4089,
      "step": 8990
    },
    {
      "gate_value": 0.17082171142101288,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 8990
    },
    {
      "grad_norm": 0.1889583021402359,
      "learning_rate": 0.00027558175402725963,
      "loss": 0.4162,
      "step": 9000
    },
    {
      "gate_value": 0.17097678780555725,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 9000
    },
    {
      "grad_norm": 0.22113603353500366,
      "learning_rate": 0.00027551389253082926,
      "loss": 0.4092,
      "step": 9010
    },
    {
      "gate_value": 0.1711142361164093,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 9010
    },
    {
      "grad_norm": 0.12217526137828827,
      "learning_rate": 0.00027544594524683683,
      "loss": 0.4106,
      "step": 9020
    },
    {
      "gate_value": 0.17178325355052948,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 9020
    },
    {
      "grad_norm": 0.4917551279067993,
      "learning_rate": 0.0002753779122217237,
      "loss": 0.4076,
      "step": 9030
    },
    {
      "gate_value": 0.17185628414154053,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 9030
    },
    {
      "grad_norm": 0.12286029756069183,
      "learning_rate": 0.00027530979350198987,
      "loss": 0.4132,
      "step": 9040
    },
    {
      "gate_value": 0.17211104929447174,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 9040
    },
    {
      "grad_norm": 0.1475491225719452,
      "learning_rate": 0.00027524158913419376,
      "loss": 0.4016,
      "step": 9050
    },
    {
      "gate_value": 0.17211231589317322,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 9050
    },
    {
      "grad_norm": 0.17296111583709717,
      "learning_rate": 0.0002751732991649524,
      "loss": 0.4067,
      "step": 9060
    },
    {
      "gate_value": 0.1717059165239334,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 9060
    },
    {
      "grad_norm": 4.388981819152832,
      "learning_rate": 0.0002751049236409414,
      "loss": 0.3944,
      "step": 9070
    },
    {
      "gate_value": 0.17243611812591553,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 9070
    },
    {
      "grad_norm": 0.2799641191959381,
      "learning_rate": 0.0002750364626088947,
      "loss": 0.4033,
      "step": 9080
    },
    {
      "gate_value": 0.17295986413955688,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 9080
    },
    {
      "grad_norm": 0.15736496448516846,
      "learning_rate": 0.0002749679161156049,
      "loss": 0.4135,
      "step": 9090
    },
    {
      "gate_value": 0.17265865206718445,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 9090
    },
    {
      "grad_norm": 0.14757375419139862,
      "learning_rate": 0.0002748992842079228,
      "loss": 0.4255,
      "step": 9100
    },
    {
      "gate_value": 0.1720341593027115,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 9100
    },
    {
      "grad_norm": 0.4037497639656067,
      "learning_rate": 0.0002748305669327577,
      "loss": 0.3946,
      "step": 9110
    },
    {
      "gate_value": 0.1721326857805252,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 9110
    },
    {
      "grad_norm": 0.11872224509716034,
      "learning_rate": 0.00027476176433707713,
      "loss": 0.3961,
      "step": 9120
    },
    {
      "gate_value": 0.17253492772579193,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 9120
    },
    {
      "grad_norm": 0.15447595715522766,
      "learning_rate": 0.0002746928764679071,
      "loss": 0.4018,
      "step": 9130
    },
    {
      "gate_value": 0.17255066335201263,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 9130
    },
    {
      "grad_norm": 0.7999837398529053,
      "learning_rate": 0.0002746239033723318,
      "loss": 0.4058,
      "step": 9140
    },
    {
      "gate_value": 0.17191246151924133,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 9140
    },
    {
      "grad_norm": 0.20919932425022125,
      "learning_rate": 0.0002745548450974936,
      "loss": 0.4074,
      "step": 9150
    },
    {
      "gate_value": 0.17232879996299744,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 9150
    },
    {
      "grad_norm": 0.595839262008667,
      "learning_rate": 0.0002744857016905933,
      "loss": 0.3921,
      "step": 9160
    },
    {
      "gate_value": 0.17247727513313293,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 9160
    },
    {
      "grad_norm": 0.5459367036819458,
      "learning_rate": 0.0002744164731988898,
      "loss": 0.4138,
      "step": 9170
    },
    {
      "gate_value": 0.17329102754592896,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 9170
    },
    {
      "grad_norm": 0.19087789952754974,
      "learning_rate": 0.00027434715966969997,
      "loss": 0.4062,
      "step": 9180
    },
    {
      "gate_value": 0.1738402396440506,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 9180
    },
    {
      "grad_norm": 0.21525757014751434,
      "learning_rate": 0.000274277761150399,
      "loss": 0.402,
      "step": 9190
    },
    {
      "gate_value": 0.1737126260995865,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 9190
    },
    {
      "grad_norm": 0.38013356924057007,
      "learning_rate": 0.00027420827768842023,
      "loss": 0.4117,
      "step": 9200
    },
    {
      "gate_value": 0.17394515872001648,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 9200
    },
    {
      "grad_norm": 0.17054623365402222,
      "learning_rate": 0.00027413870933125486,
      "loss": 0.4116,
      "step": 9210
    },
    {
      "gate_value": 0.173823744058609,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 9210
    },
    {
      "grad_norm": 0.16116704046726227,
      "learning_rate": 0.00027406905612645217,
      "loss": 0.4072,
      "step": 9220
    },
    {
      "gate_value": 0.17491909861564636,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 9220
    },
    {
      "grad_norm": 0.20770235359668732,
      "learning_rate": 0.00027399931812161957,
      "loss": 0.3936,
      "step": 9230
    },
    {
      "gate_value": 0.1753663271665573,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 9230
    },
    {
      "grad_norm": 0.12470269948244095,
      "learning_rate": 0.00027392949536442224,
      "loss": 0.407,
      "step": 9240
    },
    {
      "gate_value": 0.17491556704044342,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 9240
    },
    {
      "grad_norm": 0.29726898670196533,
      "learning_rate": 0.0002738595879025835,
      "loss": 0.4221,
      "step": 9250
    },
    {
      "gate_value": 0.1747361123561859,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 9250
    },
    {
      "grad_norm": 0.16315695643424988,
      "learning_rate": 0.0002737895957838842,
      "loss": 0.4042,
      "step": 9260
    },
    {
      "gate_value": 0.17513708770275116,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 9260
    },
    {
      "grad_norm": 0.3211088180541992,
      "learning_rate": 0.00027371951905616357,
      "loss": 0.4107,
      "step": 9270
    },
    {
      "gate_value": 0.17483621835708618,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 9270
    },
    {
      "grad_norm": 0.12201294302940369,
      "learning_rate": 0.00027364935776731826,
      "loss": 0.405,
      "step": 9280
    },
    {
      "gate_value": 0.1746746301651001,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 9280
    },
    {
      "grad_norm": 1.4433128833770752,
      "learning_rate": 0.00027357911196530284,
      "loss": 0.3999,
      "step": 9290
    },
    {
      "gate_value": 0.17453843355178833,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 9290
    },
    {
      "grad_norm": 0.13382488489151,
      "learning_rate": 0.0002735087816981296,
      "loss": 0.4227,
      "step": 9300
    },
    {
      "gate_value": 0.17486976087093353,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 9300
    },
    {
      "grad_norm": 0.3727971017360687,
      "learning_rate": 0.00027343836701386877,
      "loss": 0.4146,
      "step": 9310
    },
    {
      "gate_value": 0.17553885281085968,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 9310
    },
    {
      "grad_norm": 0.5436861515045166,
      "learning_rate": 0.00027336786796064807,
      "loss": 0.3989,
      "step": 9320
    },
    {
      "gate_value": 0.1751997321844101,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 9320
    },
    {
      "grad_norm": 0.17482315003871918,
      "learning_rate": 0.00027329728458665284,
      "loss": 0.4068,
      "step": 9330
    },
    {
      "gate_value": 0.17559026181697845,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 9330
    },
    {
      "grad_norm": 0.23877385258674622,
      "learning_rate": 0.0002732266169401262,
      "loss": 0.4171,
      "step": 9340
    },
    {
      "gate_value": 0.17537659406661987,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 9340
    },
    {
      "grad_norm": 0.09425515681505203,
      "learning_rate": 0.0002731558650693689,
      "loss": 0.4215,
      "step": 9350
    },
    {
      "gate_value": 0.175358846783638,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 9350
    },
    {
      "grad_norm": 0.14440549910068512,
      "learning_rate": 0.00027308502902273913,
      "loss": 0.418,
      "step": 9360
    },
    {
      "gate_value": 0.17526549100875854,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 9360
    },
    {
      "grad_norm": 0.13115571439266205,
      "learning_rate": 0.0002730141088486526,
      "loss": 0.4042,
      "step": 9370
    },
    {
      "gate_value": 0.17517532408237457,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 9370
    },
    {
      "grad_norm": 0.10693828016519547,
      "learning_rate": 0.0002729431045955826,
      "loss": 0.3931,
      "step": 9380
    },
    {
      "gate_value": 0.17489777505397797,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 9380
    },
    {
      "grad_norm": 0.3044837713241577,
      "learning_rate": 0.00027287201631205995,
      "loss": 0.397,
      "step": 9390
    },
    {
      "gate_value": 0.17526645958423615,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 9390
    },
    {
      "grad_norm": 0.3311840891838074,
      "learning_rate": 0.00027280084404667274,
      "loss": 0.4216,
      "step": 9400
    },
    {
      "gate_value": 0.17530137300491333,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 9400
    },
    {
      "grad_norm": 0.1382623314857483,
      "learning_rate": 0.0002727295878480666,
      "loss": 0.3987,
      "step": 9410
    },
    {
      "gate_value": 0.1757037341594696,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 9410
    },
    {
      "grad_norm": 0.2891187369823456,
      "learning_rate": 0.0002726582477649444,
      "loss": 0.4025,
      "step": 9420
    },
    {
      "gate_value": 0.1756793111562729,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 9420
    },
    {
      "grad_norm": 0.06408237665891647,
      "learning_rate": 0.00027258682384606646,
      "loss": 0.4079,
      "step": 9430
    },
    {
      "gate_value": 0.17592254281044006,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 9430
    },
    {
      "grad_norm": 0.18277965486049652,
      "learning_rate": 0.00027251531614025035,
      "loss": 0.3936,
      "step": 9440
    },
    {
      "gate_value": 0.1765175312757492,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 9440
    },
    {
      "grad_norm": 0.2279181033372879,
      "learning_rate": 0.00027244372469637087,
      "loss": 0.396,
      "step": 9450
    },
    {
      "gate_value": 0.17652124166488647,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 9450
    },
    {
      "grad_norm": 0.46269768476486206,
      "learning_rate": 0.0002723720495633602,
      "loss": 0.4162,
      "step": 9460
    },
    {
      "gate_value": 0.17670394480228424,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 9460
    },
    {
      "grad_norm": 0.5305395126342773,
      "learning_rate": 0.0002723002907902075,
      "loss": 0.4236,
      "step": 9470
    },
    {
      "gate_value": 0.1759636104106903,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 9470
    },
    {
      "grad_norm": 0.5236008167266846,
      "learning_rate": 0.0002722284484259593,
      "loss": 0.4034,
      "step": 9480
    },
    {
      "gate_value": 0.1763700544834137,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 9480
    },
    {
      "grad_norm": 0.11968474835157394,
      "learning_rate": 0.00027215652251971913,
      "loss": 0.3932,
      "step": 9490
    },
    {
      "gate_value": 0.1767255663871765,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 9490
    },
    {
      "grad_norm": 0.1513536274433136,
      "learning_rate": 0.0002720845131206477,
      "loss": 0.406,
      "step": 9500
    },
    {
      "gate_value": 0.1772974580526352,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 9500
    },
    {
      "grad_norm": 0.2298015058040619,
      "learning_rate": 0.00027201242027796274,
      "loss": 0.4074,
      "step": 9510
    },
    {
      "gate_value": 0.17706361413002014,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 9510
    },
    {
      "grad_norm": 0.1239701434969902,
      "learning_rate": 0.000271940244040939,
      "loss": 0.409,
      "step": 9520
    },
    {
      "gate_value": 0.17645961046218872,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 9520
    },
    {
      "grad_norm": 0.16889768838882446,
      "learning_rate": 0.0002718679844589083,
      "loss": 0.4144,
      "step": 9530
    },
    {
      "gate_value": 0.17701050639152527,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 9530
    },
    {
      "grad_norm": 1.308580994606018,
      "learning_rate": 0.0002717956415812594,
      "loss": 0.4063,
      "step": 9540
    },
    {
      "gate_value": 0.17713579535484314,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 9540
    },
    {
      "grad_norm": 0.1434451937675476,
      "learning_rate": 0.0002717232154574379,
      "loss": 0.3982,
      "step": 9550
    },
    {
      "gate_value": 0.1771237850189209,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 9550
    },
    {
      "grad_norm": 0.11620178818702698,
      "learning_rate": 0.0002716507061369464,
      "loss": 0.4241,
      "step": 9560
    },
    {
      "gate_value": 0.17687828838825226,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 9560
    },
    {
      "grad_norm": 0.29256099462509155,
      "learning_rate": 0.0002715781136693444,
      "loss": 0.4099,
      "step": 9570
    },
    {
      "gate_value": 0.17701010406017303,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 9570
    },
    {
      "grad_norm": 0.36338385939598083,
      "learning_rate": 0.00027150543810424815,
      "loss": 0.4106,
      "step": 9580
    },
    {
      "gate_value": 0.1774062216281891,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 9580
    },
    {
      "grad_norm": 0.1436304897069931,
      "learning_rate": 0.0002714326794913306,
      "loss": 0.4277,
      "step": 9590
    },
    {
      "gate_value": 0.1778738647699356,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 9590
    },
    {
      "grad_norm": 0.1371924877166748,
      "learning_rate": 0.0002713598378803217,
      "loss": 0.4125,
      "step": 9600
    },
    {
      "gate_value": 0.178547665476799,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 9600
    },
    {
      "grad_norm": 0.2031945139169693,
      "learning_rate": 0.000271286913321008,
      "loss": 0.4155,
      "step": 9610
    },
    {
      "gate_value": 0.17900629341602325,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 9610
    },
    {
      "grad_norm": 0.13193300366401672,
      "learning_rate": 0.00027121390586323264,
      "loss": 0.4101,
      "step": 9620
    },
    {
      "gate_value": 0.1796974092721939,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 9620
    },
    {
      "grad_norm": 0.11391983181238174,
      "learning_rate": 0.0002711408155568956,
      "loss": 0.4049,
      "step": 9630
    },
    {
      "gate_value": 0.17991317808628082,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 9630
    },
    {
      "grad_norm": 1.1728397607803345,
      "learning_rate": 0.0002710676424519535,
      "loss": 0.4106,
      "step": 9640
    },
    {
      "gate_value": 0.18047550320625305,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 9640
    },
    {
      "grad_norm": 0.14085493981838226,
      "learning_rate": 0.00027099438659841933,
      "loss": 0.3913,
      "step": 9650
    },
    {
      "gate_value": 0.18084587156772614,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 9650
    },
    {
      "grad_norm": 0.33219820261001587,
      "learning_rate": 0.0002709210480463628,
      "loss": 0.4001,
      "step": 9660
    },
    {
      "gate_value": 0.18111124634742737,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 9660
    },
    {
      "grad_norm": 0.1452653855085373,
      "learning_rate": 0.0002708476268459102,
      "loss": 0.4168,
      "step": 9670
    },
    {
      "gate_value": 0.1815754622220993,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 9670
    },
    {
      "grad_norm": 0.40567007660865784,
      "learning_rate": 0.0002707741230472442,
      "loss": 0.3981,
      "step": 9680
    },
    {
      "gate_value": 0.18233047425746918,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 9680
    },
    {
      "grad_norm": 0.13525651395320892,
      "learning_rate": 0.00027070053670060385,
      "loss": 0.4127,
      "step": 9690
    },
    {
      "gate_value": 0.1820317804813385,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 9690
    },
    {
      "grad_norm": 0.1818414181470871,
      "learning_rate": 0.0002706268678562849,
      "loss": 0.4117,
      "step": 9700
    },
    {
      "gate_value": 0.1822560578584671,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 9700
    },
    {
      "grad_norm": 0.4028673470020294,
      "learning_rate": 0.0002705531165646391,
      "loss": 0.3898,
      "step": 9710
    },
    {
      "gate_value": 0.1818368136882782,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 9710
    },
    {
      "grad_norm": 0.12147530168294907,
      "learning_rate": 0.00027047928287607495,
      "loss": 0.4118,
      "step": 9720
    },
    {
      "gate_value": 0.182128444314003,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 9720
    },
    {
      "grad_norm": 0.4396604299545288,
      "learning_rate": 0.000270405366841057,
      "loss": 0.4106,
      "step": 9730
    },
    {
      "gate_value": 0.18241579830646515,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 9730
    },
    {
      "grad_norm": 0.09294986724853516,
      "learning_rate": 0.0002703313685101062,
      "loss": 0.3941,
      "step": 9740
    },
    {
      "gate_value": 0.18249233067035675,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 9740
    },
    {
      "grad_norm": 0.15700602531433105,
      "learning_rate": 0.00027025728793379956,
      "loss": 0.412,
      "step": 9750
    },
    {
      "gate_value": 0.18315435945987701,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 9750
    },
    {
      "grad_norm": 0.7230283617973328,
      "learning_rate": 0.0002701831251627707,
      "loss": 0.4121,
      "step": 9760
    },
    {
      "gate_value": 0.1829233020544052,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 9760
    },
    {
      "grad_norm": 0.11449993401765823,
      "learning_rate": 0.000270108880247709,
      "loss": 0.397,
      "step": 9770
    },
    {
      "gate_value": 0.1825665831565857,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 9770
    },
    {
      "grad_norm": 0.2630598247051239,
      "learning_rate": 0.00027003455323936014,
      "loss": 0.4077,
      "step": 9780
    },
    {
      "gate_value": 0.18319743871688843,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 9780
    },
    {
      "grad_norm": 0.22726242244243622,
      "learning_rate": 0.00026996014418852616,
      "loss": 0.4094,
      "step": 9790
    },
    {
      "gate_value": 0.18323422968387604,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 9790
    },
    {
      "grad_norm": 0.13267017900943756,
      "learning_rate": 0.0002698856531460646,
      "loss": 0.3891,
      "step": 9800
    },
    {
      "gate_value": 0.1828533113002777,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 9800
    },
    {
      "grad_norm": 0.31722262501716614,
      "learning_rate": 0.0002698110801628897,
      "loss": 0.4149,
      "step": 9810
    },
    {
      "gate_value": 0.1827845424413681,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 9810
    },
    {
      "grad_norm": 1.0422130823135376,
      "learning_rate": 0.0002697364252899713,
      "loss": 0.4124,
      "step": 9820
    },
    {
      "gate_value": 0.18345683813095093,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 9820
    },
    {
      "grad_norm": 0.30988809466362,
      "learning_rate": 0.0002696616885783351,
      "loss": 0.4034,
      "step": 9830
    },
    {
      "gate_value": 0.18385672569274902,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 9830
    },
    {
      "grad_norm": 0.5596709847450256,
      "learning_rate": 0.0002695868700790632,
      "loss": 0.4102,
      "step": 9840
    },
    {
      "gate_value": 0.18392261862754822,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 9840
    },
    {
      "grad_norm": 0.3464175760746002,
      "learning_rate": 0.00026951196984329324,
      "loss": 0.3861,
      "step": 9850
    },
    {
      "gate_value": 0.18414254486560822,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 9850
    },
    {
      "grad_norm": 0.6886278390884399,
      "learning_rate": 0.00026943698792221876,
      "loss": 0.4217,
      "step": 9860
    },
    {
      "gate_value": 0.1840921938419342,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 9860
    },
    {
      "grad_norm": 0.1508173644542694,
      "learning_rate": 0.00026936192436708935,
      "loss": 0.4243,
      "step": 9870
    },
    {
      "gate_value": 0.1839398741722107,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 9870
    },
    {
      "grad_norm": 0.7154667377471924,
      "learning_rate": 0.0002692867792292101,
      "loss": 0.3945,
      "step": 9880
    },
    {
      "gate_value": 0.18328236043453217,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 9880
    },
    {
      "grad_norm": 0.453127920627594,
      "learning_rate": 0.000269211552559942,
      "loss": 0.3922,
      "step": 9890
    },
    {
      "gate_value": 0.1840886026620865,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 9890
    },
    {
      "grad_norm": 0.4540349543094635,
      "learning_rate": 0.0002691362444107019,
      "loss": 0.4095,
      "step": 9900
    },
    {
      "gate_value": 0.18400152027606964,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 9900
    },
    {
      "grad_norm": 0.09634116291999817,
      "learning_rate": 0.0002690608548329621,
      "loss": 0.3901,
      "step": 9910
    },
    {
      "gate_value": 0.1838119924068451,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 9910
    },
    {
      "grad_norm": 0.20187024772167206,
      "learning_rate": 0.00026898538387825076,
      "loss": 0.3852,
      "step": 9920
    },
    {
      "gate_value": 0.18371149897575378,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 9920
    },
    {
      "grad_norm": 0.14723998308181763,
      "learning_rate": 0.00026890983159815146,
      "loss": 0.4087,
      "step": 9930
    },
    {
      "gate_value": 0.18399539589881897,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 9930
    },
    {
      "grad_norm": 0.4817863702774048,
      "learning_rate": 0.00026883419804430347,
      "loss": 0.4053,
      "step": 9940
    },
    {
      "gate_value": 0.18433444201946259,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 9940
    },
    {
      "grad_norm": 0.10653712600469589,
      "learning_rate": 0.0002687584832684017,
      "loss": 0.4021,
      "step": 9950
    },
    {
      "gate_value": 0.18434855341911316,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 9950
    },
    {
      "grad_norm": 0.11284265667200089,
      "learning_rate": 0.00026868268732219646,
      "loss": 0.3996,
      "step": 9960
    },
    {
      "gate_value": 0.18480272591114044,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 9960
    },
    {
      "grad_norm": 1.6425282955169678,
      "learning_rate": 0.0002686068102574935,
      "loss": 0.4174,
      "step": 9970
    },
    {
      "gate_value": 0.18493328988552094,
      "icl_sequence_length": 96,
      "num_contexts": 3,
      "step": 9970
    },
    {
      "grad_norm": 0.20928919315338135,
      "learning_rate": 0.00026853085212615415,
      "loss": 0.4149,
      "step": 9980
    },
    {
      "gate_value": 0.18461304903030396,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 9980
    },
    {
      "grad_norm": 0.1534522920846939,
      "learning_rate": 0.0002684548129800951,
      "loss": 0.4061,
      "step": 9990
    },
    {
      "gate_value": 0.1850883811712265,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 9990
    },
    {
      "grad_norm": 0.11889275908470154,
      "learning_rate": 0.0002683786928712883,
      "loss": 0.4142,
      "step": 10000
    },
    {
      "gate_value": 0.1857037991285324,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 10000
    },
    {
      "grad_norm": 0.10349398106336594,
      "learning_rate": 0.0002683024918517611,
      "loss": 0.4084,
      "step": 10010
    },
    {
      "gate_value": 0.18580205738544464,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 10010
    },
    {
      "grad_norm": 0.20732319355010986,
      "learning_rate": 0.0002682262099735963,
      "loss": 0.4431,
      "step": 10020
    },
    {
      "gate_value": 0.18496911227703094,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 10020
    },
    {
      "grad_norm": 0.08580266684293747,
      "learning_rate": 0.0002681498472889318,
      "loss": 0.4009,
      "step": 10030
    },
    {
      "gate_value": 0.18453511595726013,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 10030
    },
    {
      "grad_norm": 0.14457011222839355,
      "learning_rate": 0.00026807340384996076,
      "loss": 0.4056,
      "step": 10040
    },
    {
      "gate_value": 0.18475934863090515,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 10040
    },
    {
      "grad_norm": 0.11152706295251846,
      "learning_rate": 0.00026799687970893157,
      "loss": 0.4083,
      "step": 10050
    },
    {
      "gate_value": 0.1854647696018219,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 10050
    },
    {
      "grad_norm": 0.20805339515209198,
      "learning_rate": 0.0002679202749181477,
      "loss": 0.4114,
      "step": 10060
    },
    {
      "gate_value": 0.1850779503583908,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 10060
    },
    {
      "grad_norm": 0.12921452522277832,
      "learning_rate": 0.00026784358952996784,
      "loss": 0.3928,
      "step": 10070
    },
    {
      "gate_value": 0.18519233167171478,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 10070
    },
    {
      "grad_norm": 0.6804535388946533,
      "learning_rate": 0.0002677668235968058,
      "loss": 0.4186,
      "step": 10080
    },
    {
      "gate_value": 0.18516965210437775,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 10080
    },
    {
      "grad_norm": 0.214371919631958,
      "learning_rate": 0.0002676899771711303,
      "loss": 0.4086,
      "step": 10090
    },
    {
      "gate_value": 0.18558774888515472,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 10090
    },
    {
      "grad_norm": 0.4606313407421112,
      "learning_rate": 0.0002676130503054651,
      "loss": 0.3933,
      "step": 10100
    },
    {
      "gate_value": 0.18566465377807617,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 10100
    },
    {
      "grad_norm": 0.08777086436748505,
      "learning_rate": 0.00026753604305238904,
      "loss": 0.4084,
      "step": 10110
    },
    {
      "gate_value": 0.18632765114307404,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 10110
    },
    {
      "grad_norm": 0.12021689116954803,
      "learning_rate": 0.00026745895546453587,
      "loss": 0.4083,
      "step": 10120
    },
    {
      "gate_value": 0.18685314059257507,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 10120
    },
    {
      "grad_norm": 0.3062693774700165,
      "learning_rate": 0.0002673817875945942,
      "loss": 0.4227,
      "step": 10130
    },
    {
      "gate_value": 0.18733736872673035,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 10130
    },
    {
      "grad_norm": 0.08411114662885666,
      "learning_rate": 0.0002673045394953076,
      "loss": 0.3954,
      "step": 10140
    },
    {
      "gate_value": 0.1871633529663086,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 10140
    },
    {
      "grad_norm": 0.221735417842865,
      "learning_rate": 0.00026722721121947435,
      "loss": 0.3988,
      "step": 10150
    },
    {
      "gate_value": 0.1876184344291687,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 10150
    },
    {
      "grad_norm": 0.2119043618440628,
      "learning_rate": 0.00026714980281994756,
      "loss": 0.4149,
      "step": 10160
    },
    {
      "gate_value": 0.18710008263587952,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 10160
    },
    {
      "grad_norm": 0.2251426726579666,
      "learning_rate": 0.0002670723143496353,
      "loss": 0.3877,
      "step": 10170
    },
    {
      "gate_value": 0.18695615231990814,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 10170
    },
    {
      "grad_norm": 0.12051714211702347,
      "learning_rate": 0.00026699474586150006,
      "loss": 0.4051,
      "step": 10180
    },
    {
      "gate_value": 0.18728826940059662,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 10180
    },
    {
      "grad_norm": 0.15876959264278412,
      "learning_rate": 0.0002669170974085592,
      "loss": 0.3975,
      "step": 10190
    },
    {
      "gate_value": 0.18744824826717377,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 10190
    },
    {
      "grad_norm": 0.06675142794847488,
      "learning_rate": 0.00026683936904388475,
      "loss": 0.4138,
      "step": 10200
    },
    {
      "gate_value": 0.18797366321086884,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 10200
    },
    {
      "grad_norm": 0.1978006511926651,
      "learning_rate": 0.00026676156082060324,
      "loss": 0.4155,
      "step": 10210
    },
    {
      "gate_value": 0.1884395033121109,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 10210
    },
    {
      "grad_norm": 0.13535474240779877,
      "learning_rate": 0.00026668367279189596,
      "loss": 0.4163,
      "step": 10220
    },
    {
      "gate_value": 0.18881042301654816,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 10220
    },
    {
      "grad_norm": 0.3524298071861267,
      "learning_rate": 0.0002666057050109986,
      "loss": 0.3984,
      "step": 10230
    },
    {
      "gate_value": 0.1897391676902771,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 10230
    },
    {
      "grad_norm": 0.11394257098436356,
      "learning_rate": 0.0002665276575312013,
      "loss": 0.4047,
      "step": 10240
    },
    {
      "gate_value": 0.18992333114147186,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 10240
    },
    {
      "grad_norm": 0.19303439557552338,
      "learning_rate": 0.000266449530405849,
      "loss": 0.4021,
      "step": 10250
    },
    {
      "gate_value": 0.1900814026594162,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 10250
    },
    {
      "grad_norm": 0.25334855914115906,
      "learning_rate": 0.0002663713236883406,
      "loss": 0.41,
      "step": 10260
    },
    {
      "gate_value": 0.19061408936977386,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 10260
    },
    {
      "grad_norm": 0.13257992267608643,
      "learning_rate": 0.00026629303743212984,
      "loss": 0.4089,
      "step": 10270
    },
    {
      "gate_value": 0.19116652011871338,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 10270
    },
    {
      "grad_norm": 0.10446831583976746,
      "learning_rate": 0.00026621467169072455,
      "loss": 0.4039,
      "step": 10280
    },
    {
      "gate_value": 0.19149279594421387,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 10280
    },
    {
      "grad_norm": 0.18859893083572388,
      "learning_rate": 0.00026613622651768703,
      "loss": 0.3911,
      "step": 10290
    },
    {
      "gate_value": 0.19110308587551117,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 10290
    },
    {
      "grad_norm": 0.1820632517337799,
      "learning_rate": 0.00026605770196663374,
      "loss": 0.4042,
      "step": 10300
    },
    {
      "gate_value": 0.19114728271961212,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 10300
    },
    {
      "grad_norm": 0.2477792650461197,
      "learning_rate": 0.0002659790980912355,
      "loss": 0.4068,
      "step": 10310
    },
    {
      "gate_value": 0.1913151741027832,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 10310
    },
    {
      "grad_norm": 0.14938776195049286,
      "learning_rate": 0.0002659004149452174,
      "loss": 0.4046,
      "step": 10320
    },
    {
      "gate_value": 0.19160404801368713,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 10320
    },
    {
      "grad_norm": 0.09767202287912369,
      "learning_rate": 0.0002658216525823585,
      "loss": 0.3943,
      "step": 10330
    },
    {
      "gate_value": 0.1918257772922516,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 10330
    },
    {
      "grad_norm": 0.10734827816486359,
      "learning_rate": 0.0002657428110564923,
      "loss": 0.4126,
      "step": 10340
    },
    {
      "gate_value": 0.1912240982055664,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 10340
    },
    {
      "grad_norm": 0.0930911973118782,
      "learning_rate": 0.00026566389042150597,
      "loss": 0.408,
      "step": 10350
    },
    {
      "gate_value": 0.19152508676052094,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 10350
    },
    {
      "grad_norm": 0.2743377089500427,
      "learning_rate": 0.0002655848907313413,
      "loss": 0.3837,
      "step": 10360
    },
    {
      "gate_value": 0.1920650601387024,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 10360
    },
    {
      "grad_norm": 0.34478214383125305,
      "learning_rate": 0.00026550581203999365,
      "loss": 0.396,
      "step": 10370
    },
    {
      "gate_value": 0.19311079382896423,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 10370
    },
    {
      "grad_norm": 0.8458656072616577,
      "learning_rate": 0.00026542665440151266,
      "loss": 0.4128,
      "step": 10380
    },
    {
      "gate_value": 0.19356413185596466,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 10380
    },
    {
      "grad_norm": 0.08593804389238358,
      "learning_rate": 0.00026534741787000176,
      "loss": 0.4051,
      "step": 10390
    },
    {
      "gate_value": 0.1938723474740982,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 10390
    },
    {
      "grad_norm": 0.11517023295164108,
      "learning_rate": 0.0002652681024996185,
      "loss": 0.4116,
      "step": 10400
    },
    {
      "gate_value": 0.19354356825351715,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 10400
    },
    {
      "grad_norm": 0.07598927617073059,
      "learning_rate": 0.000265188708344574,
      "loss": 0.3978,
      "step": 10410
    },
    {
      "gate_value": 0.19409018754959106,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 10410
    },
    {
      "grad_norm": 0.2558054029941559,
      "learning_rate": 0.00026510923545913355,
      "loss": 0.3929,
      "step": 10420
    },
    {
      "gate_value": 0.19420674443244934,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 10420
    },
    {
      "grad_norm": 0.09162849932909012,
      "learning_rate": 0.0002650296838976161,
      "loss": 0.4059,
      "step": 10430
    },
    {
      "gate_value": 0.19449542462825775,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 10430
    },
    {
      "grad_norm": 0.15429988503456116,
      "learning_rate": 0.00026495005371439433,
      "loss": 0.3918,
      "step": 10440
    },
    {
      "gate_value": 0.19442659616470337,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 10440
    },
    {
      "grad_norm": 0.2850008010864258,
      "learning_rate": 0.00026487034496389475,
      "loss": 0.4045,
      "step": 10450
    },
    {
      "gate_value": 0.19495940208435059,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 10450
    },
    {
      "grad_norm": 0.060812897980213165,
      "learning_rate": 0.00026479055770059755,
      "loss": 0.4023,
      "step": 10460
    },
    {
      "gate_value": 0.19487933814525604,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 10460
    },
    {
      "grad_norm": 0.14067623019218445,
      "learning_rate": 0.0002647106919790366,
      "loss": 0.4084,
      "step": 10470
    },
    {
      "gate_value": 0.19565919041633606,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 10470
    },
    {
      "grad_norm": 0.5360892415046692,
      "learning_rate": 0.00026463074785379936,
      "loss": 0.412,
      "step": 10480
    },
    {
      "gate_value": 0.19578729569911957,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 10480
    },
    {
      "grad_norm": 0.1606275886297226,
      "learning_rate": 0.00026455072537952685,
      "loss": 0.4003,
      "step": 10490
    },
    {
      "gate_value": 0.19515106081962585,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 10490
    },
    {
      "grad_norm": 0.1275758445262909,
      "learning_rate": 0.00026447062461091366,
      "loss": 0.4102,
      "step": 10500
    },
    {
      "gate_value": 0.19452591240406036,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 10500
    },
    {
      "grad_norm": 0.757901668548584,
      "learning_rate": 0.000264390445602708,
      "loss": 0.4105,
      "step": 10510
    },
    {
      "gate_value": 0.19385652244091034,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 10510
    },
    {
      "grad_norm": 0.23169085383415222,
      "learning_rate": 0.00026431018840971136,
      "loss": 0.393,
      "step": 10520
    },
    {
      "gate_value": 0.19386009871959686,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 10520
    },
    {
      "grad_norm": 0.09247046709060669,
      "learning_rate": 0.0002642298530867788,
      "loss": 0.3884,
      "step": 10530
    },
    {
      "gate_value": 0.1936485320329666,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 10530
    },
    {
      "grad_norm": 0.1239648163318634,
      "learning_rate": 0.0002641494396888188,
      "loss": 0.3954,
      "step": 10540
    },
    {
      "gate_value": 0.193940669298172,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 10540
    },
    {
      "grad_norm": 0.08281738311052322,
      "learning_rate": 0.00026406894827079317,
      "loss": 0.3819,
      "step": 10550
    },
    {
      "gate_value": 0.19448955357074738,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 10550
    },
    {
      "grad_norm": 0.2577625811100006,
      "learning_rate": 0.000263988378887717,
      "loss": 0.4143,
      "step": 10560
    },
    {
      "gate_value": 0.19554971158504486,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 10560
    },
    {
      "grad_norm": 0.1077817901968956,
      "learning_rate": 0.0002639077315946587,
      "loss": 0.4029,
      "step": 10570
    },
    {
      "gate_value": 0.19615165889263153,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 10570
    },
    {
      "grad_norm": 0.24295492470264435,
      "learning_rate": 0.0002638270064467399,
      "loss": 0.4179,
      "step": 10580
    },
    {
      "gate_value": 0.19590169191360474,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 10580
    },
    {
      "grad_norm": 0.1465950757265091,
      "learning_rate": 0.00026374620349913554,
      "loss": 0.3956,
      "step": 10590
    },
    {
      "gate_value": 0.196251779794693,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 10590
    },
    {
      "grad_norm": 0.1468675583600998,
      "learning_rate": 0.00026366532280707366,
      "loss": 0.3949,
      "step": 10600
    },
    {
      "gate_value": 0.19655568897724152,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 10600
    },
    {
      "grad_norm": 0.25693845748901367,
      "learning_rate": 0.00026358436442583546,
      "loss": 0.4107,
      "step": 10610
    },
    {
      "gate_value": 0.19645854830741882,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 10610
    },
    {
      "grad_norm": 0.6391911506652832,
      "learning_rate": 0.0002635033284107552,
      "loss": 0.3942,
      "step": 10620
    },
    {
      "gate_value": 0.195563405752182,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 10620
    },
    {
      "grad_norm": 0.08769239485263824,
      "learning_rate": 0.00026342221481722025,
      "loss": 0.4048,
      "step": 10630
    },
    {
      "gate_value": 0.1954774558544159,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 10630
    },
    {
      "grad_norm": 0.8492870926856995,
      "learning_rate": 0.00026334102370067093,
      "loss": 0.4027,
      "step": 10640
    },
    {
      "gate_value": 0.19497144222259521,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 10640
    },
    {
      "grad_norm": 0.1712755709886551,
      "learning_rate": 0.00026325975511660066,
      "loss": 0.4122,
      "step": 10650
    },
    {
      "gate_value": 0.19459974765777588,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 10650
    },
    {
      "grad_norm": 0.0769922062754631,
      "learning_rate": 0.00026317840912055577,
      "loss": 0.3907,
      "step": 10660
    },
    {
      "gate_value": 0.19528329372406006,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 10660
    },
    {
      "grad_norm": 0.09351008385419846,
      "learning_rate": 0.00026309698576813546,
      "loss": 0.3861,
      "step": 10670
    },
    {
      "gate_value": 0.19526593387126923,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 10670
    },
    {
      "grad_norm": 0.10859563201665878,
      "learning_rate": 0.00026301548511499187,
      "loss": 0.3891,
      "step": 10680
    },
    {
      "gate_value": 0.1956755518913269,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 10680
    },
    {
      "grad_norm": 0.09231425821781158,
      "learning_rate": 0.0002629339072168298,
      "loss": 0.4035,
      "step": 10690
    },
    {
      "gate_value": 0.19672146439552307,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 10690
    },
    {
      "grad_norm": 0.1549149751663208,
      "learning_rate": 0.00026285225212940703,
      "loss": 0.3961,
      "step": 10700
    },
    {
      "gate_value": 0.19741348922252655,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 10700
    },
    {
      "grad_norm": 0.13633489608764648,
      "learning_rate": 0.0002627705199085341,
      "loss": 0.3813,
      "step": 10710
    },
    {
      "gate_value": 0.1971900761127472,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 10710
    },
    {
      "grad_norm": 0.22589802742004395,
      "learning_rate": 0.0002626887106100742,
      "loss": 0.4179,
      "step": 10720
    },
    {
      "gate_value": 0.19635334610939026,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 10720
    },
    {
      "grad_norm": 0.14190012216567993,
      "learning_rate": 0.0002626068242899432,
      "loss": 0.418,
      "step": 10730
    },
    {
      "gate_value": 0.1962626874446869,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 10730
    },
    {
      "grad_norm": 0.19081388413906097,
      "learning_rate": 0.0002625248610041095,
      "loss": 0.3844,
      "step": 10740
    },
    {
      "gate_value": 0.19717535376548767,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 10740
    },
    {
      "grad_norm": 0.1003599539399147,
      "learning_rate": 0.0002624428208085945,
      "loss": 0.4141,
      "step": 10750
    },
    {
      "gate_value": 0.19717150926589966,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 10750
    },
    {
      "grad_norm": 0.9871111512184143,
      "learning_rate": 0.0002623607037594717,
      "loss": 0.4207,
      "step": 10760
    },
    {
      "gate_value": 0.19743303954601288,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 10760
    },
    {
      "grad_norm": 0.10571835190057755,
      "learning_rate": 0.0002622785099128673,
      "loss": 0.392,
      "step": 10770
    },
    {
      "gate_value": 0.197885200381279,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 10770
    },
    {
      "grad_norm": 1.096756100654602,
      "learning_rate": 0.0002621962393249602,
      "loss": 0.4089,
      "step": 10780
    },
    {
      "gate_value": 0.19861187040805817,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 10780
    },
    {
      "grad_norm": 4.666723251342773,
      "learning_rate": 0.0002621138920519814,
      "loss": 0.3928,
      "step": 10790
    },
    {
      "gate_value": 0.1985689103603363,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 10790
    },
    {
      "grad_norm": 0.28258785605430603,
      "learning_rate": 0.0002620314681502146,
      "loss": 0.4085,
      "step": 10800
    },
    {
      "gate_value": 0.19868691265583038,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 10800
    },
    {
      "grad_norm": 0.14337381720542908,
      "learning_rate": 0.00026194896767599567,
      "loss": 0.4042,
      "step": 10810
    },
    {
      "gate_value": 0.19859598577022552,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 10810
    },
    {
      "grad_norm": 0.19911415874958038,
      "learning_rate": 0.000261866390685713,
      "loss": 0.4034,
      "step": 10820
    },
    {
      "gate_value": 0.19771799445152283,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 10820
    },
    {
      "grad_norm": 0.13793812692165375,
      "learning_rate": 0.0002617837372358071,
      "loss": 0.3974,
      "step": 10830
    },
    {
      "gate_value": 0.19785933196544647,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 10830
    },
    {
      "grad_norm": 0.2408376932144165,
      "learning_rate": 0.00026170100738277086,
      "loss": 0.4094,
      "step": 10840
    },
    {
      "gate_value": 0.19815245270729065,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 10840
    },
    {
      "grad_norm": 5.759314060211182,
      "learning_rate": 0.0002616182011831493,
      "loss": 0.3989,
      "step": 10850
    },
    {
      "gate_value": 0.1983094960451126,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 10850
    },
    {
      "grad_norm": 0.5932163596153259,
      "learning_rate": 0.00026153531869353984,
      "loss": 0.3882,
      "step": 10860
    },
    {
      "gate_value": 0.19824165105819702,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 10860
    },
    {
      "grad_norm": 0.16285298764705658,
      "learning_rate": 0.0002614523599705917,
      "loss": 0.3957,
      "step": 10870
    },
    {
      "gate_value": 0.19801777601242065,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 10870
    },
    {
      "grad_norm": 0.10833615809679031,
      "learning_rate": 0.0002613693250710065,
      "loss": 0.4188,
      "step": 10880
    },
    {
      "gate_value": 0.1979728490114212,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 10880
    },
    {
      "grad_norm": 0.27149438858032227,
      "learning_rate": 0.00026128621405153773,
      "loss": 0.3914,
      "step": 10890
    },
    {
      "gate_value": 0.19862493872642517,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 10890
    },
    {
      "grad_norm": 0.18332615494728088,
      "learning_rate": 0.000261203026968991,
      "loss": 0.4009,
      "step": 10900
    },
    {
      "gate_value": 0.1991543471813202,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 10900
    },
    {
      "grad_norm": 0.2317703515291214,
      "learning_rate": 0.0002611197638802239,
      "loss": 0.3942,
      "step": 10910
    },
    {
      "gate_value": 0.19955024123191833,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 10910
    },
    {
      "grad_norm": 0.31707388162612915,
      "learning_rate": 0.000261036424842146,
      "loss": 0.39,
      "step": 10920
    },
    {
      "gate_value": 0.19923262298107147,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 10920
    },
    {
      "grad_norm": 0.15591078996658325,
      "learning_rate": 0.0002609530099117188,
      "loss": 0.3914,
      "step": 10930
    },
    {
      "gate_value": 0.1992308497428894,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 10930
    },
    {
      "grad_norm": 0.17535750567913055,
      "learning_rate": 0.0002608695191459555,
      "loss": 0.4069,
      "step": 10940
    },
    {
      "gate_value": 0.19915905594825745,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 10940
    },
    {
      "grad_norm": 0.18063399195671082,
      "learning_rate": 0.00026078595260192137,
      "loss": 0.3959,
      "step": 10950
    },
    {
      "gate_value": 0.19885124266147614,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 10950
    },
    {
      "grad_norm": 0.39659494161605835,
      "learning_rate": 0.00026070231033673317,
      "loss": 0.4133,
      "step": 10960
    },
    {
      "gate_value": 0.198845773935318,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 10960
    },
    {
      "grad_norm": 0.3130630552768707,
      "learning_rate": 0.00026061859240755975,
      "loss": 0.4056,
      "step": 10970
    },
    {
      "gate_value": 0.1990204006433487,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 10970
    },
    {
      "grad_norm": 0.2690387964248657,
      "learning_rate": 0.00026053479887162156,
      "loss": 0.3996,
      "step": 10980
    },
    {
      "gate_value": 0.19938315451145172,
      "icl_sequence_length": 56,
      "num_contexts": 3,
      "step": 10980
    },
    {
      "grad_norm": 0.519874632358551,
      "learning_rate": 0.0002604509297861905,
      "loss": 0.3991,
      "step": 10990
    },
    {
      "gate_value": 0.19984617829322815,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 10990
    },
    {
      "grad_norm": 0.3831429183483124,
      "learning_rate": 0.00026036698520859054,
      "loss": 0.4068,
      "step": 11000
    },
    {
      "gate_value": 0.1997131109237671,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 11000
    },
    {
      "grad_norm": 0.3186333477497101,
      "learning_rate": 0.0002602829651961968,
      "loss": 0.4008,
      "step": 11010
    },
    {
      "gate_value": 0.20006214082241058,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 11010
    },
    {
      "grad_norm": 0.6247459650039673,
      "learning_rate": 0.0002601988698064363,
      "loss": 0.3934,
      "step": 11020
    },
    {
      "gate_value": 0.2011529803276062,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 11020
    },
    {
      "grad_norm": 0.5456902384757996,
      "learning_rate": 0.0002601146990967874,
      "loss": 0.3996,
      "step": 11030
    },
    {
      "gate_value": 0.2017337530851364,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 11030
    },
    {
      "grad_norm": 0.6686020493507385,
      "learning_rate": 0.00026003045312477996,
      "loss": 0.41,
      "step": 11040
    },
    {
      "gate_value": 0.20241455733776093,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 11040
    },
    {
      "grad_norm": 0.33470675349235535,
      "learning_rate": 0.0002599461319479954,
      "loss": 0.4025,
      "step": 11050
    },
    {
      "gate_value": 0.20253930985927582,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 11050
    },
    {
      "grad_norm": 0.2852339446544647,
      "learning_rate": 0.0002598617356240663,
      "loss": 0.4074,
      "step": 11060
    },
    {
      "gate_value": 0.20202483236789703,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 11060
    },
    {
      "grad_norm": 3.269089460372925,
      "learning_rate": 0.00025977726421067687,
      "loss": 0.4201,
      "step": 11070
    },
    {
      "gate_value": 0.20169176161289215,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 11070
    },
    {
      "grad_norm": 0.577283501625061,
      "learning_rate": 0.0002596927177655625,
      "loss": 0.3903,
      "step": 11080
    },
    {
      "gate_value": 0.20160920917987823,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 11080
    },
    {
      "grad_norm": 0.5205775499343872,
      "learning_rate": 0.0002596080963465099,
      "loss": 0.4123,
      "step": 11090
    },
    {
      "gate_value": 0.2019902616739273,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 11090
    },
    {
      "grad_norm": 0.5158397555351257,
      "learning_rate": 0.00025952340001135694,
      "loss": 0.3929,
      "step": 11100
    },
    {
      "gate_value": 0.2014622986316681,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 11100
    },
    {
      "grad_norm": 0.3267214000225067,
      "learning_rate": 0.00025943862881799287,
      "loss": 0.3998,
      "step": 11110
    },
    {
      "gate_value": 0.201474130153656,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 11110
    },
    {
      "grad_norm": 0.4279942214488983,
      "learning_rate": 0.0002593537828243579,
      "loss": 0.3862,
      "step": 11120
    },
    {
      "gate_value": 0.20170217752456665,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 11120
    },
    {
      "grad_norm": 0.2724282741546631,
      "learning_rate": 0.0002592688620884435,
      "loss": 0.3939,
      "step": 11130
    },
    {
      "gate_value": 0.20227088034152985,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 11130
    },
    {
      "grad_norm": 0.24624592065811157,
      "learning_rate": 0.0002591838666682922,
      "loss": 0.3959,
      "step": 11140
    },
    {
      "gate_value": 0.20264950394630432,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 11140
    },
    {
      "grad_norm": 0.17288321256637573,
      "learning_rate": 0.0002590987966219976,
      "loss": 0.3931,
      "step": 11150
    },
    {
      "gate_value": 0.2034718543291092,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 11150
    },
    {
      "grad_norm": 0.12512151896953583,
      "learning_rate": 0.00025901365200770433,
      "loss": 0.3943,
      "step": 11160
    },
    {
      "gate_value": 0.2034679651260376,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 11160
    },
    {
      "grad_norm": 0.22475214302539825,
      "learning_rate": 0.00025892843288360777,
      "loss": 0.424,
      "step": 11170
    },
    {
      "gate_value": 0.2031925916671753,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 11170
    },
    {
      "grad_norm": 0.11940725892782211,
      "learning_rate": 0.0002588431393079544,
      "loss": 0.4023,
      "step": 11180
    },
    {
      "gate_value": 0.20355471968650818,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 11180
    },
    {
      "grad_norm": 0.16159369051456451,
      "learning_rate": 0.00025875777133904177,
      "loss": 0.4014,
      "step": 11190
    },
    {
      "gate_value": 0.20374581217765808,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 11190
    },
    {
      "grad_norm": 1.926444172859192,
      "learning_rate": 0.0002586723290352179,
      "loss": 0.3897,
      "step": 11200
    },
    {
      "gate_value": 0.2042069137096405,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 11200
    },
    {
      "grad_norm": 0.15389111638069153,
      "learning_rate": 0.0002585868124548819,
      "loss": 0.3937,
      "step": 11210
    },
    {
      "gate_value": 0.2046923041343689,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 11210
    },
    {
      "grad_norm": 0.4765077233314514,
      "learning_rate": 0.0002585012216564834,
      "loss": 0.4035,
      "step": 11220
    },
    {
      "gate_value": 0.2043992131948471,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 11220
    },
    {
      "grad_norm": 0.20746025443077087,
      "learning_rate": 0.00025841555669852307,
      "loss": 0.4032,
      "step": 11230
    },
    {
      "gate_value": 0.20465603470802307,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 11230
    },
    {
      "grad_norm": 2.6083579063415527,
      "learning_rate": 0.00025832981763955205,
      "loss": 0.391,
      "step": 11240
    },
    {
      "gate_value": 0.2050364762544632,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 11240
    },
    {
      "grad_norm": 0.14017367362976074,
      "learning_rate": 0.0002582440045381721,
      "loss": 0.3997,
      "step": 11250
    },
    {
      "gate_value": 0.20489144325256348,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 11250
    },
    {
      "grad_norm": 0.21358314156532288,
      "learning_rate": 0.0002581581174530357,
      "loss": 0.407,
      "step": 11260
    },
    {
      "gate_value": 0.20463384687900543,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 11260
    },
    {
      "grad_norm": 0.5433079600334167,
      "learning_rate": 0.000258072156442846,
      "loss": 0.398,
      "step": 11270
    },
    {
      "gate_value": 0.20472584664821625,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 11270
    },
    {
      "grad_norm": 0.2550680339336395,
      "learning_rate": 0.0002579861215663564,
      "loss": 0.4106,
      "step": 11280
    },
    {
      "gate_value": 0.2049986571073532,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 11280
    },
    {
      "grad_norm": 5.18765926361084,
      "learning_rate": 0.00025790001288237093,
      "loss": 0.4125,
      "step": 11290
    },
    {
      "gate_value": 0.20544874668121338,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 11290
    },
    {
      "grad_norm": 0.8696367740631104,
      "learning_rate": 0.00025781383044974415,
      "loss": 0.4079,
      "step": 11300
    },
    {
      "gate_value": 0.20583544671535492,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 11300
    },
    {
      "grad_norm": 0.540113091468811,
      "learning_rate": 0.0002577275743273808,
      "loss": 0.4053,
      "step": 11310
    },
    {
      "gate_value": 0.20618411898612976,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 11310
    },
    {
      "grad_norm": 0.2631917893886566,
      "learning_rate": 0.00025764124457423627,
      "loss": 0.4173,
      "step": 11320
    },
    {
      "gate_value": 0.2062470018863678,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 11320
    },
    {
      "grad_norm": 0.4374353885650635,
      "learning_rate": 0.00025755484124931604,
      "loss": 0.4042,
      "step": 11330
    },
    {
      "gate_value": 0.20691519975662231,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 11330
    },
    {
      "grad_norm": 0.11059547960758209,
      "learning_rate": 0.000257468364411676,
      "loss": 0.3971,
      "step": 11340
    },
    {
      "gate_value": 0.20722566545009613,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 11340
    },
    {
      "grad_norm": 1.345912218093872,
      "learning_rate": 0.0002573818141204222,
      "loss": 0.3874,
      "step": 11350
    },
    {
      "gate_value": 0.20745126903057098,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 11350
    },
    {
      "grad_norm": 0.13886769115924835,
      "learning_rate": 0.0002572951904347111,
      "loss": 0.4211,
      "step": 11360
    },
    {
      "gate_value": 0.20737117528915405,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 11360
    },
    {
      "grad_norm": 0.12669552862644196,
      "learning_rate": 0.000257208493413749,
      "loss": 0.4001,
      "step": 11370
    },
    {
      "gate_value": 0.2072693556547165,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 11370
    },
    {
      "grad_norm": 0.19020166993141174,
      "learning_rate": 0.00025712172311679254,
      "loss": 0.4082,
      "step": 11380
    },
    {
      "gate_value": 0.20699158310890198,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 11380
    },
    {
      "grad_norm": 1.0254766941070557,
      "learning_rate": 0.0002570348796031485,
      "loss": 0.4116,
      "step": 11390
    },
    {
      "gate_value": 0.2072145640850067,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 11390
    },
    {
      "grad_norm": 0.566098690032959,
      "learning_rate": 0.0002569479629321735,
      "loss": 0.3974,
      "step": 11400
    },
    {
      "gate_value": 0.2072124481201172,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 11400
    },
    {
      "grad_norm": 0.15408563613891602,
      "learning_rate": 0.0002568609731632743,
      "loss": 0.4004,
      "step": 11410
    },
    {
      "gate_value": 0.20725180208683014,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 11410
    },
    {
      "grad_norm": 0.1329866200685501,
      "learning_rate": 0.00025677391035590764,
      "loss": 0.4095,
      "step": 11420
    },
    {
      "gate_value": 0.20727354288101196,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 11420
    },
    {
      "grad_norm": 0.20672903954982758,
      "learning_rate": 0.00025668677456957997,
      "loss": 0.4015,
      "step": 11430
    },
    {
      "gate_value": 0.20741140842437744,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 11430
    },
    {
      "grad_norm": 2.841158390045166,
      "learning_rate": 0.00025659956586384795,
      "loss": 0.4014,
      "step": 11440
    },
    {
      "gate_value": 0.20806585252285004,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 11440
    },
    {
      "grad_norm": 0.2784873843193054,
      "learning_rate": 0.00025651228429831777,
      "loss": 0.3921,
      "step": 11450
    },
    {
      "gate_value": 0.2081623524427414,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 11450
    },
    {
      "grad_norm": 0.27490857243537903,
      "learning_rate": 0.00025642492993264564,
      "loss": 0.3905,
      "step": 11460
    },
    {
      "gate_value": 0.20837196707725525,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 11460
    },
    {
      "grad_norm": 0.17722657322883606,
      "learning_rate": 0.00025633750282653744,
      "loss": 0.4118,
      "step": 11470
    },
    {
      "gate_value": 0.20847874879837036,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 11470
    },
    {
      "grad_norm": 0.22108136117458344,
      "learning_rate": 0.0002562500030397488,
      "loss": 0.4031,
      "step": 11480
    },
    {
      "gate_value": 0.20883841812610626,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 11480
    },
    {
      "grad_norm": 7.156761646270752,
      "learning_rate": 0.0002561624306320849,
      "loss": 0.3949,
      "step": 11490
    },
    {
      "gate_value": 0.2090214192867279,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 11490
    },
    {
      "grad_norm": 1.761451005935669,
      "learning_rate": 0.0002560747856634007,
      "loss": 0.3936,
      "step": 11500
    },
    {
      "gate_value": 0.20910920202732086,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 11500
    },
    {
      "grad_norm": 0.25213727355003357,
      "learning_rate": 0.00025598706819360083,
      "loss": 0.379,
      "step": 11510
    },
    {
      "gate_value": 0.20905554294586182,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 11510
    },
    {
      "grad_norm": 0.11987407505512238,
      "learning_rate": 0.00025589927828263914,
      "loss": 0.4081,
      "step": 11520
    },
    {
      "gate_value": 0.20902156829833984,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 11520
    },
    {
      "grad_norm": 0.22566959261894226,
      "learning_rate": 0.00025581141599051937,
      "loss": 0.4216,
      "step": 11530
    },
    {
      "gate_value": 0.20962709188461304,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 11530
    },
    {
      "grad_norm": 0.4890899658203125,
      "learning_rate": 0.0002557234813772945,
      "loss": 0.4087,
      "step": 11540
    },
    {
      "gate_value": 0.2097298502922058,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 11540
    },
    {
      "grad_norm": 0.2041608691215515,
      "learning_rate": 0.00025563547450306703,
      "loss": 0.408,
      "step": 11550
    },
    {
      "gate_value": 0.20969891548156738,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 11550
    },
    {
      "grad_norm": 1.2819507122039795,
      "learning_rate": 0.0002555473954279888,
      "loss": 0.4097,
      "step": 11560
    },
    {
      "gate_value": 0.20980440080165863,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 11560
    },
    {
      "grad_norm": 0.22319957613945007,
      "learning_rate": 0.00025545924421226107,
      "loss": 0.4189,
      "step": 11570
    },
    {
      "gate_value": 0.20961406826972961,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 11570
    },
    {
      "grad_norm": 0.2318154275417328,
      "learning_rate": 0.00025537102091613434,
      "loss": 0.4078,
      "step": 11580
    },
    {
      "gate_value": 0.20991379022598267,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 11580
    },
    {
      "grad_norm": 0.2543465495109558,
      "learning_rate": 0.0002552827255999084,
      "loss": 0.3914,
      "step": 11590
    },
    {
      "gate_value": 0.21022337675094604,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 11590
    },
    {
      "grad_norm": 0.8110744953155518,
      "learning_rate": 0.00025519435832393225,
      "loss": 0.3871,
      "step": 11600
    },
    {
      "gate_value": 0.2104700356721878,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 11600
    },
    {
      "grad_norm": 0.1309548169374466,
      "learning_rate": 0.0002551059191486041,
      "loss": 0.4099,
      "step": 11610
    },
    {
      "gate_value": 0.21059533953666687,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 11610
    },
    {
      "grad_norm": 0.6616964936256409,
      "learning_rate": 0.00025501740813437137,
      "loss": 0.389,
      "step": 11620
    },
    {
      "gate_value": 0.21070751547813416,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 11620
    },
    {
      "grad_norm": 0.5464457869529724,
      "learning_rate": 0.00025492882534173037,
      "loss": 0.3947,
      "step": 11630
    },
    {
      "gate_value": 0.21091219782829285,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 11630
    },
    {
      "grad_norm": 8.245672225952148,
      "learning_rate": 0.0002548401708312267,
      "loss": 0.3988,
      "step": 11640
    },
    {
      "gate_value": 0.21120022237300873,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 11640
    },
    {
      "grad_norm": 0.2549119293689728,
      "learning_rate": 0.0002547514446634548,
      "loss": 0.3989,
      "step": 11650
    },
    {
      "gate_value": 0.2115693986415863,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 11650
    },
    {
      "grad_norm": 1.0499767065048218,
      "learning_rate": 0.00025466264689905826,
      "loss": 0.3905,
      "step": 11660
    },
    {
      "gate_value": 0.2116161584854126,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 11660
    },
    {
      "grad_norm": 0.4546162188053131,
      "learning_rate": 0.00025457377759872946,
      "loss": 0.4116,
      "step": 11670
    },
    {
      "gate_value": 0.2118997722864151,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 11670
    },
    {
      "grad_norm": 0.4323030710220337,
      "learning_rate": 0.00025448483682320976,
      "loss": 0.401,
      "step": 11680
    },
    {
      "gate_value": 0.21170073747634888,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 11680
    },
    {
      "grad_norm": 2.9688122272491455,
      "learning_rate": 0.00025439582463328937,
      "loss": 0.4011,
      "step": 11690
    },
    {
      "gate_value": 0.21142630279064178,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 11690
    },
    {
      "grad_norm": 0.3110814094543457,
      "learning_rate": 0.00025430674108980713,
      "loss": 0.3959,
      "step": 11700
    },
    {
      "gate_value": 0.21202224493026733,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 11700
    },
    {
      "grad_norm": 0.1604657620191574,
      "learning_rate": 0.000254217586253651,
      "loss": 0.3987,
      "step": 11710
    },
    {
      "gate_value": 0.21249915659427643,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 11710
    },
    {
      "grad_norm": 0.7040252089500427,
      "learning_rate": 0.0002541283601857573,
      "loss": 0.3869,
      "step": 11720
    },
    {
      "gate_value": 0.21285098791122437,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 11720
    },
    {
      "grad_norm": 0.8333344459533691,
      "learning_rate": 0.00025403906294711135,
      "loss": 0.3844,
      "step": 11730
    },
    {
      "gate_value": 0.21329067647457123,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 11730
    },
    {
      "grad_norm": 0.3842971622943878,
      "learning_rate": 0.0002539496945987469,
      "loss": 0.3995,
      "step": 11740
    },
    {
      "gate_value": 0.21374772489070892,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 11740
    },
    {
      "grad_norm": 0.16032423079013824,
      "learning_rate": 0.00025386025520174636,
      "loss": 0.3798,
      "step": 11750
    },
    {
      "gate_value": 0.21357718110084534,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 11750
    },
    {
      "grad_norm": 12.013026237487793,
      "learning_rate": 0.0002537707448172407,
      "loss": 0.4131,
      "step": 11760
    },
    {
      "gate_value": 0.21378053724765778,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 11760
    },
    {
      "grad_norm": 0.17190168797969818,
      "learning_rate": 0.0002536811635064095,
      "loss": 0.3908,
      "step": 11770
    },
    {
      "gate_value": 0.21405042707920074,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 11770
    },
    {
      "grad_norm": 0.15969933569431305,
      "learning_rate": 0.00025359151133048073,
      "loss": 0.3995,
      "step": 11780
    },
    {
      "gate_value": 0.2140841782093048,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 11780
    },
    {
      "grad_norm": 0.2060869038105011,
      "learning_rate": 0.0002535017883507307,
      "loss": 0.4008,
      "step": 11790
    },
    {
      "gate_value": 0.21388085186481476,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 11790
    },
    {
      "grad_norm": 0.09752976894378662,
      "learning_rate": 0.0002534119946284844,
      "loss": 0.3981,
      "step": 11800
    },
    {
      "gate_value": 0.21341773867607117,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 11800
    },
    {
      "grad_norm": 0.21220721304416656,
      "learning_rate": 0.00025332213022511476,
      "loss": 0.4095,
      "step": 11810
    },
    {
      "gate_value": 0.2137778401374817,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 11810
    },
    {
      "grad_norm": 9.720185279846191,
      "learning_rate": 0.00025323219520204343,
      "loss": 0.3894,
      "step": 11820
    },
    {
      "gate_value": 0.21382419764995575,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 11820
    },
    {
      "grad_norm": 0.1416504681110382,
      "learning_rate": 0.00025314218962074015,
      "loss": 0.3947,
      "step": 11830
    },
    {
      "gate_value": 0.21372593939304352,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 11830
    },
    {
      "grad_norm": 0.2012588381767273,
      "learning_rate": 0.0002530521135427228,
      "loss": 0.4004,
      "step": 11840
    },
    {
      "gate_value": 0.21327029168605804,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 11840
    },
    {
      "grad_norm": 0.40130773186683655,
      "learning_rate": 0.0002529619670295575,
      "loss": 0.4058,
      "step": 11850
    },
    {
      "gate_value": 0.21350201964378357,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 11850
    },
    {
      "grad_norm": 3.3934764862060547,
      "learning_rate": 0.0002528717501428587,
      "loss": 0.4031,
      "step": 11860
    },
    {
      "gate_value": 0.21378423273563385,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 11860
    },
    {
      "grad_norm": 0.17320500314235687,
      "learning_rate": 0.0002527814629442887,
      "loss": 0.389,
      "step": 11870
    },
    {
      "gate_value": 0.2139461636543274,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 11870
    },
    {
      "grad_norm": 0.15953584015369415,
      "learning_rate": 0.0002526911054955579,
      "loss": 0.3925,
      "step": 11880
    },
    {
      "gate_value": 0.21486397087574005,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 11880
    },
    {
      "grad_norm": 0.1063394844532013,
      "learning_rate": 0.00025260067785842484,
      "loss": 0.4014,
      "step": 11890
    },
    {
      "gate_value": 0.21554067730903625,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 11890
    },
    {
      "grad_norm": 0.2693503499031067,
      "learning_rate": 0.00025251018009469594,
      "loss": 0.4011,
      "step": 11900
    },
    {
      "gate_value": 0.21639026701450348,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 11900
    },
    {
      "grad_norm": 0.45159634947776794,
      "learning_rate": 0.00025241961226622555,
      "loss": 0.3985,
      "step": 11910
    },
    {
      "gate_value": 0.21661345660686493,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 11910
    },
    {
      "grad_norm": 0.23357713222503662,
      "learning_rate": 0.00025232897443491596,
      "loss": 0.396,
      "step": 11920
    },
    {
      "gate_value": 0.21685311198234558,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 11920
    },
    {
      "grad_norm": 0.15322349965572357,
      "learning_rate": 0.0002522382666627172,
      "loss": 0.4046,
      "step": 11930
    },
    {
      "gate_value": 0.21700112521648407,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 11930
    },
    {
      "grad_norm": 0.12242075055837631,
      "learning_rate": 0.00025214748901162724,
      "loss": 0.3988,
      "step": 11940
    },
    {
      "gate_value": 0.21745328605175018,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 11940
    },
    {
      "grad_norm": 0.18854005634784698,
      "learning_rate": 0.0002520566415436917,
      "loss": 0.3967,
      "step": 11950
    },
    {
      "gate_value": 0.21726283431053162,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 11950
    },
    {
      "grad_norm": 0.16896556317806244,
      "learning_rate": 0.00025196572432100404,
      "loss": 0.4054,
      "step": 11960
    },
    {
      "gate_value": 0.2172841727733612,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 11960
    },
    {
      "grad_norm": 0.15144844353199005,
      "learning_rate": 0.0002518747374057053,
      "loss": 0.415,
      "step": 11970
    },
    {
      "gate_value": 0.21710263192653656,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 11970
    },
    {
      "grad_norm": 0.29553645849227905,
      "learning_rate": 0.00025178368085998417,
      "loss": 0.4001,
      "step": 11980
    },
    {
      "gate_value": 0.21663376688957214,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 11980
    },
    {
      "grad_norm": 0.24101394414901733,
      "learning_rate": 0.0002516925547460769,
      "loss": 0.3992,
      "step": 11990
    },
    {
      "gate_value": 0.21583116054534912,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 11990
    },
    {
      "grad_norm": 3.6199612617492676,
      "learning_rate": 0.00025160135912626736,
      "loss": 0.4078,
      "step": 12000
    },
    {
      "gate_value": 0.2161846160888672,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 12000
    },
    {
      "grad_norm": 0.21872039139270782,
      "learning_rate": 0.0002515100940628869,
      "loss": 0.4022,
      "step": 12010
    },
    {
      "gate_value": 0.2160039097070694,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 12010
    },
    {
      "grad_norm": 0.23018062114715576,
      "learning_rate": 0.0002514187596183144,
      "loss": 0.4129,
      "step": 12020
    },
    {
      "gate_value": 0.21576650440692902,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 12020
    },
    {
      "grad_norm": 0.10655272006988525,
      "learning_rate": 0.00025132735585497594,
      "loss": 0.4108,
      "step": 12030
    },
    {
      "gate_value": 0.2153758853673935,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 12030
    },
    {
      "grad_norm": 0.13647131621837616,
      "learning_rate": 0.00025123588283534524,
      "loss": 0.4082,
      "step": 12040
    },
    {
      "gate_value": 0.21577800810337067,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 12040
    },
    {
      "grad_norm": 0.14735141396522522,
      "learning_rate": 0.0002511443406219432,
      "loss": 0.4017,
      "step": 12050
    },
    {
      "gate_value": 0.2160462588071823,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 12050
    },
    {
      "grad_norm": 0.10175333172082901,
      "learning_rate": 0.00025105272927733815,
      "loss": 0.4152,
      "step": 12060
    },
    {
      "gate_value": 0.21647045016288757,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 12060
    },
    {
      "grad_norm": 0.15681752562522888,
      "learning_rate": 0.00025096104886414543,
      "loss": 0.4076,
      "step": 12070
    },
    {
      "gate_value": 0.21629151701927185,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 12070
    },
    {
      "grad_norm": 0.22178079187870026,
      "learning_rate": 0.0002508692994450279,
      "loss": 0.4194,
      "step": 12080
    },
    {
      "gate_value": 0.2159765064716339,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 12080
    },
    {
      "grad_norm": 0.3628661334514618,
      "learning_rate": 0.00025077748108269526,
      "loss": 0.3933,
      "step": 12090
    },
    {
      "gate_value": 0.21603453159332275,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 12090
    },
    {
      "grad_norm": 2.652317523956299,
      "learning_rate": 0.0002506855938399046,
      "loss": 0.4035,
      "step": 12100
    },
    {
      "gate_value": 0.21626484394073486,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 12100
    },
    {
      "grad_norm": 0.2516089379787445,
      "learning_rate": 0.00025059363777946,
      "loss": 0.3986,
      "step": 12110
    },
    {
      "gate_value": 0.2169078141450882,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 12110
    },
    {
      "grad_norm": 0.11790217459201813,
      "learning_rate": 0.0002505016129642125,
      "loss": 0.4076,
      "step": 12120
    },
    {
      "gate_value": 0.21709206700325012,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 12120
    },
    {
      "grad_norm": 0.11641325056552887,
      "learning_rate": 0.00025040951945706015,
      "loss": 0.4049,
      "step": 12130
    },
    {
      "gate_value": 0.21792539954185486,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 12130
    },
    {
      "grad_norm": 1.3086296319961548,
      "learning_rate": 0.0002503173573209481,
      "loss": 0.4068,
      "step": 12140
    },
    {
      "gate_value": 0.21753643453121185,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 12140
    },
    {
      "grad_norm": 0.12861375510692596,
      "learning_rate": 0.0002502251266188683,
      "loss": 0.383,
      "step": 12150
    },
    {
      "gate_value": 0.21757358312606812,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 12150
    },
    {
      "grad_norm": 0.15611277520656586,
      "learning_rate": 0.00025013282741385946,
      "loss": 0.4115,
      "step": 12160
    },
    {
      "gate_value": 0.21742816269397736,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 12160
    },
    {
      "grad_norm": 6.722140789031982,
      "learning_rate": 0.0002500404597690073,
      "loss": 0.4,
      "step": 12170
    },
    {
      "gate_value": 0.2178168147802353,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 12170
    },
    {
      "grad_norm": 0.666631281375885,
      "learning_rate": 0.00024994802374744417,
      "loss": 0.3926,
      "step": 12180
    },
    {
      "gate_value": 0.21850833296775818,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 12180
    },
    {
      "grad_norm": 0.7619335651397705,
      "learning_rate": 0.00024985551941234934,
      "loss": 0.4156,
      "step": 12190
    },
    {
      "gate_value": 0.21924303472042084,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 12190
    },
    {
      "grad_norm": 0.20830854773521423,
      "learning_rate": 0.00024976294682694855,
      "loss": 0.3696,
      "step": 12200
    },
    {
      "gate_value": 0.21952006220817566,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 12200
    },
    {
      "grad_norm": 0.15673328936100006,
      "learning_rate": 0.00024967030605451426,
      "loss": 0.4022,
      "step": 12210
    },
    {
      "gate_value": 0.2195618897676468,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 12210
    },
    {
      "grad_norm": 0.19766902923583984,
      "learning_rate": 0.0002495775971583657,
      "loss": 0.3935,
      "step": 12220
    },
    {
      "gate_value": 0.2193445861339569,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 12220
    },
    {
      "grad_norm": 0.25193437933921814,
      "learning_rate": 0.0002494848202018684,
      "loss": 0.3898,
      "step": 12230
    },
    {
      "gate_value": 0.21949507296085358,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 12230
    },
    {
      "grad_norm": 0.16779395937919617,
      "learning_rate": 0.0002493919752484346,
      "loss": 0.3802,
      "step": 12240
    },
    {
      "gate_value": 0.2198145091533661,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 12240
    },
    {
      "grad_norm": 0.22382983565330505,
      "learning_rate": 0.0002492990623615229,
      "loss": 0.382,
      "step": 12250
    },
    {
      "gate_value": 0.22057458758354187,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 12250
    },
    {
      "grad_norm": 0.19160112738609314,
      "learning_rate": 0.0002492060816046384,
      "loss": 0.4112,
      "step": 12260
    },
    {
      "gate_value": 0.22058996558189392,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 12260
    },
    {
      "grad_norm": 0.20744547247886658,
      "learning_rate": 0.00024911303304133255,
      "loss": 0.4119,
      "step": 12270
    },
    {
      "gate_value": 0.2202199250459671,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 12270
    },
    {
      "grad_norm": 1.4961268901824951,
      "learning_rate": 0.0002490199167352033,
      "loss": 0.4039,
      "step": 12280
    },
    {
      "gate_value": 0.2205304652452469,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 12280
    },
    {
      "grad_norm": 0.15413504838943481,
      "learning_rate": 0.0002489267327498946,
      "loss": 0.4038,
      "step": 12290
    },
    {
      "gate_value": 0.2205338329076767,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 12290
    },
    {
      "grad_norm": 0.41113772988319397,
      "learning_rate": 0.00024883348114909686,
      "loss": 0.402,
      "step": 12300
    },
    {
      "gate_value": 0.22082765400409698,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 12300
    },
    {
      "grad_norm": 0.1423916518688202,
      "learning_rate": 0.0002487401619965467,
      "loss": 0.4016,
      "step": 12310
    },
    {
      "gate_value": 0.2202834039926529,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 12310
    },
    {
      "grad_norm": 0.292427122592926,
      "learning_rate": 0.000248646775356027,
      "loss": 0.4115,
      "step": 12320
    },
    {
      "gate_value": 0.21988192200660706,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 12320
    },
    {
      "grad_norm": 0.33503174781799316,
      "learning_rate": 0.0002485533212913664,
      "loss": 0.3795,
      "step": 12330
    },
    {
      "gate_value": 0.2196183055639267,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 12330
    },
    {
      "grad_norm": 0.14144979417324066,
      "learning_rate": 0.0002484597998664401,
      "loss": 0.3932,
      "step": 12340
    },
    {
      "gate_value": 0.2193087488412857,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 12340
    },
    {
      "grad_norm": 0.42851975560188293,
      "learning_rate": 0.00024836621114516887,
      "loss": 0.3965,
      "step": 12350
    },
    {
      "gate_value": 0.21940088272094727,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 12350
    },
    {
      "grad_norm": 1.2762000560760498,
      "learning_rate": 0.00024827255519152,
      "loss": 0.3975,
      "step": 12360
    },
    {
      "gate_value": 0.2194403111934662,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 12360
    },
    {
      "grad_norm": 0.374787837266922,
      "learning_rate": 0.0002481788320695062,
      "loss": 0.3782,
      "step": 12370
    },
    {
      "gate_value": 0.21983832120895386,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 12370
    },
    {
      "grad_norm": 0.21959860622882843,
      "learning_rate": 0.0002480850418431865,
      "loss": 0.3915,
      "step": 12380
    },
    {
      "gate_value": 0.22001340985298157,
      "icl_sequence_length": 96,
      "num_contexts": 3,
      "step": 12380
    },
    {
      "grad_norm": 0.1599218249320984,
      "learning_rate": 0.0002479911845766656,
      "loss": 0.3897,
      "step": 12390
    },
    {
      "gate_value": 0.22048789262771606,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 12390
    },
    {
      "grad_norm": 0.7569872736930847,
      "learning_rate": 0.00024789726033409403,
      "loss": 0.3922,
      "step": 12400
    },
    {
      "gate_value": 0.22119253873825073,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 12400
    },
    {
      "grad_norm": 1.3088319301605225,
      "learning_rate": 0.0002478032691796682,
      "loss": 0.3877,
      "step": 12410
    },
    {
      "gate_value": 0.222129225730896,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 12410
    },
    {
      "grad_norm": 0.1527448296546936,
      "learning_rate": 0.00024770921117763,
      "loss": 0.3947,
      "step": 12420
    },
    {
      "gate_value": 0.22193755209445953,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 12420
    },
    {
      "grad_norm": 0.16166727244853973,
      "learning_rate": 0.0002476150863922674,
      "loss": 0.4036,
      "step": 12430
    },
    {
      "gate_value": 0.22141483426094055,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 12430
    },
    {
      "grad_norm": 0.2394699603319168,
      "learning_rate": 0.00024752089488791365,
      "loss": 0.3781,
      "step": 12440
    },
    {
      "gate_value": 0.22163152694702148,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 12440
    },
    {
      "grad_norm": 0.7809823155403137,
      "learning_rate": 0.00024742663672894786,
      "loss": 0.4088,
      "step": 12450
    },
    {
      "gate_value": 0.2219049334526062,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 12450
    },
    {
      "grad_norm": 0.5379728674888611,
      "learning_rate": 0.00024733231197979444,
      "loss": 0.3875,
      "step": 12460
    },
    {
      "gate_value": 0.22181984782218933,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 12460
    },
    {
      "grad_norm": 0.2976929247379303,
      "learning_rate": 0.0002472379207049237,
      "loss": 0.3876,
      "step": 12470
    },
    {
      "gate_value": 0.22250066697597504,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 12470
    },
    {
      "grad_norm": 0.16642062366008759,
      "learning_rate": 0.000247143462968851,
      "loss": 0.4015,
      "step": 12480
    },
    {
      "gate_value": 0.2227054238319397,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 12480
    },
    {
      "grad_norm": 0.15899008512496948,
      "learning_rate": 0.00024704893883613734,
      "loss": 0.3878,
      "step": 12490
    },
    {
      "gate_value": 0.22255997359752655,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 12490
    },
    {
      "grad_norm": 0.8335736989974976,
      "learning_rate": 0.0002469543483713891,
      "loss": 0.3893,
      "step": 12500
    },
    {
      "gate_value": 0.22294364869594574,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 12500
    },
    {
      "grad_norm": 0.48398110270500183,
      "learning_rate": 0.000246859691639258,
      "loss": 0.4259,
      "step": 12510
    },
    {
      "gate_value": 0.2238253355026245,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 12510
    },
    {
      "grad_norm": 0.5804698467254639,
      "learning_rate": 0.00024676496870444105,
      "loss": 0.408,
      "step": 12520
    },
    {
      "gate_value": 0.22372859716415405,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 12520
    },
    {
      "grad_norm": 0.1858420968055725,
      "learning_rate": 0.0002466701796316804,
      "loss": 0.3909,
      "step": 12530
    },
    {
      "gate_value": 0.22408469021320343,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 12530
    },
    {
      "grad_norm": 0.28055644035339355,
      "learning_rate": 0.00024657532448576347,
      "loss": 0.3836,
      "step": 12540
    },
    {
      "gate_value": 0.22420431673526764,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 12540
    },
    {
      "grad_norm": 0.36310645937919617,
      "learning_rate": 0.00024648040333152295,
      "loss": 0.3864,
      "step": 12550
    },
    {
      "gate_value": 0.2246132344007492,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 12550
    },
    {
      "grad_norm": 0.22087214887142181,
      "learning_rate": 0.00024638541623383647,
      "loss": 0.3993,
      "step": 12560
    },
    {
      "gate_value": 0.22429001331329346,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 12560
    },
    {
      "grad_norm": 0.13360650837421417,
      "learning_rate": 0.00024629036325762686,
      "loss": 0.3914,
      "step": 12570
    },
    {
      "gate_value": 0.223907932639122,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 12570
    },
    {
      "grad_norm": 0.27683377265930176,
      "learning_rate": 0.00024619524446786197,
      "loss": 0.3823,
      "step": 12580
    },
    {
      "gate_value": 0.22451086342334747,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 12580
    },
    {
      "grad_norm": 0.11729135364294052,
      "learning_rate": 0.0002461000599295545,
      "loss": 0.3952,
      "step": 12590
    },
    {
      "gate_value": 0.2250521332025528,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 12590
    },
    {
      "grad_norm": 0.3065733015537262,
      "learning_rate": 0.00024600480970776224,
      "loss": 0.3835,
      "step": 12600
    },
    {
      "gate_value": 0.22507460415363312,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 12600
    },
    {
      "grad_norm": 0.2557946741580963,
      "learning_rate": 0.0002459094938675879,
      "loss": 0.4028,
      "step": 12610
    },
    {
      "gate_value": 0.2248997837305069,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 12610
    },
    {
      "grad_norm": 0.1608905792236328,
      "learning_rate": 0.0002458141124741788,
      "loss": 0.3926,
      "step": 12620
    },
    {
      "gate_value": 0.2250816822052002,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 12620
    },
    {
      "grad_norm": 0.5290404558181763,
      "learning_rate": 0.00024571866559272733,
      "loss": 0.3991,
      "step": 12630
    },
    {
      "gate_value": 0.22497162222862244,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 12630
    },
    {
      "grad_norm": 0.7313876748085022,
      "learning_rate": 0.00024562315328847045,
      "loss": 0.4006,
      "step": 12640
    },
    {
      "gate_value": 0.22498567402362823,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 12640
    },
    {
      "grad_norm": 0.6579257845878601,
      "learning_rate": 0.00024552757562669,
      "loss": 0.4221,
      "step": 12650
    },
    {
      "gate_value": 0.22542236745357513,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 12650
    },
    {
      "grad_norm": 0.2773542106151581,
      "learning_rate": 0.0002454319326727124,
      "loss": 0.4127,
      "step": 12660
    },
    {
      "gate_value": 0.22569654881954193,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 12660
    },
    {
      "grad_norm": 0.6629268527030945,
      "learning_rate": 0.00024533622449190865,
      "loss": 0.3972,
      "step": 12670
    },
    {
      "gate_value": 0.22591601312160492,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 12670
    },
    {
      "grad_norm": 0.5978025197982788,
      "learning_rate": 0.00024524045114969446,
      "loss": 0.4055,
      "step": 12680
    },
    {
      "gate_value": 0.225976824760437,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 12680
    },
    {
      "grad_norm": 0.41180774569511414,
      "learning_rate": 0.00024514461271153,
      "loss": 0.3886,
      "step": 12690
    },
    {
      "gate_value": 0.22621405124664307,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 12690
    },
    {
      "grad_norm": 0.18459030985832214,
      "learning_rate": 0.0002450487092429198,
      "loss": 0.3793,
      "step": 12700
    },
    {
      "gate_value": 0.22648906707763672,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 12700
    },
    {
      "grad_norm": 0.16683566570281982,
      "learning_rate": 0.0002449527408094132,
      "loss": 0.391,
      "step": 12710
    },
    {
      "gate_value": 0.22698599100112915,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 12710
    },
    {
      "grad_norm": 0.18806658685207367,
      "learning_rate": 0.0002448567074766035,
      "loss": 0.41,
      "step": 12720
    },
    {
      "gate_value": 0.2272697538137436,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 12720
    },
    {
      "grad_norm": 0.189344123005867,
      "learning_rate": 0.00024476060931012884,
      "loss": 0.3958,
      "step": 12730
    },
    {
      "gate_value": 0.22789913415908813,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 12730
    },
    {
      "grad_norm": 0.14973287284374237,
      "learning_rate": 0.00024466444637567114,
      "loss": 0.3955,
      "step": 12740
    },
    {
      "gate_value": 0.22827696800231934,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 12740
    },
    {
      "grad_norm": 0.16878141462802887,
      "learning_rate": 0.000244568218738957,
      "loss": 0.3812,
      "step": 12750
    },
    {
      "gate_value": 0.228811576962471,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 12750
    },
    {
      "grad_norm": 0.1252906322479248,
      "learning_rate": 0.0002444719264657571,
      "loss": 0.3787,
      "step": 12760
    },
    {
      "gate_value": 0.22924955189228058,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 12760
    },
    {
      "grad_norm": 0.16972161829471588,
      "learning_rate": 0.0002443755696218862,
      "loss": 0.4027,
      "step": 12770
    },
    {
      "gate_value": 0.22910422086715698,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 12770
    },
    {
      "grad_norm": 0.4904925227165222,
      "learning_rate": 0.0002442791482732034,
      "loss": 0.3838,
      "step": 12780
    },
    {
      "gate_value": 0.2290533185005188,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 12780
    },
    {
      "grad_norm": 0.6321537494659424,
      "learning_rate": 0.0002441826624856118,
      "loss": 0.3995,
      "step": 12790
    },
    {
      "gate_value": 0.22894862294197083,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 12790
    },
    {
      "grad_norm": 3.330693006515503,
      "learning_rate": 0.0002440861123250585,
      "loss": 0.4076,
      "step": 12800
    },
    {
      "gate_value": 0.2295045107603073,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 12800
    },
    {
      "grad_norm": 0.434231698513031,
      "learning_rate": 0.00024398949785753453,
      "loss": 0.4006,
      "step": 12810
    },
    {
      "gate_value": 0.2292521893978119,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 12810
    },
    {
      "grad_norm": 0.20848998427391052,
      "learning_rate": 0.00024389281914907507,
      "loss": 0.3867,
      "step": 12820
    },
    {
      "gate_value": 0.22858992218971252,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 12820
    },
    {
      "grad_norm": 0.12240742146968842,
      "learning_rate": 0.00024379607626575912,
      "loss": 0.3894,
      "step": 12830
    },
    {
      "gate_value": 0.22902952134609222,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 12830
    },
    {
      "grad_norm": 0.32847192883491516,
      "learning_rate": 0.00024369926927370945,
      "loss": 0.3937,
      "step": 12840
    },
    {
      "gate_value": 0.22911740839481354,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 12840
    },
    {
      "grad_norm": 1.2735588550567627,
      "learning_rate": 0.0002436023982390928,
      "loss": 0.4043,
      "step": 12850
    },
    {
      "gate_value": 0.22957158088684082,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 12850
    },
    {
      "grad_norm": 0.12431875616312027,
      "learning_rate": 0.0002435054632281195,
      "loss": 0.3846,
      "step": 12860
    },
    {
      "gate_value": 0.2296081781387329,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 12860
    },
    {
      "grad_norm": 0.23901695013046265,
      "learning_rate": 0.00024340846430704382,
      "loss": 0.3835,
      "step": 12870
    },
    {
      "gate_value": 0.2295922040939331,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 12870
    },
    {
      "grad_norm": 0.24101102352142334,
      "learning_rate": 0.00024331140154216358,
      "loss": 0.3951,
      "step": 12880
    },
    {
      "gate_value": 0.22967174649238586,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 12880
    },
    {
      "grad_norm": 0.16511069238185883,
      "learning_rate": 0.00024321427499982026,
      "loss": 0.4028,
      "step": 12890
    },
    {
      "gate_value": 0.2295798808336258,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 12890
    },
    {
      "grad_norm": 0.40296611189842224,
      "learning_rate": 0.00024311708474639891,
      "loss": 0.4025,
      "step": 12900
    },
    {
      "gate_value": 0.22963640093803406,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 12900
    },
    {
      "grad_norm": 0.7211963534355164,
      "learning_rate": 0.00024301983084832826,
      "loss": 0.3905,
      "step": 12910
    },
    {
      "gate_value": 0.2297767698764801,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 12910
    },
    {
      "grad_norm": 0.2979947030544281,
      "learning_rate": 0.00024292251337208027,
      "loss": 0.4067,
      "step": 12920
    },
    {
      "gate_value": 0.22896960377693176,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 12920
    },
    {
      "grad_norm": 0.4544394016265869,
      "learning_rate": 0.0002428251323841706,
      "loss": 0.4026,
      "step": 12930
    },
    {
      "gate_value": 0.22909384965896606,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 12930
    },
    {
      "grad_norm": 0.4741581380367279,
      "learning_rate": 0.0002427276879511583,
      "loss": 0.4034,
      "step": 12940
    },
    {
      "gate_value": 0.22910211980342865,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 12940
    },
    {
      "grad_norm": 0.6591212749481201,
      "learning_rate": 0.00024263018013964558,
      "loss": 0.3913,
      "step": 12950
    },
    {
      "gate_value": 0.2297748625278473,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 12950
    },
    {
      "grad_norm": 1.3386341333389282,
      "learning_rate": 0.0002425326090162782,
      "loss": 0.3971,
      "step": 12960
    },
    {
      "gate_value": 0.22955504059791565,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 12960
    },
    {
      "grad_norm": 0.5584853887557983,
      "learning_rate": 0.00024243497464774514,
      "loss": 0.3836,
      "step": 12970
    },
    {
      "gate_value": 0.22991710901260376,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 12970
    },
    {
      "grad_norm": 1.5411170721054077,
      "learning_rate": 0.00024233727710077843,
      "loss": 0.4148,
      "step": 12980
    },
    {
      "gate_value": 0.23056305944919586,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 12980
    },
    {
      "grad_norm": 1.385258436203003,
      "learning_rate": 0.00024223951644215358,
      "loss": 0.3851,
      "step": 12990
    },
    {
      "gate_value": 0.23073600232601166,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 12990
    },
    {
      "grad_norm": 4.856663227081299,
      "learning_rate": 0.000242141692738689,
      "loss": 0.3942,
      "step": 13000
    },
    {
      "gate_value": 0.2309921234846115,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 13000
    },
    {
      "grad_norm": 1.7304902076721191,
      "learning_rate": 0.00024204380605724626,
      "loss": 0.397,
      "step": 13010
    },
    {
      "gate_value": 0.23110632598400116,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 13010
    },
    {
      "grad_norm": 4.753711223602295,
      "learning_rate": 0.00024194585646473,
      "loss": 0.394,
      "step": 13020
    },
    {
      "gate_value": 0.2315414547920227,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 13020
    },
    {
      "grad_norm": 3.1984050273895264,
      "learning_rate": 0.00024184784402808785,
      "loss": 0.3844,
      "step": 13030
    },
    {
      "gate_value": 0.2317209541797638,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 13030
    },
    {
      "grad_norm": 4.3004302978515625,
      "learning_rate": 0.0002417497688143104,
      "loss": 0.3802,
      "step": 13040
    },
    {
      "gate_value": 0.23195067048072815,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 13040
    },
    {
      "grad_norm": 9.027650833129883,
      "learning_rate": 0.0002416516308904311,
      "loss": 0.403,
      "step": 13050
    },
    {
      "gate_value": 0.2321426421403885,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 13050
    },
    {
      "grad_norm": 3.0218982696533203,
      "learning_rate": 0.00024155343032352628,
      "loss": 0.3972,
      "step": 13060
    },
    {
      "gate_value": 0.23222358524799347,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 13060
    },
    {
      "grad_norm": 2.7752718925476074,
      "learning_rate": 0.00024145516718071517,
      "loss": 0.3899,
      "step": 13070
    },
    {
      "gate_value": 0.2324615716934204,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 13070
    },
    {
      "grad_norm": 2.805215358734131,
      "learning_rate": 0.00024135684152915964,
      "loss": 0.3856,
      "step": 13080
    },
    {
      "gate_value": 0.2327684760093689,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 13080
    },
    {
      "grad_norm": 5.619499683380127,
      "learning_rate": 0.0002412584534360644,
      "loss": 0.3887,
      "step": 13090
    },
    {
      "gate_value": 0.23298239707946777,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 13090
    },
    {
      "grad_norm": 8.072099685668945,
      "learning_rate": 0.0002411600029686767,
      "loss": 0.3859,
      "step": 13100
    },
    {
      "gate_value": 0.23314546048641205,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 13100
    },
    {
      "grad_norm": 6.481606960296631,
      "learning_rate": 0.00024106149019428657,
      "loss": 0.4038,
      "step": 13110
    },
    {
      "gate_value": 0.23341412842273712,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 13110
    },
    {
      "grad_norm": 77.76628875732422,
      "learning_rate": 0.0002409629151802266,
      "loss": 0.393,
      "step": 13120
    },
    {
      "gate_value": 0.23348568379878998,
      "icl_sequence_length": 58,
      "num_contexts": 3,
      "step": 13120
    },
    {
      "grad_norm": 2.652857780456543,
      "learning_rate": 0.00024086427799387182,
      "loss": 0.3835,
      "step": 13130
    },
    {
      "gate_value": 0.23361773788928986,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 13130
    },
    {
      "grad_norm": 4.54749059677124,
      "learning_rate": 0.0002407655787026398,
      "loss": 0.4017,
      "step": 13140
    },
    {
      "gate_value": 0.23375937342643738,
      "icl_sequence_length": 58,
      "num_contexts": 3,
      "step": 13140
    },
    {
      "grad_norm": 6.540721893310547,
      "learning_rate": 0.00024066681737399062,
      "loss": 0.387,
      "step": 13150
    },
    {
      "gate_value": 0.23380054533481598,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 13150
    },
    {
      "grad_norm": 5.220513820648193,
      "learning_rate": 0.00024056799407542667,
      "loss": 0.4035,
      "step": 13160
    },
    {
      "gate_value": 0.233912393450737,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 13160
    },
    {
      "grad_norm": 2.220935583114624,
      "learning_rate": 0.00024046910887449283,
      "loss": 0.3829,
      "step": 13170
    },
    {
      "gate_value": 0.23434002697467804,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 13170
    },
    {
      "grad_norm": 10.84092903137207,
      "learning_rate": 0.00024037016183877614,
      "loss": 0.3715,
      "step": 13180
    },
    {
      "gate_value": 0.23447051644325256,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 13180
    },
    {
      "grad_norm": 2.110276699066162,
      "learning_rate": 0.0002402711530359059,
      "loss": 0.422,
      "step": 13190
    },
    {
      "gate_value": 0.23508982360363007,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 13190
    },
    {
      "grad_norm": 2.608215093612671,
      "learning_rate": 0.00024017208253355383,
      "loss": 0.3955,
      "step": 13200
    },
    {
      "gate_value": 0.2356402426958084,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 13200
    },
    {
      "grad_norm": 3.4447200298309326,
      "learning_rate": 0.0002400729503994336,
      "loss": 0.3674,
      "step": 13210
    },
    {
      "gate_value": 0.23609599471092224,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 13210
    },
    {
      "grad_norm": 31.154781341552734,
      "learning_rate": 0.00023997375670130116,
      "loss": 0.376,
      "step": 13220
    },
    {
      "gate_value": 0.23642614483833313,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 13220
    },
    {
      "grad_norm": 11.202011108398438,
      "learning_rate": 0.00023987450150695437,
      "loss": 0.396,
      "step": 13230
    },
    {
      "gate_value": 0.2366163730621338,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 13230
    },
    {
      "grad_norm": 8.70836067199707,
      "learning_rate": 0.00023977518488423324,
      "loss": 0.3887,
      "step": 13240
    },
    {
      "gate_value": 0.23657521605491638,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 13240
    },
    {
      "grad_norm": 4.990417957305908,
      "learning_rate": 0.0002396758069010198,
      "loss": 0.3666,
      "step": 13250
    },
    {
      "gate_value": 0.23652224242687225,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 13250
    },
    {
      "grad_norm": 8.101316452026367,
      "learning_rate": 0.00023957636762523792,
      "loss": 0.3931,
      "step": 13260
    },
    {
      "gate_value": 0.23652319610118866,
      "icl_sequence_length": 96,
      "num_contexts": 3,
      "step": 13260
    },
    {
      "grad_norm": 11.491870880126953,
      "learning_rate": 0.00023947686712485347,
      "loss": 0.3816,
      "step": 13270
    },
    {
      "gate_value": 0.23662154376506805,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 13270
    },
    {
      "grad_norm": 9.589191436767578,
      "learning_rate": 0.00023937730546787404,
      "loss": 0.3723,
      "step": 13280
    },
    {
      "gate_value": 0.23666512966156006,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 13280
    },
    {
      "grad_norm": 9.990079879760742,
      "learning_rate": 0.00023927768272234907,
      "loss": 0.3981,
      "step": 13290
    },
    {
      "gate_value": 0.23673947155475616,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 13290
    },
    {
      "grad_norm": 9.692089080810547,
      "learning_rate": 0.00023917799895636983,
      "loss": 0.396,
      "step": 13300
    },
    {
      "gate_value": 0.23682591319084167,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 13300
    },
    {
      "grad_norm": 12.743815422058105,
      "learning_rate": 0.00023907825423806915,
      "loss": 0.3835,
      "step": 13310
    },
    {
      "gate_value": 0.23698779940605164,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 13310
    },
    {
      "grad_norm": 18.46089744567871,
      "learning_rate": 0.00023897844863562175,
      "loss": 0.3947,
      "step": 13320
    },
    {
      "gate_value": 0.23710179328918457,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 13320
    },
    {
      "grad_norm": 4.994933605194092,
      "learning_rate": 0.00023887858221724364,
      "loss": 0.3818,
      "step": 13330
    },
    {
      "gate_value": 0.23720552027225494,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 13330
    },
    {
      "grad_norm": 33.670082092285156,
      "learning_rate": 0.00023877865505119266,
      "loss": 0.392,
      "step": 13340
    },
    {
      "gate_value": 0.23719537258148193,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 13340
    },
    {
      "grad_norm": 15.782670974731445,
      "learning_rate": 0.00023867866720576813,
      "loss": 0.3922,
      "step": 13350
    },
    {
      "gate_value": 0.2372649610042572,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 13350
    },
    {
      "grad_norm": 6.85189151763916,
      "learning_rate": 0.00023857861874931074,
      "loss": 0.3822,
      "step": 13360
    },
    {
      "gate_value": 0.2373196929693222,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 13360
    },
    {
      "grad_norm": 14.476446151733398,
      "learning_rate": 0.00023847850975020266,
      "loss": 0.3954,
      "step": 13370
    },
    {
      "gate_value": 0.2373858541250229,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 13370
    },
    {
      "grad_norm": 12.489557266235352,
      "learning_rate": 0.0002383783402768675,
      "loss": 0.3907,
      "step": 13380
    },
    {
      "gate_value": 0.2375316321849823,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 13380
    },
    {
      "grad_norm": 6.788305759429932,
      "learning_rate": 0.0002382781103977701,
      "loss": 0.3866,
      "step": 13390
    },
    {
      "gate_value": 0.23768095672130585,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 13390
    },
    {
      "grad_norm": 9.206138610839844,
      "learning_rate": 0.00023817782018141666,
      "loss": 0.4043,
      "step": 13400
    },
    {
      "gate_value": 0.23779208958148956,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 13400
    },
    {
      "grad_norm": 20.340181350708008,
      "learning_rate": 0.0002380774696963546,
      "loss": 0.4027,
      "step": 13410
    },
    {
      "gate_value": 0.23787511885166168,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 13410
    },
    {
      "grad_norm": 14.296485900878906,
      "learning_rate": 0.00023797705901117252,
      "loss": 0.3959,
      "step": 13420
    },
    {
      "gate_value": 0.23800590634346008,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 13420
    },
    {
      "grad_norm": 9.20689868927002,
      "learning_rate": 0.00023787658819450017,
      "loss": 0.382,
      "step": 13430
    },
    {
      "gate_value": 0.2381312996149063,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 13430
    },
    {
      "grad_norm": 11.133676528930664,
      "learning_rate": 0.0002377760573150084,
      "loss": 0.3873,
      "step": 13440
    },
    {
      "gate_value": 0.2381925731897354,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 13440
    },
    {
      "grad_norm": 6.757209777832031,
      "learning_rate": 0.00023767546644140917,
      "loss": 0.3809,
      "step": 13450
    },
    {
      "gate_value": 0.2382330298423767,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 13450
    },
    {
      "grad_norm": 9.891308784484863,
      "learning_rate": 0.00023757481564245535,
      "loss": 0.394,
      "step": 13460
    },
    {
      "gate_value": 0.23824086785316467,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 13460
    },
    {
      "grad_norm": 9.862556457519531,
      "learning_rate": 0.0002374741049869408,
      "loss": 0.3918,
      "step": 13470
    },
    {
      "gate_value": 0.23823952674865723,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 13470
    },
    {
      "grad_norm": 17.30132293701172,
      "learning_rate": 0.00023737333454370034,
      "loss": 0.4038,
      "step": 13480
    },
    {
      "gate_value": 0.2382458597421646,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 13480
    },
    {
      "grad_norm": 14.025443077087402,
      "learning_rate": 0.00023727250438160957,
      "loss": 0.3877,
      "step": 13490
    },
    {
      "gate_value": 0.23827463388442993,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 13490
    },
    {
      "grad_norm": 7.736385345458984,
      "learning_rate": 0.00023717161456958508,
      "loss": 0.3922,
      "step": 13500
    },
    {
      "gate_value": 0.23830415308475494,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 13500
    },
    {
      "grad_norm": 10.48551082611084,
      "learning_rate": 0.00023707066517658393,
      "loss": 0.3671,
      "step": 13510
    },
    {
      "gate_value": 0.23844420909881592,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 13510
    },
    {
      "grad_norm": 9.336437225341797,
      "learning_rate": 0.00023696965627160416,
      "loss": 0.3838,
      "step": 13520
    },
    {
      "gate_value": 0.23857417702674866,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 13520
    },
    {
      "grad_norm": 10.722733497619629,
      "learning_rate": 0.0002368685879236844,
      "loss": 0.3806,
      "step": 13530
    },
    {
      "gate_value": 0.23865066468715668,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 13530
    },
    {
      "grad_norm": 8.75007438659668,
      "learning_rate": 0.0002367674602019039,
      "loss": 0.4108,
      "step": 13540
    },
    {
      "gate_value": 0.23874539136886597,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 13540
    },
    {
      "grad_norm": 8.046998023986816,
      "learning_rate": 0.00023666627317538258,
      "loss": 0.3837,
      "step": 13550
    },
    {
      "gate_value": 0.23880630731582642,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 13550
    },
    {
      "grad_norm": 6.331174373626709,
      "learning_rate": 0.00023656502691328074,
      "loss": 0.3923,
      "step": 13560
    },
    {
      "gate_value": 0.23895002901554108,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 13560
    },
    {
      "grad_norm": 10.85879898071289,
      "learning_rate": 0.00023646372148479925,
      "loss": 0.3662,
      "step": 13570
    },
    {
      "gate_value": 0.23905928432941437,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 13570
    },
    {
      "grad_norm": 10.636667251586914,
      "learning_rate": 0.00023636235695917942,
      "loss": 0.4045,
      "step": 13580
    },
    {
      "gate_value": 0.23911528289318085,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 13580
    },
    {
      "grad_norm": 14.647674560546875,
      "learning_rate": 0.00023626093340570298,
      "loss": 0.3841,
      "step": 13590
    },
    {
      "gate_value": 0.23917175829410553,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 13590
    },
    {
      "grad_norm": 10.018962860107422,
      "learning_rate": 0.00023615945089369193,
      "loss": 0.3831,
      "step": 13600
    },
    {
      "gate_value": 0.2392829954624176,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 13600
    },
    {
      "grad_norm": 11.435403823852539,
      "learning_rate": 0.00023605790949250864,
      "loss": 0.4096,
      "step": 13610
    },
    {
      "gate_value": 0.2394438236951828,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 13610
    },
    {
      "grad_norm": 41.82572937011719,
      "learning_rate": 0.00023595630927155571,
      "loss": 0.3759,
      "step": 13620
    },
    {
      "gate_value": 0.23965805768966675,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 13620
    },
    {
      "grad_norm": 6.350934028625488,
      "learning_rate": 0.00023585465030027586,
      "loss": 0.3888,
      "step": 13630
    },
    {
      "gate_value": 0.23977549374103546,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 13630
    },
    {
      "grad_norm": 9.555365562438965,
      "learning_rate": 0.00023575293264815214,
      "loss": 0.3868,
      "step": 13640
    },
    {
      "gate_value": 0.239946648478508,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 13640
    },
    {
      "grad_norm": 15.217024803161621,
      "learning_rate": 0.00023565115638470754,
      "loss": 0.3924,
      "step": 13650
    },
    {
      "gate_value": 0.2401033490896225,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 13650
    },
    {
      "grad_norm": 8.90522289276123,
      "learning_rate": 0.00023554932157950518,
      "loss": 0.3688,
      "step": 13660
    },
    {
      "gate_value": 0.24017292261123657,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 13660
    },
    {
      "grad_norm": 11.551718711853027,
      "learning_rate": 0.00023544742830214823,
      "loss": 0.3912,
      "step": 13670
    },
    {
      "gate_value": 0.24020229279994965,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 13670
    },
    {
      "grad_norm": 8.776484489440918,
      "learning_rate": 0.0002353454766222797,
      "loss": 0.3941,
      "step": 13680
    },
    {
      "gate_value": 0.24032677710056305,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 13680
    },
    {
      "grad_norm": 3.0297889709472656,
      "learning_rate": 0.00023524346660958273,
      "loss": 0.3864,
      "step": 13690
    },
    {
      "gate_value": 0.2403283268213272,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 13690
    },
    {
      "grad_norm": 2.8323323726654053,
      "learning_rate": 0.0002351413983337801,
      "loss": 0.3771,
      "step": 13700
    },
    {
      "gate_value": 0.2404557168483734,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 13700
    },
    {
      "grad_norm": 4.921107292175293,
      "learning_rate": 0.00023503927186463455,
      "loss": 0.4035,
      "step": 13710
    },
    {
      "gate_value": 0.24044989049434662,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 13710
    },
    {
      "grad_norm": 4.018657207489014,
      "learning_rate": 0.00023493708727194854,
      "loss": 0.3895,
      "step": 13720
    },
    {
      "gate_value": 0.2406502068042755,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 13720
    },
    {
      "grad_norm": 3.5907695293426514,
      "learning_rate": 0.00023483484462556427,
      "loss": 0.3844,
      "step": 13730
    },
    {
      "gate_value": 0.2406596541404724,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 13730
    },
    {
      "grad_norm": 4.6112847328186035,
      "learning_rate": 0.00023473254399536368,
      "loss": 0.3948,
      "step": 13740
    },
    {
      "gate_value": 0.24075298011302948,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 13740
    },
    {
      "grad_norm": 7.318800449371338,
      "learning_rate": 0.00023463018545126827,
      "loss": 0.3905,
      "step": 13750
    },
    {
      "gate_value": 0.2410333752632141,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 13750
    },
    {
      "grad_norm": 4.03450345993042,
      "learning_rate": 0.00023452776906323906,
      "loss": 0.3951,
      "step": 13760
    },
    {
      "gate_value": 0.24121364951133728,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 13760
    },
    {
      "grad_norm": 4.004604339599609,
      "learning_rate": 0.00023442529490127678,
      "loss": 0.3899,
      "step": 13770
    },
    {
      "gate_value": 0.241162970662117,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 13770
    },
    {
      "grad_norm": 3.408446788787842,
      "learning_rate": 0.00023432276303542152,
      "loss": 0.3875,
      "step": 13780
    },
    {
      "gate_value": 0.24171173572540283,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 13780
    },
    {
      "grad_norm": 0.6727409362792969,
      "learning_rate": 0.0002342201735357528,
      "loss": 0.3736,
      "step": 13790
    },
    {
      "gate_value": 0.24206897616386414,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 13790
    },
    {
      "grad_norm": 0.4120021462440491,
      "learning_rate": 0.00023411752647238963,
      "loss": 0.379,
      "step": 13800
    },
    {
      "gate_value": 0.242006316781044,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 13800
    },
    {
      "grad_norm": 0.5434548854827881,
      "learning_rate": 0.00023401482191549034,
      "loss": 0.4013,
      "step": 13810
    },
    {
      "gate_value": 0.24193528294563293,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 13810
    },
    {
      "grad_norm": 0.6041681170463562,
      "learning_rate": 0.00023391205993525245,
      "loss": 0.4044,
      "step": 13820
    },
    {
      "gate_value": 0.2423665076494217,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 13820
    },
    {
      "grad_norm": 0.38581350445747375,
      "learning_rate": 0.00023380924060191287,
      "loss": 0.4103,
      "step": 13830
    },
    {
      "gate_value": 0.24196967482566833,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 13830
    },
    {
      "grad_norm": 0.5824718475341797,
      "learning_rate": 0.00023370636398574758,
      "loss": 0.3836,
      "step": 13840
    },
    {
      "gate_value": 0.24174365401268005,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 13840
    },
    {
      "grad_norm": 0.7611846923828125,
      "learning_rate": 0.0002336034301570718,
      "loss": 0.3934,
      "step": 13850
    },
    {
      "gate_value": 0.24081198871135712,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 13850
    },
    {
      "grad_norm": 0.7086225748062134,
      "learning_rate": 0.00023350043918623982,
      "loss": 0.4015,
      "step": 13860
    },
    {
      "gate_value": 0.24044911563396454,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 13860
    },
    {
      "grad_norm": 1.1706241369247437,
      "learning_rate": 0.00023339739114364508,
      "loss": 0.3975,
      "step": 13870
    },
    {
      "gate_value": 0.2400846630334854,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 13870
    },
    {
      "grad_norm": 9.380261421203613,
      "learning_rate": 0.00023329428609971986,
      "loss": 0.3988,
      "step": 13880
    },
    {
      "gate_value": 0.23952551186084747,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 13880
    },
    {
      "grad_norm": 4.3653388023376465,
      "learning_rate": 0.00023319112412493553,
      "loss": 0.3991,
      "step": 13890
    },
    {
      "gate_value": 0.23941710591316223,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 13890
    },
    {
      "grad_norm": 2.4812557697296143,
      "learning_rate": 0.00023308790528980226,
      "loss": 0.4141,
      "step": 13900
    },
    {
      "gate_value": 0.23994140326976776,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 13900
    },
    {
      "grad_norm": 3.448402166366577,
      "learning_rate": 0.00023298462966486923,
      "loss": 0.41,
      "step": 13910
    },
    {
      "gate_value": 0.2402760088443756,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 13910
    },
    {
      "grad_norm": 1.7383501529693604,
      "learning_rate": 0.00023288129732072432,
      "loss": 0.3945,
      "step": 13920
    },
    {
      "gate_value": 0.24017523229122162,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 13920
    },
    {
      "grad_norm": 0.8303579688072205,
      "learning_rate": 0.00023277790832799418,
      "loss": 0.4153,
      "step": 13930
    },
    {
      "gate_value": 0.2399144023656845,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 13930
    },
    {
      "grad_norm": 1.4897278547286987,
      "learning_rate": 0.00023267446275734431,
      "loss": 0.389,
      "step": 13940
    },
    {
      "gate_value": 0.23930053412914276,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 13940
    },
    {
      "grad_norm": 1.5729742050170898,
      "learning_rate": 0.00023257096067947868,
      "loss": 0.3811,
      "step": 13950
    },
    {
      "gate_value": 0.23936836421489716,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 13950
    },
    {
      "grad_norm": 0.9085593223571777,
      "learning_rate": 0.00023246740216513998,
      "loss": 0.3947,
      "step": 13960
    },
    {
      "gate_value": 0.2395535558462143,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 13960
    },
    {
      "grad_norm": 1.2284908294677734,
      "learning_rate": 0.00023236378728510963,
      "loss": 0.3951,
      "step": 13970
    },
    {
      "gate_value": 0.23942606151103973,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 13970
    },
    {
      "grad_norm": 0.8122304677963257,
      "learning_rate": 0.00023226011611020723,
      "loss": 0.3864,
      "step": 13980
    },
    {
      "gate_value": 0.23953235149383545,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 13980
    },
    {
      "grad_norm": 0.6110461354255676,
      "learning_rate": 0.00023215638871129115,
      "loss": 0.3924,
      "step": 13990
    },
    {
      "gate_value": 0.23886938393115997,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 13990
    },
    {
      "grad_norm": 0.8406757116317749,
      "learning_rate": 0.00023205260515925808,
      "loss": 0.4018,
      "step": 14000
    },
    {
      "gate_value": 0.2380830943584442,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 14000
    },
    {
      "grad_norm": 0.534526526927948,
      "learning_rate": 0.0002319487655250431,
      "loss": 0.4073,
      "step": 14010
    },
    {
      "gate_value": 0.23780196905136108,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 14010
    },
    {
      "grad_norm": 14.408823013305664,
      "learning_rate": 0.00023184486987961963,
      "loss": 0.3853,
      "step": 14020
    },
    {
      "gate_value": 0.23794253170490265,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 14020
    },
    {
      "grad_norm": 6.213502883911133,
      "learning_rate": 0.0002317409182939993,
      "loss": 0.3991,
      "step": 14030
    },
    {
      "gate_value": 0.23781372606754303,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 14030
    },
    {
      "grad_norm": 1.1910241842269897,
      "learning_rate": 0.00023163691083923212,
      "loss": 0.3973,
      "step": 14040
    },
    {
      "gate_value": 0.2376827746629715,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 14040
    },
    {
      "grad_norm": 2.642822265625,
      "learning_rate": 0.00023153284758640618,
      "loss": 0.3936,
      "step": 14050
    },
    {
      "gate_value": 0.23790773749351501,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 14050
    },
    {
      "grad_norm": 1.310566782951355,
      "learning_rate": 0.00023142872860664775,
      "loss": 0.4136,
      "step": 14060
    },
    {
      "gate_value": 0.23784318566322327,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 14060
    },
    {
      "grad_norm": 4.528188705444336,
      "learning_rate": 0.00023132455397112107,
      "loss": 0.3759,
      "step": 14070
    },
    {
      "gate_value": 0.23753774166107178,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 14070
    },
    {
      "grad_norm": 2.330179214477539,
      "learning_rate": 0.00023122032375102862,
      "loss": 0.4058,
      "step": 14080
    },
    {
      "gate_value": 0.23672199249267578,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 14080
    },
    {
      "grad_norm": 3.0592496395111084,
      "learning_rate": 0.00023111603801761075,
      "loss": 0.3816,
      "step": 14090
    },
    {
      "gate_value": 0.23660390079021454,
      "icl_sequence_length": 50,
      "num_contexts": 3,
      "step": 14090
    },
    {
      "grad_norm": 1.2959048748016357,
      "learning_rate": 0.00023101169684214577,
      "loss": 0.3854,
      "step": 14100
    },
    {
      "gate_value": 0.2372078150510788,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 14100
    },
    {
      "grad_norm": 2.996217727661133,
      "learning_rate": 0.00023090730029594995,
      "loss": 0.3806,
      "step": 14110
    },
    {
      "gate_value": 0.237883061170578,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 14110
    },
    {
      "grad_norm": 29.50691032409668,
      "learning_rate": 0.0002308028484503772,
      "loss": 0.3709,
      "step": 14120
    },
    {
      "gate_value": 0.23824338614940643,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 14120
    },
    {
      "grad_norm": 6.587845325469971,
      "learning_rate": 0.00023069834137681952,
      "loss": 0.3881,
      "step": 14130
    },
    {
      "gate_value": 0.23884567618370056,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 14130
    },
    {
      "grad_norm": 0.7628917694091797,
      "learning_rate": 0.0002305937791467064,
      "loss": 0.3745,
      "step": 14140
    },
    {
      "gate_value": 0.2393723726272583,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 14140
    },
    {
      "grad_norm": 1.8146260976791382,
      "learning_rate": 0.00023048916183150524,
      "loss": 0.4034,
      "step": 14150
    },
    {
      "gate_value": 0.23927997052669525,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 14150
    },
    {
      "grad_norm": 0.6181547045707703,
      "learning_rate": 0.0002303844895027209,
      "loss": 0.3794,
      "step": 14160
    },
    {
      "gate_value": 0.23931756615638733,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 14160
    },
    {
      "grad_norm": 1.4100581407546997,
      "learning_rate": 0.000230279762231896,
      "loss": 0.409,
      "step": 14170
    },
    {
      "gate_value": 0.23971882462501526,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 14170
    },
    {
      "grad_norm": 4.569934844970703,
      "learning_rate": 0.00023017498009061057,
      "loss": 0.3979,
      "step": 14180
    },
    {
      "gate_value": 0.23882213234901428,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 14180
    },
    {
      "grad_norm": 0.9258241057395935,
      "learning_rate": 0.0002300701431504823,
      "loss": 0.4028,
      "step": 14190
    },
    {
      "gate_value": 0.2383560687303543,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 14190
    },
    {
      "grad_norm": 0.9763109683990479,
      "learning_rate": 0.00022996525148316616,
      "loss": 0.3873,
      "step": 14200
    },
    {
      "gate_value": 0.23882944881916046,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 14200
    },
    {
      "grad_norm": 0.7269139289855957,
      "learning_rate": 0.0002298603051603547,
      "loss": 0.4011,
      "step": 14210
    },
    {
      "gate_value": 0.23907381296157837,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 14210
    },
    {
      "grad_norm": 1.4872263669967651,
      "learning_rate": 0.00022975530425377763,
      "loss": 0.4069,
      "step": 14220
    },
    {
      "gate_value": 0.2386438399553299,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 14220
    },
    {
      "grad_norm": 1.6720694303512573,
      "learning_rate": 0.00022965024883520217,
      "loss": 0.4069,
      "step": 14230
    },
    {
      "gate_value": 0.23835961520671844,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 14230
    },
    {
      "grad_norm": 14.838658332824707,
      "learning_rate": 0.00022954513897643274,
      "loss": 0.3907,
      "step": 14240
    },
    {
      "gate_value": 0.238117977976799,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 14240
    },
    {
      "grad_norm": 1.1093040704727173,
      "learning_rate": 0.00022943997474931087,
      "loss": 0.4086,
      "step": 14250
    },
    {
      "gate_value": 0.23805883526802063,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 14250
    },
    {
      "grad_norm": 1.4715149402618408,
      "learning_rate": 0.0002293347562257153,
      "loss": 0.3885,
      "step": 14260
    },
    {
      "gate_value": 0.23897786438465118,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 14260
    },
    {
      "grad_norm": 1.3662744760513306,
      "learning_rate": 0.00022922948347756195,
      "loss": 0.3885,
      "step": 14270
    },
    {
      "gate_value": 0.23961474001407623,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 14270
    },
    {
      "grad_norm": 1.8294868469238281,
      "learning_rate": 0.00022912415657680375,
      "loss": 0.3966,
      "step": 14280
    },
    {
      "gate_value": 0.23970316350460052,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 14280
    },
    {
      "grad_norm": 1.3246426582336426,
      "learning_rate": 0.00022901877559543057,
      "loss": 0.3829,
      "step": 14290
    },
    {
      "gate_value": 0.24013105034828186,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 14290
    },
    {
      "grad_norm": 0.7515550851821899,
      "learning_rate": 0.00022891334060546947,
      "loss": 0.3869,
      "step": 14300
    },
    {
      "gate_value": 0.24048610031604767,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 14300
    },
    {
      "grad_norm": 1.6856255531311035,
      "learning_rate": 0.00022880785167898407,
      "loss": 0.4136,
      "step": 14310
    },
    {
      "gate_value": 0.24040848016738892,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 14310
    },
    {
      "grad_norm": 2.9111411571502686,
      "learning_rate": 0.0002287023088880752,
      "loss": 0.404,
      "step": 14320
    },
    {
      "gate_value": 0.2407020479440689,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 14320
    },
    {
      "grad_norm": 0.9571368098258972,
      "learning_rate": 0.00022859671230488033,
      "loss": 0.4,
      "step": 14330
    },
    {
      "gate_value": 0.24106580018997192,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 14330
    },
    {
      "grad_norm": 0.6420993208885193,
      "learning_rate": 0.00022849106200157373,
      "loss": 0.3871,
      "step": 14340
    },
    {
      "gate_value": 0.24104709923267365,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 14340
    },
    {
      "grad_norm": 1.0583370923995972,
      "learning_rate": 0.0002283853580503664,
      "loss": 0.4126,
      "step": 14350
    },
    {
      "gate_value": 0.24134980142116547,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 14350
    },
    {
      "grad_norm": 0.8332363367080688,
      "learning_rate": 0.00022827960052350594,
      "loss": 0.3982,
      "step": 14360
    },
    {
      "gate_value": 0.24138779938220978,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 14360
    },
    {
      "grad_norm": 28.765117645263672,
      "learning_rate": 0.0002281737894932766,
      "loss": 0.3802,
      "step": 14370
    },
    {
      "gate_value": 0.24165429174900055,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 14370
    },
    {
      "grad_norm": 2.636730670928955,
      "learning_rate": 0.00022806792503199936,
      "loss": 0.3944,
      "step": 14380
    },
    {
      "gate_value": 0.24217766523361206,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 14380
    },
    {
      "grad_norm": 2.2959983348846436,
      "learning_rate": 0.0002279620072120315,
      "loss": 0.3728,
      "step": 14390
    },
    {
      "gate_value": 0.24180297553539276,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 14390
    },
    {
      "grad_norm": 1.3253600597381592,
      "learning_rate": 0.0002278560361057668,
      "loss": 0.3904,
      "step": 14400
    },
    {
      "gate_value": 0.24161909520626068,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 14400
    },
    {
      "grad_norm": 0.8932808637619019,
      "learning_rate": 0.00022775001178563557,
      "loss": 0.3762,
      "step": 14410
    },
    {
      "gate_value": 0.2418455183506012,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 14410
    },
    {
      "grad_norm": 13.018226623535156,
      "learning_rate": 0.00022764393432410442,
      "loss": 0.3948,
      "step": 14420
    },
    {
      "gate_value": 0.24179722368717194,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 14420
    },
    {
      "grad_norm": 0.9938327074050903,
      "learning_rate": 0.00022753780379367633,
      "loss": 0.3786,
      "step": 14430
    },
    {
      "gate_value": 0.24247105419635773,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 14430
    },
    {
      "grad_norm": 1.7252559661865234,
      "learning_rate": 0.00022743162026689047,
      "loss": 0.3905,
      "step": 14440
    },
    {
      "gate_value": 0.24302411079406738,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 14440
    },
    {
      "grad_norm": 1.7536218166351318,
      "learning_rate": 0.0002273253838163223,
      "loss": 0.3863,
      "step": 14450
    },
    {
      "gate_value": 0.24294431507587433,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 14450
    },
    {
      "grad_norm": 1.1929810047149658,
      "learning_rate": 0.0002272190945145834,
      "loss": 0.3985,
      "step": 14460
    },
    {
      "gate_value": 0.24252630770206451,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 14460
    },
    {
      "grad_norm": 1.075748085975647,
      "learning_rate": 0.00022711275243432154,
      "loss": 0.39,
      "step": 14470
    },
    {
      "gate_value": 0.24282178282737732,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 14470
    },
    {
      "grad_norm": 4.662461757659912,
      "learning_rate": 0.00022700635764822058,
      "loss": 0.3893,
      "step": 14480
    },
    {
      "gate_value": 0.24319680035114288,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 14480
    },
    {
      "grad_norm": 1.0762276649475098,
      "learning_rate": 0.00022689991022900022,
      "loss": 0.3821,
      "step": 14490
    },
    {
      "gate_value": 0.2434968203306198,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 14490
    },
    {
      "grad_norm": 1.5702511072158813,
      "learning_rate": 0.00022679341024941632,
      "loss": 0.3778,
      "step": 14500
    },
    {
      "gate_value": 0.24319276213645935,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 14500
    },
    {
      "grad_norm": 0.9678138494491577,
      "learning_rate": 0.00022668685778226073,
      "loss": 0.3871,
      "step": 14510
    },
    {
      "gate_value": 0.24318233132362366,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 14510
    },
    {
      "grad_norm": 1.7996389865875244,
      "learning_rate": 0.00022658025290036085,
      "loss": 0.3944,
      "step": 14520
    },
    {
      "gate_value": 0.2435857653617859,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 14520
    },
    {
      "grad_norm": 0.8995399475097656,
      "learning_rate": 0.00022647359567658034,
      "loss": 0.4001,
      "step": 14530
    },
    {
      "gate_value": 0.24373860657215118,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 14530
    },
    {
      "grad_norm": 1.5024384260177612,
      "learning_rate": 0.0002263668861838182,
      "loss": 0.3874,
      "step": 14540
    },
    {
      "gate_value": 0.24341940879821777,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 14540
    },
    {
      "grad_norm": 0.7691788077354431,
      "learning_rate": 0.00022626012449500945,
      "loss": 0.3854,
      "step": 14550
    },
    {
      "gate_value": 0.24297001957893372,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 14550
    },
    {
      "grad_norm": 1.1324009895324707,
      "learning_rate": 0.00022615331068312472,
      "loss": 0.3833,
      "step": 14560
    },
    {
      "gate_value": 0.24257968366146088,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 14560
    },
    {
      "grad_norm": 1.6810340881347656,
      "learning_rate": 0.00022604644482117028,
      "loss": 0.4013,
      "step": 14570
    },
    {
      "gate_value": 0.24228624999523163,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 14570
    },
    {
      "grad_norm": 0.5641115307807922,
      "learning_rate": 0.00022593952698218782,
      "loss": 0.3679,
      "step": 14580
    },
    {
      "gate_value": 0.24178647994995117,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 14580
    },
    {
      "grad_norm": 0.9644331932067871,
      "learning_rate": 0.00022583255723925471,
      "loss": 0.3939,
      "step": 14590
    },
    {
      "gate_value": 0.24127645790576935,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 14590
    },
    {
      "grad_norm": 1.2938753366470337,
      "learning_rate": 0.00022572553566548378,
      "loss": 0.3957,
      "step": 14600
    },
    {
      "gate_value": 0.24074934422969818,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 14600
    },
    {
      "grad_norm": 1.338955044746399,
      "learning_rate": 0.00022561846233402333,
      "loss": 0.4033,
      "step": 14610
    },
    {
      "gate_value": 0.24045029282569885,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 14610
    },
    {
      "grad_norm": 1.5236607789993286,
      "learning_rate": 0.00022551133731805689,
      "loss": 0.3936,
      "step": 14620
    },
    {
      "gate_value": 0.24116292595863342,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 14620
    },
    {
      "grad_norm": 2.066084146499634,
      "learning_rate": 0.00022540416069080342,
      "loss": 0.3808,
      "step": 14630
    },
    {
      "gate_value": 0.24113567173480988,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 14630
    },
    {
      "grad_norm": 1.0546592473983765,
      "learning_rate": 0.00022529693252551714,
      "loss": 0.4019,
      "step": 14640
    },
    {
      "gate_value": 0.24161110818386078,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 14640
    },
    {
      "grad_norm": 1.3510944843292236,
      "learning_rate": 0.0002251896528954875,
      "loss": 0.3956,
      "step": 14650
    },
    {
      "gate_value": 0.24217374622821808,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 14650
    },
    {
      "grad_norm": 262.3678894042969,
      "learning_rate": 0.00022508232187403907,
      "loss": 0.4029,
      "step": 14660
    },
    {
      "gate_value": 0.24230490624904633,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 14660
    },
    {
      "grad_norm": 10.194805145263672,
      "learning_rate": 0.00022497493953453165,
      "loss": 0.3908,
      "step": 14670
    },
    {
      "gate_value": 0.2426171898841858,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 14670
    },
    {
      "grad_norm": 0.7869942784309387,
      "learning_rate": 0.00022486750595036005,
      "loss": 0.3942,
      "step": 14680
    },
    {
      "gate_value": 0.24261200428009033,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 14680
    },
    {
      "grad_norm": 0.7607384920120239,
      "learning_rate": 0.00022476002119495403,
      "loss": 0.3809,
      "step": 14690
    },
    {
      "gate_value": 0.24294787645339966,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 14690
    },
    {
      "grad_norm": 1.007214069366455,
      "learning_rate": 0.00022465248534177848,
      "loss": 0.4001,
      "step": 14700
    },
    {
      "gate_value": 0.24369680881500244,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 14700
    },
    {
      "grad_norm": 0.6694496273994446,
      "learning_rate": 0.0002245448984643332,
      "loss": 0.3956,
      "step": 14710
    },
    {
      "gate_value": 0.24375005066394806,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 14710
    },
    {
      "grad_norm": 8.913844108581543,
      "learning_rate": 0.00022443726063615265,
      "loss": 0.401,
      "step": 14720
    },
    {
      "gate_value": 0.24351546168327332,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 14720
    },
    {
      "grad_norm": 12.980989456176758,
      "learning_rate": 0.00022432957193080643,
      "loss": 0.3934,
      "step": 14730
    },
    {
      "gate_value": 0.2434246689081192,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 14730
    },
    {
      "grad_norm": 7.73109769821167,
      "learning_rate": 0.00022422183242189862,
      "loss": 0.3741,
      "step": 14740
    },
    {
      "gate_value": 0.24368853867053986,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 14740
    },
    {
      "grad_norm": 1.094784140586853,
      "learning_rate": 0.0002241140421830682,
      "loss": 0.3947,
      "step": 14750
    },
    {
      "gate_value": 0.24333494901657104,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 14750
    },
    {
      "grad_norm": 56.325748443603516,
      "learning_rate": 0.00022400620128798892,
      "loss": 0.3731,
      "step": 14760
    },
    {
      "gate_value": 0.24329960346221924,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 14760
    },
    {
      "grad_norm": 1.4292678833007812,
      "learning_rate": 0.00022389830981036878,
      "loss": 0.3852,
      "step": 14770
    },
    {
      "gate_value": 0.24320080876350403,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 14770
    },
    {
      "grad_norm": 11.566863059997559,
      "learning_rate": 0.00022379036782395074,
      "loss": 0.3927,
      "step": 14780
    },
    {
      "gate_value": 0.24328622221946716,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 14780
    },
    {
      "grad_norm": 1.4415967464447021,
      "learning_rate": 0.00022368237540251209,
      "loss": 0.3929,
      "step": 14790
    },
    {
      "gate_value": 0.2440831959247589,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 14790
    },
    {
      "grad_norm": 2.255908489227295,
      "learning_rate": 0.0002235743326198646,
      "loss": 0.3876,
      "step": 14800
    },
    {
      "gate_value": 0.24463777244091034,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 14800
    },
    {
      "grad_norm": 1.6364030838012695,
      "learning_rate": 0.00022346623954985463,
      "loss": 0.3858,
      "step": 14810
    },
    {
      "gate_value": 0.24514533579349518,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 14810
    },
    {
      "grad_norm": 2.745298147201538,
      "learning_rate": 0.00022335809626636264,
      "loss": 0.3967,
      "step": 14820
    },
    {
      "gate_value": 0.2454182356595993,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 14820
    },
    {
      "grad_norm": 0.6744495630264282,
      "learning_rate": 0.00022324990284330355,
      "loss": 0.3865,
      "step": 14830
    },
    {
      "gate_value": 0.2454521507024765,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 14830
    },
    {
      "grad_norm": 2.1479763984680176,
      "learning_rate": 0.00022314165935462656,
      "loss": 0.3919,
      "step": 14840
    },
    {
      "gate_value": 0.2457962930202484,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 14840
    },
    {
      "grad_norm": 1.0453277826309204,
      "learning_rate": 0.0002230333658743151,
      "loss": 0.3881,
      "step": 14850
    },
    {
      "gate_value": 0.24663123488426208,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 14850
    },
    {
      "grad_norm": 13.712295532226562,
      "learning_rate": 0.00022292502247638673,
      "loss": 0.3828,
      "step": 14860
    },
    {
      "gate_value": 0.24684672057628632,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 14860
    },
    {
      "grad_norm": 0.5274834036827087,
      "learning_rate": 0.00022281662923489312,
      "loss": 0.3867,
      "step": 14870
    },
    {
      "gate_value": 0.24730166792869568,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 14870
    },
    {
      "grad_norm": 0.4449861943721771,
      "learning_rate": 0.0002227081862239201,
      "loss": 0.4015,
      "step": 14880
    },
    {
      "gate_value": 0.24659566581249237,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 14880
    },
    {
      "grad_norm": 0.8005648255348206,
      "learning_rate": 0.00022259969351758733,
      "loss": 0.3941,
      "step": 14890
    },
    {
      "gate_value": 0.2455216944217682,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 14890
    },
    {
      "grad_norm": 0.32042214274406433,
      "learning_rate": 0.00022249115119004863,
      "loss": 0.398,
      "step": 14900
    },
    {
      "gate_value": 0.24477018415927887,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 14900
    },
    {
      "grad_norm": 0.7770922780036926,
      "learning_rate": 0.00022238255931549168,
      "loss": 0.4056,
      "step": 14910
    },
    {
      "gate_value": 0.24532420933246613,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 14910
    },
    {
      "grad_norm": 0.9371281266212463,
      "learning_rate": 0.00022227391796813794,
      "loss": 0.3916,
      "step": 14920
    },
    {
      "gate_value": 0.24590061604976654,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 14920
    },
    {
      "grad_norm": 3.6227617263793945,
      "learning_rate": 0.00022216522722224278,
      "loss": 0.3933,
      "step": 14930
    },
    {
      "gate_value": 0.24611833691596985,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 14930
    },
    {
      "grad_norm": 59.824222564697266,
      "learning_rate": 0.00022205648715209526,
      "loss": 0.3817,
      "step": 14940
    },
    {
      "gate_value": 0.24605101346969604,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 14940
    },
    {
      "grad_norm": 4.043715000152588,
      "learning_rate": 0.00022194769783201828,
      "loss": 0.3987,
      "step": 14950
    },
    {
      "gate_value": 0.2458629310131073,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 14950
    },
    {
      "grad_norm": 1.7102625370025635,
      "learning_rate": 0.0002218388593363682,
      "loss": 0.3887,
      "step": 14960
    },
    {
      "gate_value": 0.24561689794063568,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 14960
    },
    {
      "grad_norm": 61.252071380615234,
      "learning_rate": 0.00022172997173953518,
      "loss": 0.3995,
      "step": 14970
    },
    {
      "gate_value": 0.24532824754714966,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 14970
    },
    {
      "grad_norm": 0.8813376426696777,
      "learning_rate": 0.0002216210351159429,
      "loss": 0.3859,
      "step": 14980
    },
    {
      "gate_value": 0.24525928497314453,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 14980
    },
    {
      "grad_norm": 1.0708917379379272,
      "learning_rate": 0.0002215120495400484,
      "loss": 0.4059,
      "step": 14990
    },
    {
      "gate_value": 0.2450612485408783,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 14990
    },
    {
      "grad_norm": 2.3246612548828125,
      "learning_rate": 0.00022140301508634237,
      "loss": 0.3872,
      "step": 15000
    },
    {
      "gate_value": 0.24598005414009094,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 15000
    },
    {
      "grad_norm": 1.221426010131836,
      "learning_rate": 0.00022129393182934883,
      "loss": 0.4096,
      "step": 15010
    },
    {
      "gate_value": 0.2468685358762741,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 15010
    },
    {
      "grad_norm": 0.46905654668807983,
      "learning_rate": 0.00022118479984362512,
      "loss": 0.39,
      "step": 15020
    },
    {
      "gate_value": 0.24734513461589813,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 15020
    },
    {
      "grad_norm": 204.0078582763672,
      "learning_rate": 0.00022107561920376202,
      "loss": 0.4064,
      "step": 15030
    },
    {
      "gate_value": 0.24712851643562317,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 15030
    },
    {
      "grad_norm": 0.45268166065216064,
      "learning_rate": 0.00022096638998438334,
      "loss": 0.3833,
      "step": 15040
    },
    {
      "gate_value": 0.24720291793346405,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 15040
    },
    {
      "grad_norm": 0.8164011240005493,
      "learning_rate": 0.00022085711226014625,
      "loss": 0.3996,
      "step": 15050
    },
    {
      "gate_value": 0.24735890328884125,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 15050
    },
    {
      "grad_norm": 1.2292507886886597,
      "learning_rate": 0.00022074778610574114,
      "loss": 0.3666,
      "step": 15060
    },
    {
      "gate_value": 0.24747858941555023,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 15060
    },
    {
      "grad_norm": 1.1117497682571411,
      "learning_rate": 0.0002206384115958913,
      "loss": 0.3836,
      "step": 15070
    },
    {
      "gate_value": 0.24780391156673431,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 15070
    },
    {
      "grad_norm": 1.9108260869979858,
      "learning_rate": 0.00022052898880535324,
      "loss": 0.3846,
      "step": 15080
    },
    {
      "gate_value": 0.2478286623954773,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 15080
    },
    {
      "grad_norm": 2.420485496520996,
      "learning_rate": 0.00022041951780891637,
      "loss": 0.3976,
      "step": 15090
    },
    {
      "gate_value": 0.24756808578968048,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 15090
    },
    {
      "grad_norm": 1.190596342086792,
      "learning_rate": 0.00022030999868140306,
      "loss": 0.3996,
      "step": 15100
    },
    {
      "gate_value": 0.24758541584014893,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 15100
    },
    {
      "grad_norm": 1.1695483922958374,
      "learning_rate": 0.00022020043149766872,
      "loss": 0.3699,
      "step": 15110
    },
    {
      "gate_value": 0.2479713410139084,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 15110
    },
    {
      "grad_norm": 2.384326219558716,
      "learning_rate": 0.0002200908163326013,
      "loss": 0.3778,
      "step": 15120
    },
    {
      "gate_value": 0.2486722320318222,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 15120
    },
    {
      "grad_norm": 6.0441484451293945,
      "learning_rate": 0.0002199811532611219,
      "loss": 0.3925,
      "step": 15130
    },
    {
      "gate_value": 0.24903367459774017,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 15130
    },
    {
      "grad_norm": 1.6270265579223633,
      "learning_rate": 0.0002198714423581841,
      "loss": 0.4015,
      "step": 15140
    },
    {
      "gate_value": 0.2497212141752243,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 15140
    },
    {
      "grad_norm": 0.7187557220458984,
      "learning_rate": 0.00021976168369877428,
      "loss": 0.3847,
      "step": 15150
    },
    {
      "gate_value": 0.24981430172920227,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 15150
    },
    {
      "grad_norm": 1.3020397424697876,
      "learning_rate": 0.00021965187735791154,
      "loss": 0.393,
      "step": 15160
    },
    {
      "gate_value": 0.24998848140239716,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 15160
    },
    {
      "grad_norm": 0.6040678024291992,
      "learning_rate": 0.00021954202341064731,
      "loss": 0.3774,
      "step": 15170
    },
    {
      "gate_value": 0.25018632411956787,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 15170
    },
    {
      "grad_norm": 0.56707763671875,
      "learning_rate": 0.00021943212193206588,
      "loss": 0.3768,
      "step": 15180
    },
    {
      "gate_value": 0.2501216530799866,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 15180
    },
    {
      "grad_norm": 0.6855562925338745,
      "learning_rate": 0.00021932217299728383,
      "loss": 0.4046,
      "step": 15190
    },
    {
      "gate_value": 0.2500430643558502,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 15190
    },
    {
      "grad_norm": 0.48342952132225037,
      "learning_rate": 0.00021921217668145014,
      "loss": 0.3892,
      "step": 15200
    },
    {
      "gate_value": 0.24985510110855103,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 15200
    },
    {
      "grad_norm": 0.8365819454193115,
      "learning_rate": 0.00021910213305974637,
      "loss": 0.3907,
      "step": 15210
    },
    {
      "gate_value": 0.2495216578245163,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 15210
    },
    {
      "grad_norm": 7.897146224975586,
      "learning_rate": 0.0002189920422073862,
      "loss": 0.3848,
      "step": 15220
    },
    {
      "gate_value": 0.25015580654144287,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 15220
    },
    {
      "grad_norm": 3.6526150703430176,
      "learning_rate": 0.00021888190419961582,
      "loss": 0.4047,
      "step": 15230
    },
    {
      "gate_value": 0.2501993775367737,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 15230
    },
    {
      "grad_norm": 0.5569013357162476,
      "learning_rate": 0.00021877171911171338,
      "loss": 0.3841,
      "step": 15240
    },
    {
      "gate_value": 0.25032347440719604,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 15240
    },
    {
      "grad_norm": 1.0121521949768066,
      "learning_rate": 0.00021866148701898939,
      "loss": 0.4137,
      "step": 15250
    },
    {
      "gate_value": 0.25046205520629883,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 15250
    },
    {
      "grad_norm": 80.40043640136719,
      "learning_rate": 0.0002185512079967865,
      "loss": 0.3993,
      "step": 15260
    },
    {
      "gate_value": 0.25006845593452454,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 15260
    },
    {
      "grad_norm": 1.2732210159301758,
      "learning_rate": 0.00021844088212047934,
      "loss": 0.3928,
      "step": 15270
    },
    {
      "gate_value": 0.24979396164417267,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 15270
    },
    {
      "grad_norm": 0.9375239610671997,
      "learning_rate": 0.0002183305094654746,
      "loss": 0.3938,
      "step": 15280
    },
    {
      "gate_value": 0.250060111284256,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 15280
    },
    {
      "grad_norm": 0.3681797981262207,
      "learning_rate": 0.00021822009010721095,
      "loss": 0.3745,
      "step": 15290
    },
    {
      "gate_value": 0.2503737509250641,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 15290
    },
    {
      "grad_norm": 0.4525548219680786,
      "learning_rate": 0.000218109624121159,
      "loss": 0.3909,
      "step": 15300
    },
    {
      "gate_value": 0.25037622451782227,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 15300
    },
    {
      "grad_norm": 29.581342697143555,
      "learning_rate": 0.0002179991115828212,
      "loss": 0.3881,
      "step": 15310
    },
    {
      "gate_value": 0.25151196122169495,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 15310
    },
    {
      "grad_norm": 25.658153533935547,
      "learning_rate": 0.00021788855256773182,
      "loss": 0.4066,
      "step": 15320
    },
    {
      "gate_value": 0.25173476338386536,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 15320
    },
    {
      "grad_norm": 0.3691151738166809,
      "learning_rate": 0.0002177779471514569,
      "loss": 0.4099,
      "step": 15330
    },
    {
      "gate_value": 0.25139957666397095,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 15330
    },
    {
      "grad_norm": 0.46884021162986755,
      "learning_rate": 0.00021766729540959422,
      "loss": 0.4095,
      "step": 15340
    },
    {
      "gate_value": 0.2511599361896515,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 15340
    },
    {
      "grad_norm": 2.712085723876953,
      "learning_rate": 0.00021755659741777317,
      "loss": 0.3866,
      "step": 15350
    },
    {
      "gate_value": 0.25105735659599304,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 15350
    },
    {
      "grad_norm": 0.5205731987953186,
      "learning_rate": 0.00021744585325165485,
      "loss": 0.3945,
      "step": 15360
    },
    {
      "gate_value": 0.25111421942710876,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 15360
    },
    {
      "grad_norm": 0.41241544485092163,
      "learning_rate": 0.00021733506298693178,
      "loss": 0.4053,
      "step": 15370
    },
    {
      "gate_value": 0.25125807523727417,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 15370
    },
    {
      "grad_norm": 0.6068854331970215,
      "learning_rate": 0.0002172242266993281,
      "loss": 0.3962,
      "step": 15380
    },
    {
      "gate_value": 0.25145697593688965,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 15380
    },
    {
      "grad_norm": 0.8114979267120361,
      "learning_rate": 0.00021711334446459937,
      "loss": 0.3839,
      "step": 15390
    },
    {
      "gate_value": 0.25126177072525024,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 15390
    },
    {
      "grad_norm": 0.5185739398002625,
      "learning_rate": 0.0002170024163585325,
      "loss": 0.3781,
      "step": 15400
    },
    {
      "gate_value": 0.2513514757156372,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 15400
    },
    {
      "grad_norm": 8.948464393615723,
      "learning_rate": 0.0002168914424569459,
      "loss": 0.3842,
      "step": 15410
    },
    {
      "gate_value": 0.251672625541687,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 15410
    },
    {
      "grad_norm": 0.726407527923584,
      "learning_rate": 0.0002167804228356891,
      "loss": 0.3908,
      "step": 15420
    },
    {
      "gate_value": 0.2515237033367157,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 15420
    },
    {
      "grad_norm": 0.4296571910381317,
      "learning_rate": 0.00021666935757064294,
      "loss": 0.3717,
      "step": 15430
    },
    {
      "gate_value": 0.2517387866973877,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 15430
    },
    {
      "grad_norm": 0.547008216381073,
      "learning_rate": 0.00021655824673771963,
      "loss": 0.3755,
      "step": 15440
    },
    {
      "gate_value": 0.2511221468448639,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 15440
    },
    {
      "grad_norm": 4.1485819816589355,
      "learning_rate": 0.0002164470904128622,
      "loss": 0.3919,
      "step": 15450
    },
    {
      "gate_value": 0.2507605254650116,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 15450
    },
    {
      "grad_norm": 1.476075291633606,
      "learning_rate": 0.00021633588867204509,
      "loss": 0.3869,
      "step": 15460
    },
    {
      "gate_value": 0.2507340610027313,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 15460
    },
    {
      "grad_norm": 0.7980257272720337,
      "learning_rate": 0.0002162246415912736,
      "loss": 0.4104,
      "step": 15470
    },
    {
      "gate_value": 0.2506262958049774,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 15470
    },
    {
      "grad_norm": 0.5964367389678955,
      "learning_rate": 0.00021611334924658397,
      "loss": 0.39,
      "step": 15480
    },
    {
      "gate_value": 0.25122764706611633,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 15480
    },
    {
      "grad_norm": 0.5874263048171997,
      "learning_rate": 0.00021600201171404358,
      "loss": 0.369,
      "step": 15490
    },
    {
      "gate_value": 0.251434862613678,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 15490
    },
    {
      "grad_norm": 0.9638623595237732,
      "learning_rate": 0.00021589062906975055,
      "loss": 0.3913,
      "step": 15500
    },
    {
      "gate_value": 0.2517266571521759,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 15500
    },
    {
      "grad_norm": 0.5481185913085938,
      "learning_rate": 0.00021577920138983383,
      "loss": 0.3991,
      "step": 15510
    },
    {
      "gate_value": 0.2520568072795868,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 15510
    },
    {
      "grad_norm": 2.838137626647949,
      "learning_rate": 0.00021566772875045327,
      "loss": 0.4051,
      "step": 15520
    },
    {
      "gate_value": 0.2513696253299713,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 15520
    },
    {
      "grad_norm": 6.476634502410889,
      "learning_rate": 0.00021555621122779927,
      "loss": 0.3831,
      "step": 15530
    },
    {
      "gate_value": 0.2510931193828583,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 15530
    },
    {
      "grad_norm": 0.6045856475830078,
      "learning_rate": 0.00021544464889809307,
      "loss": 0.3815,
      "step": 15540
    },
    {
      "gate_value": 0.25079429149627686,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 15540
    },
    {
      "grad_norm": 9.019379615783691,
      "learning_rate": 0.0002153330418375865,
      "loss": 0.4041,
      "step": 15550
    },
    {
      "gate_value": 0.2513893246650696,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 15550
    },
    {
      "grad_norm": 0.8856226205825806,
      "learning_rate": 0.0002152213901225618,
      "loss": 0.3808,
      "step": 15560
    },
    {
      "gate_value": 0.251968115568161,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 15560
    },
    {
      "grad_norm": 0.6020224690437317,
      "learning_rate": 0.00021510969382933204,
      "loss": 0.3901,
      "step": 15570
    },
    {
      "gate_value": 0.25232499837875366,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 15570
    },
    {
      "grad_norm": 1.5310420989990234,
      "learning_rate": 0.00021499795303424045,
      "loss": 0.4009,
      "step": 15580
    },
    {
      "gate_value": 0.2531309723854065,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 15580
    },
    {
      "grad_norm": 0.6574874520301819,
      "learning_rate": 0.00021488616781366088,
      "loss": 0.3933,
      "step": 15590
    },
    {
      "gate_value": 0.25358569622039795,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 15590
    },
    {
      "grad_norm": 1.205522894859314,
      "learning_rate": 0.00021477433824399741,
      "loss": 0.3957,
      "step": 15600
    },
    {
      "gate_value": 0.25376975536346436,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 15600
    },
    {
      "grad_norm": 2.3468384742736816,
      "learning_rate": 0.00021466246440168457,
      "loss": 0.3819,
      "step": 15610
    },
    {
      "gate_value": 0.2539609670639038,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 15610
    },
    {
      "grad_norm": 0.36947518587112427,
      "learning_rate": 0.00021455054636318702,
      "loss": 0.3828,
      "step": 15620
    },
    {
      "gate_value": 0.2545165717601776,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 15620
    },
    {
      "grad_norm": 0.6467231512069702,
      "learning_rate": 0.0002144385842049997,
      "loss": 0.3631,
      "step": 15630
    },
    {
      "gate_value": 0.2550152540206909,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 15630
    },
    {
      "grad_norm": 0.6928891539573669,
      "learning_rate": 0.00021432657800364775,
      "loss": 0.4008,
      "step": 15640
    },
    {
      "gate_value": 0.254954069852829,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 15640
    },
    {
      "grad_norm": 1.5424365997314453,
      "learning_rate": 0.00021421452783568624,
      "loss": 0.3866,
      "step": 15650
    },
    {
      "gate_value": 0.25487643480300903,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 15650
    },
    {
      "grad_norm": 1.0675183534622192,
      "learning_rate": 0.00021410243377770048,
      "loss": 0.3751,
      "step": 15660
    },
    {
      "gate_value": 0.2548693120479584,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 15660
    },
    {
      "grad_norm": 2.203230381011963,
      "learning_rate": 0.00021399029590630567,
      "loss": 0.3958,
      "step": 15670
    },
    {
      "gate_value": 0.2552475035190582,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 15670
    },
    {
      "grad_norm": 3.342008590698242,
      "learning_rate": 0.000213878114298147,
      "loss": 0.3727,
      "step": 15680
    },
    {
      "gate_value": 0.255521297454834,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 15680
    },
    {
      "grad_norm": 20.107084274291992,
      "learning_rate": 0.00021376588902989962,
      "loss": 0.3893,
      "step": 15690
    },
    {
      "gate_value": 0.25595200061798096,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 15690
    },
    {
      "grad_norm": 0.9243622422218323,
      "learning_rate": 0.00021365362017826826,
      "loss": 0.3972,
      "step": 15700
    },
    {
      "gate_value": 0.25568291544914246,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 15700
    },
    {
      "grad_norm": 0.8758662343025208,
      "learning_rate": 0.00021354130781998774,
      "loss": 0.4018,
      "step": 15710
    },
    {
      "gate_value": 0.25568267703056335,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 15710
    },
    {
      "grad_norm": 13.960478782653809,
      "learning_rate": 0.00021342895203182256,
      "loss": 0.3883,
      "step": 15720
    },
    {
      "gate_value": 0.25582268834114075,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 15720
    },
    {
      "grad_norm": 0.7308502793312073,
      "learning_rate": 0.00021331655289056668,
      "loss": 0.3832,
      "step": 15730
    },
    {
      "gate_value": 0.2554386258125305,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 15730
    },
    {
      "grad_norm": 0.8558282256126404,
      "learning_rate": 0.00021320411047304398,
      "loss": 0.3748,
      "step": 15740
    },
    {
      "gate_value": 0.25477343797683716,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 15740
    },
    {
      "grad_norm": 0.4697778820991516,
      "learning_rate": 0.00021309162485610774,
      "loss": 0.3853,
      "step": 15750
    },
    {
      "gate_value": 0.2549998462200165,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 15750
    },
    {
      "grad_norm": 19.340211868286133,
      "learning_rate": 0.00021297909611664085,
      "loss": 0.3793,
      "step": 15760
    },
    {
      "gate_value": 0.2552267909049988,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 15760
    },
    {
      "grad_norm": 0.7915621995925903,
      "learning_rate": 0.0002128665243315556,
      "loss": 0.4153,
      "step": 15770
    },
    {
      "gate_value": 0.2552363872528076,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 15770
    },
    {
      "grad_norm": 1.852100133895874,
      "learning_rate": 0.00021275390957779377,
      "loss": 0.3861,
      "step": 15780
    },
    {
      "gate_value": 0.2549734115600586,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 15780
    },
    {
      "grad_norm": 1.6746331453323364,
      "learning_rate": 0.0002126412519323265,
      "loss": 0.3867,
      "step": 15790
    },
    {
      "gate_value": 0.2554560899734497,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 15790
    },
    {
      "grad_norm": 0.7904180288314819,
      "learning_rate": 0.00021252855147215415,
      "loss": 0.396,
      "step": 15800
    },
    {
      "gate_value": 0.25506383180618286,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 15800
    },
    {
      "grad_norm": 11.795361518859863,
      "learning_rate": 0.0002124158082743065,
      "loss": 0.4056,
      "step": 15810
    },
    {
      "gate_value": 0.2555553913116455,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 15810
    },
    {
      "grad_norm": 0.7169960737228394,
      "learning_rate": 0.0002123030224158425,
      "loss": 0.3954,
      "step": 15820
    },
    {
      "gate_value": 0.2563459873199463,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 15820
    },
    {
      "grad_norm": 0.35485365986824036,
      "learning_rate": 0.0002121901939738501,
      "loss": 0.3916,
      "step": 15830
    },
    {
      "gate_value": 0.2569548785686493,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 15830
    },
    {
      "grad_norm": 0.3839747905731201,
      "learning_rate": 0.00021207732302544656,
      "loss": 0.3911,
      "step": 15840
    },
    {
      "gate_value": 0.257243812084198,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 15840
    },
    {
      "grad_norm": 0.42446404695510864,
      "learning_rate": 0.00021196440964777808,
      "loss": 0.3931,
      "step": 15850
    },
    {
      "gate_value": 0.25690987706184387,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 15850
    },
    {
      "grad_norm": 0.5473038554191589,
      "learning_rate": 0.00021185145391801989,
      "loss": 0.3968,
      "step": 15860
    },
    {
      "gate_value": 0.25695231556892395,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 15860
    },
    {
      "grad_norm": 2.4800989627838135,
      "learning_rate": 0.00021173845591337614,
      "loss": 0.3932,
      "step": 15870
    },
    {
      "gate_value": 0.2575494349002838,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 15870
    },
    {
      "grad_norm": 0.5532228946685791,
      "learning_rate": 0.00021162541571108,
      "loss": 0.3849,
      "step": 15880
    },
    {
      "gate_value": 0.25717470049858093,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 15880
    },
    {
      "grad_norm": 0.6373085975646973,
      "learning_rate": 0.00021151233338839324,
      "loss": 0.3814,
      "step": 15890
    },
    {
      "gate_value": 0.25678393244743347,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 15890
    },
    {
      "grad_norm": 0.9044815897941589,
      "learning_rate": 0.0002113992090226067,
      "loss": 0.3792,
      "step": 15900
    },
    {
      "gate_value": 0.2567276358604431,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 15900
    },
    {
      "grad_norm": 1.4532538652420044,
      "learning_rate": 0.0002112860426910397,
      "loss": 0.407,
      "step": 15910
    },
    {
      "gate_value": 0.2564510107040405,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 15910
    },
    {
      "grad_norm": 0.8240043520927429,
      "learning_rate": 0.00021117283447104045,
      "loss": 0.3954,
      "step": 15920
    },
    {
      "gate_value": 0.25678786635398865,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 15920
    },
    {
      "grad_norm": 1.1029771566390991,
      "learning_rate": 0.00021105958443998568,
      "loss": 0.3692,
      "step": 15930
    },
    {
      "gate_value": 0.2571948766708374,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 15930
    },
    {
      "grad_norm": 1.8991724252700806,
      "learning_rate": 0.00021094629267528065,
      "loss": 0.3921,
      "step": 15940
    },
    {
      "gate_value": 0.25753188133239746,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 15940
    },
    {
      "grad_norm": 0.6801439523696899,
      "learning_rate": 0.0002108329592543593,
      "loss": 0.392,
      "step": 15950
    },
    {
      "gate_value": 0.2578553855419159,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 15950
    },
    {
      "grad_norm": 1.262444257736206,
      "learning_rate": 0.0002107195842546839,
      "loss": 0.3761,
      "step": 15960
    },
    {
      "gate_value": 0.2574891149997711,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 15960
    },
    {
      "grad_norm": 2.524458885192871,
      "learning_rate": 0.0002106061677537453,
      "loss": 0.3865,
      "step": 15970
    },
    {
      "gate_value": 0.2576063573360443,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 15970
    },
    {
      "grad_norm": 1.6312613487243652,
      "learning_rate": 0.00021049270982906242,
      "loss": 0.3722,
      "step": 15980
    },
    {
      "gate_value": 0.2574409544467926,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 15980
    },
    {
      "grad_norm": 3.748892068862915,
      "learning_rate": 0.0002103792105581828,
      "loss": 0.3816,
      "step": 15990
    },
    {
      "gate_value": 0.2578420639038086,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 15990
    },
    {
      "grad_norm": 1.2964622974395752,
      "learning_rate": 0.00021026567001868212,
      "loss": 0.391,
      "step": 16000
    },
    {
      "gate_value": 0.25792190432548523,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 16000
    },
    {
      "grad_norm": 3.316319704055786,
      "learning_rate": 0.00021015208828816423,
      "loss": 0.3748,
      "step": 16010
    },
    {
      "gate_value": 0.25808802247047424,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 16010
    },
    {
      "grad_norm": 1.7387820482254028,
      "learning_rate": 0.0002100384654442613,
      "loss": 0.3812,
      "step": 16020
    },
    {
      "gate_value": 0.25833994150161743,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 16020
    },
    {
      "grad_norm": 3.10507869720459,
      "learning_rate": 0.00020992480156463325,
      "loss": 0.3989,
      "step": 16030
    },
    {
      "gate_value": 0.2587062418460846,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 16030
    },
    {
      "grad_norm": 2.5109314918518066,
      "learning_rate": 0.0002098110967269684,
      "loss": 0.3885,
      "step": 16040
    },
    {
      "gate_value": 0.25896432995796204,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 16040
    },
    {
      "grad_norm": 9.999404907226562,
      "learning_rate": 0.00020969735100898296,
      "loss": 0.3975,
      "step": 16050
    },
    {
      "gate_value": 0.25936567783355713,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 16050
    },
    {
      "grad_norm": 2.2664084434509277,
      "learning_rate": 0.00020958356448842096,
      "loss": 0.3859,
      "step": 16060
    },
    {
      "gate_value": 0.2597810924053192,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 16060
    },
    {
      "grad_norm": 2.365952730178833,
      "learning_rate": 0.00020946973724305455,
      "loss": 0.3775,
      "step": 16070
    },
    {
      "gate_value": 0.2600749731063843,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 16070
    },
    {
      "grad_norm": 3.8386356830596924,
      "learning_rate": 0.00020935586935068347,
      "loss": 0.3663,
      "step": 16080
    },
    {
      "gate_value": 0.2603296935558319,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 16080
    },
    {
      "grad_norm": 1.1269052028656006,
      "learning_rate": 0.00020924196088913536,
      "loss": 0.3807,
      "step": 16090
    },
    {
      "gate_value": 0.26086583733558655,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 16090
    },
    {
      "grad_norm": 9.94027328491211,
      "learning_rate": 0.00020912801193626564,
      "loss": 0.3623,
      "step": 16100
    },
    {
      "gate_value": 0.2611926794052124,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 16100
    },
    {
      "grad_norm": 2.996548652648926,
      "learning_rate": 0.00020901402256995728,
      "loss": 0.3801,
      "step": 16110
    },
    {
      "gate_value": 0.26136496663093567,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 16110
    },
    {
      "grad_norm": 1.541462779045105,
      "learning_rate": 0.000208899992868121,
      "loss": 0.3786,
      "step": 16120
    },
    {
      "gate_value": 0.26164090633392334,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 16120
    },
    {
      "grad_norm": 2.2632198333740234,
      "learning_rate": 0.00020878592290869493,
      "loss": 0.3909,
      "step": 16130
    },
    {
      "gate_value": 0.26203489303588867,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 16130
    },
    {
      "grad_norm": 1.2951090335845947,
      "learning_rate": 0.00020867181276964486,
      "loss": 0.374,
      "step": 16140
    },
    {
      "gate_value": 0.2623451054096222,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 16140
    },
    {
      "grad_norm": 1.0222433805465698,
      "learning_rate": 0.00020855766252896407,
      "loss": 0.3705,
      "step": 16150
    },
    {
      "gate_value": 0.2623714804649353,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 16150
    },
    {
      "grad_norm": 7.520640850067139,
      "learning_rate": 0.00020844347226467306,
      "loss": 0.3886,
      "step": 16160
    },
    {
      "gate_value": 0.26221963763237,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 16160
    },
    {
      "grad_norm": 34.34793472290039,
      "learning_rate": 0.00020832924205481986,
      "loss": 0.3846,
      "step": 16170
    },
    {
      "gate_value": 0.26230815052986145,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 16170
    },
    {
      "grad_norm": 0.3719327449798584,
      "learning_rate": 0.00020821497197747973,
      "loss": 0.3975,
      "step": 16180
    },
    {
      "gate_value": 0.26282429695129395,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 16180
    },
    {
      "grad_norm": 0.6486546397209167,
      "learning_rate": 0.00020810066211075516,
      "loss": 0.3876,
      "step": 16190
    },
    {
      "gate_value": 0.2626320421695709,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 16190
    },
    {
      "grad_norm": 82.20526885986328,
      "learning_rate": 0.00020798631253277598,
      "loss": 0.385,
      "step": 16200
    },
    {
      "gate_value": 0.2627756595611572,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 16200
    },
    {
      "grad_norm": 0.4642679691314697,
      "learning_rate": 0.00020787192332169887,
      "loss": 0.3929,
      "step": 16210
    },
    {
      "gate_value": 0.2625521421432495,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 16210
    },
    {
      "grad_norm": 41.34733581542969,
      "learning_rate": 0.00020775749455570792,
      "loss": 0.3805,
      "step": 16220
    },
    {
      "gate_value": 0.2622221112251282,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 16220
    },
    {
      "grad_norm": 1.4750176668167114,
      "learning_rate": 0.00020764302631301403,
      "loss": 0.3977,
      "step": 16230
    },
    {
      "gate_value": 0.26216328144073486,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 16230
    },
    {
      "grad_norm": 0.7368234992027283,
      "learning_rate": 0.0002075285186718552,
      "loss": 0.4035,
      "step": 16240
    },
    {
      "gate_value": 0.26280391216278076,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 16240
    },
    {
      "grad_norm": 0.5541696548461914,
      "learning_rate": 0.00020741397171049637,
      "loss": 0.3978,
      "step": 16250
    },
    {
      "gate_value": 0.2634401321411133,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 16250
    },
    {
      "grad_norm": 2.5238094329833984,
      "learning_rate": 0.0002072993855072292,
      "loss": 0.374,
      "step": 16260
    },
    {
      "gate_value": 0.26335403323173523,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 16260
    },
    {
      "grad_norm": 0.4893578588962555,
      "learning_rate": 0.00020718476014037235,
      "loss": 0.3891,
      "step": 16270
    },
    {
      "gate_value": 0.2633839547634125,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 16270
    },
    {
      "grad_norm": 0.690986692905426,
      "learning_rate": 0.00020707009568827117,
      "loss": 0.4066,
      "step": 16280
    },
    {
      "gate_value": 0.2635418176651001,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 16280
    },
    {
      "grad_norm": 0.5379742980003357,
      "learning_rate": 0.00020695539222929767,
      "loss": 0.3842,
      "step": 16290
    },
    {
      "gate_value": 0.26373836398124695,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 16290
    },
    {
      "grad_norm": 3.5610744953155518,
      "learning_rate": 0.00020684064984185076,
      "loss": 0.3884,
      "step": 16300
    },
    {
      "gate_value": 0.2638593912124634,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 16300
    },
    {
      "grad_norm": 3.4267258644104004,
      "learning_rate": 0.00020672586860435557,
      "loss": 0.3813,
      "step": 16310
    },
    {
      "gate_value": 0.2640724778175354,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 16310
    },
    {
      "grad_norm": 31.349645614624023,
      "learning_rate": 0.0002066110485952641,
      "loss": 0.3883,
      "step": 16320
    },
    {
      "gate_value": 0.2640630304813385,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 16320
    },
    {
      "grad_norm": 0.7556735277175903,
      "learning_rate": 0.0002064961898930547,
      "loss": 0.3804,
      "step": 16330
    },
    {
      "gate_value": 0.2637392580509186,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 16330
    },
    {
      "grad_norm": 0.6610752940177917,
      "learning_rate": 0.00020638129257623229,
      "loss": 0.3827,
      "step": 16340
    },
    {
      "gate_value": 0.2638241946697235,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 16340
    },
    {
      "grad_norm": 8.386857986450195,
      "learning_rate": 0.00020626635672332802,
      "loss": 0.3712,
      "step": 16350
    },
    {
      "gate_value": 0.26400476694107056,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 16350
    },
    {
      "grad_norm": 1.04672372341156,
      "learning_rate": 0.00020615138241289948,
      "loss": 0.3741,
      "step": 16360
    },
    {
      "gate_value": 0.26382866501808167,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 16360
    },
    {
      "grad_norm": 0.6411679983139038,
      "learning_rate": 0.00020603636972353056,
      "loss": 0.3949,
      "step": 16370
    },
    {
      "gate_value": 0.2640067934989929,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 16370
    },
    {
      "grad_norm": 0.7315338253974915,
      "learning_rate": 0.0002059213187338313,
      "loss": 0.3845,
      "step": 16380
    },
    {
      "gate_value": 0.26440954208374023,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 16380
    },
    {
      "grad_norm": 1.2115294933319092,
      "learning_rate": 0.000205806229522438,
      "loss": 0.3826,
      "step": 16390
    },
    {
      "gate_value": 0.2647918164730072,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 16390
    },
    {
      "grad_norm": 21.087554931640625,
      "learning_rate": 0.00020569110216801307,
      "loss": 0.393,
      "step": 16400
    },
    {
      "gate_value": 0.26522791385650635,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 16400
    },
    {
      "grad_norm": 0.5692827701568604,
      "learning_rate": 0.0002055759367492449,
      "loss": 0.3909,
      "step": 16410
    },
    {
      "gate_value": 0.26617687940597534,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 16410
    },
    {
      "grad_norm": 0.6674289107322693,
      "learning_rate": 0.00020546073334484804,
      "loss": 0.3812,
      "step": 16420
    },
    {
      "gate_value": 0.2657477855682373,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 16420
    },
    {
      "grad_norm": 1.9666045904159546,
      "learning_rate": 0.00020534549203356288,
      "loss": 0.3826,
      "step": 16430
    },
    {
      "gate_value": 0.2655538022518158,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 16430
    },
    {
      "grad_norm": 1.16848886013031,
      "learning_rate": 0.00020523021289415582,
      "loss": 0.3835,
      "step": 16440
    },
    {
      "gate_value": 0.2654661536216736,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 16440
    },
    {
      "grad_norm": 3.0859735012054443,
      "learning_rate": 0.00020511489600541903,
      "loss": 0.3807,
      "step": 16450
    },
    {
      "gate_value": 0.2656824588775635,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 16450
    },
    {
      "grad_norm": 56.08064651489258,
      "learning_rate": 0.0002049995414461705,
      "loss": 0.3982,
      "step": 16460
    },
    {
      "gate_value": 0.2662789821624756,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 16460
    },
    {
      "grad_norm": 0.6443231701850891,
      "learning_rate": 0.00020488414929525404,
      "loss": 0.3912,
      "step": 16470
    },
    {
      "gate_value": 0.26696157455444336,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 16470
    },
    {
      "grad_norm": 0.6962271332740784,
      "learning_rate": 0.00020476871963153907,
      "loss": 0.3855,
      "step": 16480
    },
    {
      "gate_value": 0.2670133709907532,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 16480
    },
    {
      "grad_norm": 0.8205323219299316,
      "learning_rate": 0.00020465325253392062,
      "loss": 0.4058,
      "step": 16490
    },
    {
      "gate_value": 0.2671220302581787,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 16490
    },
    {
      "grad_norm": 1.2751874923706055,
      "learning_rate": 0.00020453774808131944,
      "loss": 0.3817,
      "step": 16500
    },
    {
      "gate_value": 0.2672402858734131,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 16500
    },
    {
      "grad_norm": 0.5550151467323303,
      "learning_rate": 0.00020442220635268166,
      "loss": 0.3749,
      "step": 16510
    },
    {
      "gate_value": 0.2680794596672058,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 16510
    },
    {
      "grad_norm": 0.48716598749160767,
      "learning_rate": 0.00020430662742697907,
      "loss": 0.3914,
      "step": 16520
    },
    {
      "gate_value": 0.26837554574012756,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 16520
    },
    {
      "grad_norm": 4.756121635437012,
      "learning_rate": 0.00020419101138320872,
      "loss": 0.3942,
      "step": 16530
    },
    {
      "gate_value": 0.26773133873939514,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 16530
    },
    {
      "grad_norm": 0.6253768801689148,
      "learning_rate": 0.00020407535830039303,
      "loss": 0.3894,
      "step": 16540
    },
    {
      "gate_value": 0.26719731092453003,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 16540
    },
    {
      "grad_norm": 0.7593842148780823,
      "learning_rate": 0.0002039596682575799,
      "loss": 0.3835,
      "step": 16550
    },
    {
      "gate_value": 0.2668936252593994,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 16550
    },
    {
      "grad_norm": 0.5268985033035278,
      "learning_rate": 0.00020384394133384228,
      "loss": 0.3697,
      "step": 16560
    },
    {
      "gate_value": 0.26714757084846497,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 16560
    },
    {
      "grad_norm": 0.9582957029342651,
      "learning_rate": 0.0002037281776082785,
      "loss": 0.3995,
      "step": 16570
    },
    {
      "gate_value": 0.267073392868042,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 16570
    },
    {
      "grad_norm": 0.8234410285949707,
      "learning_rate": 0.00020361237716001195,
      "loss": 0.3968,
      "step": 16580
    },
    {
      "gate_value": 0.26781952381134033,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 16580
    },
    {
      "grad_norm": 1.6054513454437256,
      "learning_rate": 0.00020349654006819113,
      "loss": 0.4014,
      "step": 16590
    },
    {
      "gate_value": 0.2684796154499054,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 16590
    },
    {
      "grad_norm": 2.0337159633636475,
      "learning_rate": 0.00020338066641198963,
      "loss": 0.4016,
      "step": 16600
    },
    {
      "gate_value": 0.26811471581459045,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 16600
    },
    {
      "grad_norm": 22.712963104248047,
      "learning_rate": 0.00020326475627060594,
      "loss": 0.3852,
      "step": 16610
    },
    {
      "gate_value": 0.2682332992553711,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 16610
    },
    {
      "grad_norm": 0.768212080001831,
      "learning_rate": 0.00020314880972326367,
      "loss": 0.3818,
      "step": 16620
    },
    {
      "gate_value": 0.2681061029434204,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 16620
    },
    {
      "grad_norm": 1.2150635719299316,
      "learning_rate": 0.00020303282684921108,
      "loss": 0.3934,
      "step": 16630
    },
    {
      "gate_value": 0.2679195702075958,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 16630
    },
    {
      "grad_norm": 1.3667492866516113,
      "learning_rate": 0.00020291680772772138,
      "loss": 0.3872,
      "step": 16640
    },
    {
      "gate_value": 0.2678031623363495,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 16640
    },
    {
      "grad_norm": 6.326725482940674,
      "learning_rate": 0.00020280075243809265,
      "loss": 0.3942,
      "step": 16650
    },
    {
      "gate_value": 0.2683236002922058,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 16650
    },
    {
      "grad_norm": 14.302623748779297,
      "learning_rate": 0.0002026846610596474,
      "loss": 0.3753,
      "step": 16660
    },
    {
      "gate_value": 0.2683285176753998,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 16660
    },
    {
      "grad_norm": 0.833937406539917,
      "learning_rate": 0.00020256853367173322,
      "loss": 0.3807,
      "step": 16670
    },
    {
      "gate_value": 0.26835429668426514,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 16670
    },
    {
      "grad_norm": 1.282999038696289,
      "learning_rate": 0.00020245237035372194,
      "loss": 0.3945,
      "step": 16680
    },
    {
      "gate_value": 0.2686046361923218,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 16680
    },
    {
      "grad_norm": 1.7827116250991821,
      "learning_rate": 0.00020233617118501005,
      "loss": 0.4033,
      "step": 16690
    },
    {
      "gate_value": 0.2689083218574524,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 16690
    },
    {
      "grad_norm": 0.7584987878799438,
      "learning_rate": 0.00020221993624501872,
      "loss": 0.3908,
      "step": 16700
    },
    {
      "gate_value": 0.2689765393733978,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 16700
    },
    {
      "grad_norm": 2.065035104751587,
      "learning_rate": 0.00020210366561319336,
      "loss": 0.382,
      "step": 16710
    },
    {
      "gate_value": 0.2689971923828125,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 16710
    },
    {
      "grad_norm": 0.7940707206726074,
      "learning_rate": 0.00020198735936900386,
      "loss": 0.3896,
      "step": 16720
    },
    {
      "gate_value": 0.2692660391330719,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 16720
    },
    {
      "grad_norm": 0.7941778302192688,
      "learning_rate": 0.00020187101759194443,
      "loss": 0.3776,
      "step": 16730
    },
    {
      "gate_value": 0.27001139521598816,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 16730
    },
    {
      "grad_norm": 47.64362716674805,
      "learning_rate": 0.00020175464036153358,
      "loss": 0.3814,
      "step": 16740
    },
    {
      "gate_value": 0.2703426778316498,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 16740
    },
    {
      "grad_norm": 0.5801603198051453,
      "learning_rate": 0.0002016382277573141,
      "loss": 0.3856,
      "step": 16750
    },
    {
      "gate_value": 0.27029573917388916,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 16750
    },
    {
      "grad_norm": 4.548717498779297,
      "learning_rate": 0.00020152177985885284,
      "loss": 0.3725,
      "step": 16760
    },
    {
      "gate_value": 0.2708137333393097,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 16760
    },
    {
      "grad_norm": 1.0393775701522827,
      "learning_rate": 0.00020140529674574087,
      "loss": 0.3797,
      "step": 16770
    },
    {
      "gate_value": 0.2712128460407257,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 16770
    },
    {
      "grad_norm": 1.5896652936935425,
      "learning_rate": 0.0002012887784975933,
      "loss": 0.3955,
      "step": 16780
    },
    {
      "gate_value": 0.27125340700149536,
      "icl_sequence_length": 58,
      "num_contexts": 3,
      "step": 16780
    },
    {
      "grad_norm": 23.115083694458008,
      "learning_rate": 0.00020117222519404923,
      "loss": 0.394,
      "step": 16790
    },
    {
      "gate_value": 0.2709924280643463,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 16790
    },
    {
      "grad_norm": 3.4433460235595703,
      "learning_rate": 0.00020105563691477177,
      "loss": 0.3856,
      "step": 16800
    },
    {
      "gate_value": 0.27060866355895996,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 16800
    },
    {
      "grad_norm": 0.9717051386833191,
      "learning_rate": 0.00020093901373944794,
      "loss": 0.3935,
      "step": 16810
    },
    {
      "gate_value": 0.2705410122871399,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 16810
    },
    {
      "grad_norm": 5.411111354827881,
      "learning_rate": 0.0002008223557477885,
      "loss": 0.3851,
      "step": 16820
    },
    {
      "gate_value": 0.27148672938346863,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 16820
    },
    {
      "grad_norm": 1.2662025690078735,
      "learning_rate": 0.00020070566301952817,
      "loss": 0.394,
      "step": 16830
    },
    {
      "gate_value": 0.2718506455421448,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 16830
    },
    {
      "grad_norm": 4.485749244689941,
      "learning_rate": 0.00020058893563442527,
      "loss": 0.3809,
      "step": 16840
    },
    {
      "gate_value": 0.2717529237270355,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 16840
    },
    {
      "grad_norm": 70.55220794677734,
      "learning_rate": 0.00020047217367226192,
      "loss": 0.376,
      "step": 16850
    },
    {
      "gate_value": 0.2719095051288605,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 16850
    },
    {
      "grad_norm": 1.0547325611114502,
      "learning_rate": 0.00020035537721284377,
      "loss": 0.3788,
      "step": 16860
    },
    {
      "gate_value": 0.2722213566303253,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 16860
    },
    {
      "grad_norm": 1.2858396768569946,
      "learning_rate": 0.0002002385463360001,
      "loss": 0.384,
      "step": 16870
    },
    {
      "gate_value": 0.27259713411331177,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 16870
    },
    {
      "grad_norm": 171.38514709472656,
      "learning_rate": 0.00020012168112158374,
      "loss": 0.3998,
      "step": 16880
    },
    {
      "gate_value": 0.2726678252220154,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 16880
    },
    {
      "grad_norm": 1.532321810722351,
      "learning_rate": 0.00020000478164947094,
      "loss": 0.3847,
      "step": 16890
    },
    {
      "gate_value": 0.2719527781009674,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 16890
    },
    {
      "grad_norm": 36.90795135498047,
      "learning_rate": 0.00019988784799956143,
      "loss": 0.3756,
      "step": 16900
    },
    {
      "gate_value": 0.271659255027771,
      "icl_sequence_length": 96,
      "num_contexts": 3,
      "step": 16900
    },
    {
      "grad_norm": 0.6787513494491577,
      "learning_rate": 0.00019977088025177823,
      "loss": 0.3902,
      "step": 16910
    },
    {
      "gate_value": 0.2718561887741089,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 16910
    },
    {
      "grad_norm": 2.4867050647735596,
      "learning_rate": 0.00019965387848606766,
      "loss": 0.3832,
      "step": 16920
    },
    {
      "gate_value": 0.27170494198799133,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 16920
    },
    {
      "grad_norm": 3.547852039337158,
      "learning_rate": 0.00019953684278239936,
      "loss": 0.394,
      "step": 16930
    },
    {
      "gate_value": 0.2722950279712677,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 16930
    },
    {
      "grad_norm": 0.845325767993927,
      "learning_rate": 0.00019941977322076614,
      "loss": 0.381,
      "step": 16940
    },
    {
      "gate_value": 0.2729490399360657,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 16940
    },
    {
      "grad_norm": 0.924738883972168,
      "learning_rate": 0.0001993026698811839,
      "loss": 0.3806,
      "step": 16950
    },
    {
      "gate_value": 0.27347734570503235,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 16950
    },
    {
      "grad_norm": 1.2066264152526855,
      "learning_rate": 0.00019918553284369172,
      "loss": 0.3906,
      "step": 16960
    },
    {
      "gate_value": 0.2736641764640808,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 16960
    },
    {
      "grad_norm": 1.266615867614746,
      "learning_rate": 0.0001990683621883516,
      "loss": 0.3807,
      "step": 16970
    },
    {
      "gate_value": 0.27343323826789856,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 16970
    },
    {
      "grad_norm": 0.651129961013794,
      "learning_rate": 0.00019895115799524864,
      "loss": 0.3753,
      "step": 16980
    },
    {
      "gate_value": 0.27308720350265503,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 16980
    },
    {
      "grad_norm": 1.6364394426345825,
      "learning_rate": 0.00019883392034449076,
      "loss": 0.3846,
      "step": 16990
    },
    {
      "gate_value": 0.27326497435569763,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 16990
    },
    {
      "grad_norm": 1.037740707397461,
      "learning_rate": 0.00019871664931620883,
      "loss": 0.3864,
      "step": 17000
    },
    {
      "gate_value": 0.2733471095561981,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 17000
    },
    {
      "grad_norm": 0.8848700523376465,
      "learning_rate": 0.0001985993449905564,
      "loss": 0.3888,
      "step": 17010
    },
    {
      "gate_value": 0.2732689678668976,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 17010
    },
    {
      "grad_norm": 173.5919952392578,
      "learning_rate": 0.00019848200744770997,
      "loss": 0.3823,
      "step": 17020
    },
    {
      "gate_value": 0.27310124039649963,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 17020
    },
    {
      "grad_norm": 0.7701226472854614,
      "learning_rate": 0.00019836463676786866,
      "loss": 0.3689,
      "step": 17030
    },
    {
      "gate_value": 0.27315619587898254,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 17030
    },
    {
      "grad_norm": 1.866520643234253,
      "learning_rate": 0.0001982472330312541,
      "loss": 0.3665,
      "step": 17040
    },
    {
      "gate_value": 0.2732912600040436,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 17040
    },
    {
      "grad_norm": 0.9565479159355164,
      "learning_rate": 0.00019812979631811072,
      "loss": 0.3866,
      "step": 17050
    },
    {
      "gate_value": 0.2739657461643219,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 17050
    },
    {
      "grad_norm": 1.207183599472046,
      "learning_rate": 0.0001980123267087054,
      "loss": 0.3896,
      "step": 17060
    },
    {
      "gate_value": 0.2743777334690094,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 17060
    },
    {
      "grad_norm": 0.6322810053825378,
      "learning_rate": 0.00019789482428332747,
      "loss": 0.3863,
      "step": 17070
    },
    {
      "gate_value": 0.2746823728084564,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 17070
    },
    {
      "grad_norm": 0.6914767026901245,
      "learning_rate": 0.0001977772891222888,
      "loss": 0.3857,
      "step": 17080
    },
    {
      "gate_value": 0.27512282133102417,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 17080
    },
    {
      "grad_norm": 2.1301958560943604,
      "learning_rate": 0.00019765972130592356,
      "loss": 0.3933,
      "step": 17090
    },
    {
      "gate_value": 0.2752041518688202,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 17090
    },
    {
      "grad_norm": 1.580463171005249,
      "learning_rate": 0.00019754212091458814,
      "loss": 0.4096,
      "step": 17100
    },
    {
      "gate_value": 0.2751167416572571,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 17100
    },
    {
      "grad_norm": 74.18382263183594,
      "learning_rate": 0.00019742448802866143,
      "loss": 0.3928,
      "step": 17110
    },
    {
      "gate_value": 0.2754330635070801,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 17110
    },
    {
      "grad_norm": 1.2828655242919922,
      "learning_rate": 0.0001973068227285443,
      "loss": 0.3764,
      "step": 17120
    },
    {
      "gate_value": 0.27579745650291443,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 17120
    },
    {
      "grad_norm": 1.951072096824646,
      "learning_rate": 0.00019718912509465993,
      "loss": 0.3882,
      "step": 17130
    },
    {
      "gate_value": 0.2759058177471161,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 17130
    },
    {
      "grad_norm": 1.1944855451583862,
      "learning_rate": 0.00019707139520745354,
      "loss": 0.4006,
      "step": 17140
    },
    {
      "gate_value": 0.27621176838874817,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 17140
    },
    {
      "grad_norm": 1.2348740100860596,
      "learning_rate": 0.00019695363314739235,
      "loss": 0.3775,
      "step": 17150
    },
    {
      "gate_value": 0.276276171207428,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 17150
    },
    {
      "grad_norm": 1.934401512145996,
      "learning_rate": 0.00019683583899496565,
      "loss": 0.3779,
      "step": 17160
    },
    {
      "gate_value": 0.27666452527046204,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 17160
    },
    {
      "grad_norm": 2.7875421047210693,
      "learning_rate": 0.00019671801283068464,
      "loss": 0.3676,
      "step": 17170
    },
    {
      "gate_value": 0.2770780622959137,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 17170
    },
    {
      "grad_norm": 1.0745530128479004,
      "learning_rate": 0.0001966001547350824,
      "loss": 0.3832,
      "step": 17180
    },
    {
      "gate_value": 0.27716943621635437,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 17180
    },
    {
      "grad_norm": 0.7714540362358093,
      "learning_rate": 0.0001964822647887138,
      "loss": 0.3953,
      "step": 17190
    },
    {
      "gate_value": 0.2768040895462036,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 17190
    },
    {
      "grad_norm": 0.5804464221000671,
      "learning_rate": 0.00019636434307215552,
      "loss": 0.3839,
      "step": 17200
    },
    {
      "gate_value": 0.27638521790504456,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 17200
    },
    {
      "grad_norm": 0.36973661184310913,
      "learning_rate": 0.00019624638966600591,
      "loss": 0.3748,
      "step": 17210
    },
    {
      "gate_value": 0.2765795588493347,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 17210
    },
    {
      "grad_norm": 6.002297878265381,
      "learning_rate": 0.0001961284046508851,
      "loss": 0.3886,
      "step": 17220
    },
    {
      "gate_value": 0.2769390046596527,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 17220
    },
    {
      "grad_norm": 2.958909511566162,
      "learning_rate": 0.00019601038810743463,
      "loss": 0.3828,
      "step": 17230
    },
    {
      "gate_value": 0.2774118483066559,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 17230
    },
    {
      "grad_norm": 1.149144172668457,
      "learning_rate": 0.0001958923401163178,
      "loss": 0.3943,
      "step": 17240
    },
    {
      "gate_value": 0.2777392566204071,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 17240
    },
    {
      "grad_norm": 2.879011631011963,
      "learning_rate": 0.00019577426075821915,
      "loss": 0.3885,
      "step": 17250
    },
    {
      "gate_value": 0.2781127095222473,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 17250
    },
    {
      "grad_norm": 0.6843719482421875,
      "learning_rate": 0.00019565615011384494,
      "loss": 0.3917,
      "step": 17260
    },
    {
      "gate_value": 0.27844953536987305,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 17260
    },
    {
      "grad_norm": 3.6135499477386475,
      "learning_rate": 0.00019553800826392262,
      "loss": 0.3723,
      "step": 17270
    },
    {
      "gate_value": 0.27857479453086853,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 17270
    },
    {
      "grad_norm": 1.2397135496139526,
      "learning_rate": 0.00019541983528920112,
      "loss": 0.3697,
      "step": 17280
    },
    {
      "gate_value": 0.27856993675231934,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 17280
    },
    {
      "grad_norm": 0.6731451153755188,
      "learning_rate": 0.00019530163127045046,
      "loss": 0.3858,
      "step": 17290
    },
    {
      "gate_value": 0.2787000238895416,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 17290
    },
    {
      "grad_norm": 0.786884605884552,
      "learning_rate": 0.00019518339628846193,
      "loss": 0.4013,
      "step": 17300
    },
    {
      "gate_value": 0.27849066257476807,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 17300
    },
    {
      "grad_norm": 18.715499877929688,
      "learning_rate": 0.00019506513042404815,
      "loss": 0.3896,
      "step": 17310
    },
    {
      "gate_value": 0.27890416979789734,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 17310
    },
    {
      "grad_norm": 0.6340904235839844,
      "learning_rate": 0.00019494683375804265,
      "loss": 0.3884,
      "step": 17320
    },
    {
      "gate_value": 0.27936267852783203,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 17320
    },
    {
      "grad_norm": 3.241877555847168,
      "learning_rate": 0.00019482850637130006,
      "loss": 0.3711,
      "step": 17330
    },
    {
      "gate_value": 0.27889105677604675,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 17330
    },
    {
      "grad_norm": 0.9872519969940186,
      "learning_rate": 0.0001947101483446961,
      "loss": 0.387,
      "step": 17340
    },
    {
      "gate_value": 0.2789508104324341,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 17340
    },
    {
      "grad_norm": 0.6962840557098389,
      "learning_rate": 0.00019459175975912736,
      "loss": 0.3963,
      "step": 17350
    },
    {
      "gate_value": 0.2791309952735901,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 17350
    },
    {
      "grad_norm": 0.7151376008987427,
      "learning_rate": 0.0001944733406955113,
      "loss": 0.3745,
      "step": 17360
    },
    {
      "gate_value": 0.2794833183288574,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 17360
    },
    {
      "grad_norm": 0.8369631767272949,
      "learning_rate": 0.00019435489123478624,
      "loss": 0.3894,
      "step": 17370
    },
    {
      "gate_value": 0.27932778000831604,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 17370
    },
    {
      "grad_norm": 1.7139389514923096,
      "learning_rate": 0.00019423641145791123,
      "loss": 0.3873,
      "step": 17380
    },
    {
      "gate_value": 0.2792007029056549,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 17380
    },
    {
      "grad_norm": 0.8466968536376953,
      "learning_rate": 0.0001941179014458661,
      "loss": 0.3863,
      "step": 17390
    },
    {
      "gate_value": 0.27936652302742004,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 17390
    },
    {
      "grad_norm": 1.240417718887329,
      "learning_rate": 0.00019399936127965136,
      "loss": 0.4034,
      "step": 17400
    },
    {
      "gate_value": 0.2796449065208435,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 17400
    },
    {
      "grad_norm": 0.7106504440307617,
      "learning_rate": 0.00019388079104028808,
      "loss": 0.3805,
      "step": 17410
    },
    {
      "gate_value": 0.2800625264644623,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 17410
    },
    {
      "grad_norm": 0.7933707237243652,
      "learning_rate": 0.00019376219080881793,
      "loss": 0.3584,
      "step": 17420
    },
    {
      "gate_value": 0.28004422783851624,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 17420
    },
    {
      "grad_norm": 50.58368682861328,
      "learning_rate": 0.00019364356066630295,
      "loss": 0.3856,
      "step": 17430
    },
    {
      "gate_value": 0.2805579602718353,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 17430
    },
    {
      "grad_norm": 0.6874160170555115,
      "learning_rate": 0.0001935249006938258,
      "loss": 0.3559,
      "step": 17440
    },
    {
      "gate_value": 0.2811239957809448,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 17440
    },
    {
      "grad_norm": 5.250424385070801,
      "learning_rate": 0.00019340621097248945,
      "loss": 0.3737,
      "step": 17450
    },
    {
      "gate_value": 0.28114932775497437,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 17450
    },
    {
      "grad_norm": 12.111510276794434,
      "learning_rate": 0.00019328749158341728,
      "loss": 0.3757,
      "step": 17460
    },
    {
      "gate_value": 0.28097689151763916,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 17460
    },
    {
      "grad_norm": 1.5629442930221558,
      "learning_rate": 0.00019316874260775268,
      "loss": 0.3783,
      "step": 17470
    },
    {
      "gate_value": 0.28098264336586,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 17470
    },
    {
      "grad_norm": 0.4609309136867523,
      "learning_rate": 0.00019304996412665957,
      "loss": 0.3848,
      "step": 17480
    },
    {
      "gate_value": 0.2811479866504669,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 17480
    },
    {
      "grad_norm": 0.9423713088035583,
      "learning_rate": 0.00019293115622132193,
      "loss": 0.3714,
      "step": 17490
    },
    {
      "gate_value": 0.2813046872615814,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 17490
    },
    {
      "grad_norm": 1.93399178981781,
      "learning_rate": 0.00019281231897294384,
      "loss": 0.3816,
      "step": 17500
    },
    {
      "gate_value": 0.28104689717292786,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 17500
    },
    {
      "grad_norm": 1.776007890701294,
      "learning_rate": 0.0001926934524627495,
      "loss": 0.3913,
      "step": 17510
    },
    {
      "gate_value": 0.28112155199050903,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 17510
    },
    {
      "grad_norm": 0.6974020600318909,
      "learning_rate": 0.00019257455677198286,
      "loss": 0.3891,
      "step": 17520
    },
    {
      "gate_value": 0.2809901237487793,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 17520
    },
    {
      "grad_norm": 1.8342015743255615,
      "learning_rate": 0.00019245563198190814,
      "loss": 0.3921,
      "step": 17530
    },
    {
      "gate_value": 0.2812676727771759,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 17530
    },
    {
      "grad_norm": 1.2698308229446411,
      "learning_rate": 0.00019233667817380933,
      "loss": 0.394,
      "step": 17540
    },
    {
      "gate_value": 0.2814946174621582,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 17540
    },
    {
      "grad_norm": 0.4935688376426697,
      "learning_rate": 0.0001922176954289902,
      "loss": 0.369,
      "step": 17550
    },
    {
      "gate_value": 0.2818371057510376,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 17550
    },
    {
      "grad_norm": 0.6632453799247742,
      "learning_rate": 0.00019209868382877437,
      "loss": 0.3677,
      "step": 17560
    },
    {
      "gate_value": 0.2822357714176178,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 17560
    },
    {
      "grad_norm": 0.8540254235267639,
      "learning_rate": 0.00019197964345450504,
      "loss": 0.3992,
      "step": 17570
    },
    {
      "gate_value": 0.28277909755706787,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 17570
    },
    {
      "grad_norm": 1.1772438287734985,
      "learning_rate": 0.00019186057438754525,
      "loss": 0.4001,
      "step": 17580
    },
    {
      "gate_value": 0.28297939896583557,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 17580
    },
    {
      "grad_norm": 98.65593719482422,
      "learning_rate": 0.00019174147670927765,
      "loss": 0.3855,
      "step": 17590
    },
    {
      "gate_value": 0.2828541100025177,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 17590
    },
    {
      "grad_norm": 1.0209541320800781,
      "learning_rate": 0.00019162235050110429,
      "loss": 0.3751,
      "step": 17600
    },
    {
      "gate_value": 0.2828318476676941,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 17600
    },
    {
      "grad_norm": 1.796211838722229,
      "learning_rate": 0.00019150319584444682,
      "loss": 0.3911,
      "step": 17610
    },
    {
      "gate_value": 0.28276461362838745,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 17610
    },
    {
      "grad_norm": 21.861438751220703,
      "learning_rate": 0.00019138401282074635,
      "loss": 0.3939,
      "step": 17620
    },
    {
      "gate_value": 0.2829860746860504,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 17620
    },
    {
      "grad_norm": 0.8495699763298035,
      "learning_rate": 0.00019126480151146334,
      "loss": 0.3733,
      "step": 17630
    },
    {
      "gate_value": 0.283086895942688,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 17630
    },
    {
      "grad_norm": 5.792306423187256,
      "learning_rate": 0.0001911455619980776,
      "loss": 0.3805,
      "step": 17640
    },
    {
      "gate_value": 0.2834136188030243,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 17640
    },
    {
      "grad_norm": 1.0493606328964233,
      "learning_rate": 0.00019102629436208824,
      "loss": 0.3987,
      "step": 17650
    },
    {
      "gate_value": 0.28334277868270874,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 17650
    },
    {
      "grad_norm": 1.9529502391815186,
      "learning_rate": 0.00019090699868501353,
      "loss": 0.3736,
      "step": 17660
    },
    {
      "gate_value": 0.28328362107276917,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 17660
    },
    {
      "grad_norm": 2.390843152999878,
      "learning_rate": 0.00019078767504839093,
      "loss": 0.3885,
      "step": 17670
    },
    {
      "gate_value": 0.2835651934146881,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 17670
    },
    {
      "grad_norm": 0.9919568300247192,
      "learning_rate": 0.000190668323533777,
      "loss": 0.3791,
      "step": 17680
    },
    {
      "gate_value": 0.28367140889167786,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 17680
    },
    {
      "grad_norm": 0.7665271759033203,
      "learning_rate": 0.00019054894422274754,
      "loss": 0.3778,
      "step": 17690
    },
    {
      "gate_value": 0.2842426002025604,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 17690
    },
    {
      "grad_norm": 1.6776630878448486,
      "learning_rate": 0.00019042953719689695,
      "loss": 0.394,
      "step": 17700
    },
    {
      "gate_value": 0.2848225235939026,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 17700
    },
    {
      "grad_norm": 1.3831422328948975,
      "learning_rate": 0.00019031010253783896,
      "loss": 0.3791,
      "step": 17710
    },
    {
      "gate_value": 0.28517720103263855,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 17710
    },
    {
      "grad_norm": 0.912163496017456,
      "learning_rate": 0.00019019064032720594,
      "loss": 0.3893,
      "step": 17720
    },
    {
      "gate_value": 0.2853257954120636,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 17720
    },
    {
      "grad_norm": 0.8194653391838074,
      "learning_rate": 0.00019007115064664922,
      "loss": 0.3826,
      "step": 17730
    },
    {
      "gate_value": 0.28528544306755066,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 17730
    },
    {
      "grad_norm": 1.4953533411026,
      "learning_rate": 0.00018995163357783898,
      "loss": 0.3654,
      "step": 17740
    },
    {
      "gate_value": 0.28522640466690063,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 17740
    },
    {
      "grad_norm": 1.0298045873641968,
      "learning_rate": 0.00018983208920246382,
      "loss": 0.3721,
      "step": 17750
    },
    {
      "gate_value": 0.28574541211128235,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 17750
    },
    {
      "grad_norm": 5.820982456207275,
      "learning_rate": 0.00018971251760223137,
      "loss": 0.3592,
      "step": 17760
    },
    {
      "gate_value": 0.28601688146591187,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 17760
    },
    {
      "grad_norm": 107.9212875366211,
      "learning_rate": 0.00018959291885886753,
      "loss": 0.4024,
      "step": 17770
    },
    {
      "gate_value": 0.28597700595855713,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 17770
    },
    {
      "grad_norm": 3.506815195083618,
      "learning_rate": 0.000189473293054117,
      "loss": 0.381,
      "step": 17780
    },
    {
      "gate_value": 0.2859663963317871,
      "icl_sequence_length": 58,
      "num_contexts": 3,
      "step": 17780
    },
    {
      "grad_norm": 1.2677297592163086,
      "learning_rate": 0.00018935364026974292,
      "loss": 0.3859,
      "step": 17790
    },
    {
      "gate_value": 0.28602340817451477,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 17790
    },
    {
      "grad_norm": 0.8437561988830566,
      "learning_rate": 0.00018923396058752673,
      "loss": 0.3775,
      "step": 17800
    },
    {
      "gate_value": 0.28563985228538513,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 17800
    },
    {
      "grad_norm": 1.6269166469573975,
      "learning_rate": 0.0001891142540892685,
      "loss": 0.4039,
      "step": 17810
    },
    {
      "gate_value": 0.28554731607437134,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 17810
    },
    {
      "grad_norm": 1.5911264419555664,
      "learning_rate": 0.00018899452085678639,
      "loss": 0.3821,
      "step": 17820
    },
    {
      "gate_value": 0.28552672266960144,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 17820
    },
    {
      "grad_norm": 1.7108643054962158,
      "learning_rate": 0.00018887476097191697,
      "loss": 0.3847,
      "step": 17830
    },
    {
      "gate_value": 0.2859024405479431,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 17830
    },
    {
      "grad_norm": 3.113037347793579,
      "learning_rate": 0.00018875497451651503,
      "loss": 0.3676,
      "step": 17840
    },
    {
      "gate_value": 0.28588712215423584,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 17840
    },
    {
      "grad_norm": 0.8272459506988525,
      "learning_rate": 0.00018863516157245337,
      "loss": 0.3906,
      "step": 17850
    },
    {
      "gate_value": 0.2860240936279297,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 17850
    },
    {
      "grad_norm": 3.2408292293548584,
      "learning_rate": 0.00018851532222162316,
      "loss": 0.3608,
      "step": 17860
    },
    {
      "gate_value": 0.28605082631111145,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 17860
    },
    {
      "grad_norm": 0.8264132142066956,
      "learning_rate": 0.00018839545654593336,
      "loss": 0.3636,
      "step": 17870
    },
    {
      "gate_value": 0.28686094284057617,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 17870
    },
    {
      "grad_norm": 0.7240784764289856,
      "learning_rate": 0.00018827556462731103,
      "loss": 0.4067,
      "step": 17880
    },
    {
      "gate_value": 0.28730401396751404,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 17880
    },
    {
      "grad_norm": 0.634745180606842,
      "learning_rate": 0.00018815564654770125,
      "loss": 0.3767,
      "step": 17890
    },
    {
      "gate_value": 0.28744029998779297,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 17890
    },
    {
      "grad_norm": 5.643700122833252,
      "learning_rate": 0.0001880357023890668,
      "loss": 0.38,
      "step": 17900
    },
    {
      "gate_value": 0.28771013021469116,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 17900
    },
    {
      "grad_norm": 1.5269558429718018,
      "learning_rate": 0.00018791573223338843,
      "loss": 0.4023,
      "step": 17910
    },
    {
      "gate_value": 0.28753605484962463,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 17910
    },
    {
      "grad_norm": 0.5712957978248596,
      "learning_rate": 0.00018779573616266461,
      "loss": 0.4074,
      "step": 17920
    },
    {
      "gate_value": 0.2872239947319031,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 17920
    },
    {
      "grad_norm": 1.1614340543746948,
      "learning_rate": 0.0001876757142589115,
      "loss": 0.3899,
      "step": 17930
    },
    {
      "gate_value": 0.2880617678165436,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 17930
    },
    {
      "grad_norm": 3.2989726066589355,
      "learning_rate": 0.000187555666604163,
      "loss": 0.3554,
      "step": 17940
    },
    {
      "gate_value": 0.2882530987262726,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 17940
    },
    {
      "grad_norm": 0.9225475788116455,
      "learning_rate": 0.00018743559328047044,
      "loss": 0.3769,
      "step": 17950
    },
    {
      "gate_value": 0.2888268828392029,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 17950
    },
    {
      "grad_norm": 3.0278470516204834,
      "learning_rate": 0.00018731549436990292,
      "loss": 0.3699,
      "step": 17960
    },
    {
      "gate_value": 0.2888841927051544,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 17960
    },
    {
      "grad_norm": 1.4872838258743286,
      "learning_rate": 0.00018719536995454684,
      "loss": 0.3831,
      "step": 17970
    },
    {
      "gate_value": 0.28916996717453003,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 17970
    },
    {
      "grad_norm": 1.2059917449951172,
      "learning_rate": 0.0001870752201165061,
      "loss": 0.392,
      "step": 17980
    },
    {
      "gate_value": 0.2897283434867859,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 17980
    },
    {
      "grad_norm": 1.9197864532470703,
      "learning_rate": 0.00018695504493790207,
      "loss": 0.3979,
      "step": 17990
    },
    {
      "gate_value": 0.289908230304718,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 17990
    },
    {
      "grad_norm": 5.284653663635254,
      "learning_rate": 0.00018683484450087324,
      "loss": 0.3796,
      "step": 18000
    },
    {
      "gate_value": 0.2897275686264038,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 18000
    },
    {
      "grad_norm": 1.1138560771942139,
      "learning_rate": 0.00018671461888757556,
      "loss": 0.3793,
      "step": 18010
    },
    {
      "gate_value": 0.2901041805744171,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 18010
    },
    {
      "grad_norm": 1.5257152318954468,
      "learning_rate": 0.00018659436818018208,
      "loss": 0.3853,
      "step": 18020
    },
    {
      "gate_value": 0.28984254598617554,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 18020
    },
    {
      "grad_norm": 0.8911436200141907,
      "learning_rate": 0.00018647409246088298,
      "loss": 0.3775,
      "step": 18030
    },
    {
      "gate_value": 0.28980550169944763,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 18030
    },
    {
      "grad_norm": 0.6340241432189941,
      "learning_rate": 0.0001863537918118856,
      "loss": 0.392,
      "step": 18040
    },
    {
      "gate_value": 0.2897473871707916,
      "icl_sequence_length": 56,
      "num_contexts": 3,
      "step": 18040
    },
    {
      "grad_norm": 0.7583088278770447,
      "learning_rate": 0.00018623346631541432,
      "loss": 0.3822,
      "step": 18050
    },
    {
      "gate_value": 0.2898584306240082,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 18050
    },
    {
      "grad_norm": 2.2231905460357666,
      "learning_rate": 0.00018611311605371046,
      "loss": 0.3921,
      "step": 18060
    },
    {
      "gate_value": 0.2902577817440033,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 18060
    },
    {
      "grad_norm": 6.011120796203613,
      "learning_rate": 0.00018599274110903238,
      "loss": 0.3943,
      "step": 18070
    },
    {
      "gate_value": 0.29029032588005066,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 18070
    },
    {
      "grad_norm": 1.924442172050476,
      "learning_rate": 0.00018587234156365506,
      "loss": 0.3914,
      "step": 18080
    },
    {
      "gate_value": 0.2905459702014923,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 18080
    },
    {
      "grad_norm": 8.414761543273926,
      "learning_rate": 0.0001857519174998706,
      "loss": 0.3703,
      "step": 18090
    },
    {
      "gate_value": 0.2910146415233612,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 18090
    },
    {
      "grad_norm": 1.9152319431304932,
      "learning_rate": 0.00018563146899998762,
      "loss": 0.3739,
      "step": 18100
    },
    {
      "gate_value": 0.29149147868156433,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 18100
    },
    {
      "grad_norm": 2.153871536254883,
      "learning_rate": 0.00018551099614633155,
      "loss": 0.3723,
      "step": 18110
    },
    {
      "gate_value": 0.2919636070728302,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 18110
    },
    {
      "grad_norm": 0.6491678357124329,
      "learning_rate": 0.0001853904990212445,
      "loss": 0.3747,
      "step": 18120
    },
    {
      "gate_value": 0.29191774129867554,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 18120
    },
    {
      "grad_norm": 0.706830620765686,
      "learning_rate": 0.00018526997770708506,
      "loss": 0.3734,
      "step": 18130
    },
    {
      "gate_value": 0.29252272844314575,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 18130
    },
    {
      "grad_norm": 4.9841389656066895,
      "learning_rate": 0.00018514943228622842,
      "loss": 0.3879,
      "step": 18140
    },
    {
      "gate_value": 0.2927410900592804,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 18140
    },
    {
      "grad_norm": 1.3059637546539307,
      "learning_rate": 0.00018502886284106623,
      "loss": 0.4032,
      "step": 18150
    },
    {
      "gate_value": 0.29293057322502136,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 18150
    },
    {
      "grad_norm": 1.498307466506958,
      "learning_rate": 0.00018490826945400662,
      "loss": 0.3856,
      "step": 18160
    },
    {
      "gate_value": 0.2928726375102997,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 18160
    },
    {
      "grad_norm": 5.742775917053223,
      "learning_rate": 0.00018478765220747407,
      "loss": 0.3816,
      "step": 18170
    },
    {
      "gate_value": 0.2927343249320984,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 18170
    },
    {
      "grad_norm": 1.2477624416351318,
      "learning_rate": 0.00018466701118390914,
      "loss": 0.3687,
      "step": 18180
    },
    {
      "gate_value": 0.29233211278915405,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 18180
    },
    {
      "grad_norm": 0.7571000456809998,
      "learning_rate": 0.00018454634646576906,
      "loss": 0.3849,
      "step": 18190
    },
    {
      "gate_value": 0.29229992628097534,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 18190
    },
    {
      "grad_norm": 0.6805962324142456,
      "learning_rate": 0.00018442565813552684,
      "loss": 0.3817,
      "step": 18200
    },
    {
      "gate_value": 0.2926273047924042,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 18200
    },
    {
      "grad_norm": 1.1327909231185913,
      "learning_rate": 0.00018430494627567196,
      "loss": 0.3911,
      "step": 18210
    },
    {
      "gate_value": 0.2928312122821808,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 18210
    },
    {
      "grad_norm": 1.686699628829956,
      "learning_rate": 0.00018418421096870978,
      "loss": 0.3905,
      "step": 18220
    },
    {
      "gate_value": 0.293545126914978,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 18220
    },
    {
      "grad_norm": 1.268710732460022,
      "learning_rate": 0.00018406345229716168,
      "loss": 0.3821,
      "step": 18230
    },
    {
      "gate_value": 0.29413795471191406,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 18230
    },
    {
      "grad_norm": 1.1455614566802979,
      "learning_rate": 0.00018394267034356517,
      "loss": 0.3903,
      "step": 18240
    },
    {
      "gate_value": 0.29432111978530884,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 18240
    },
    {
      "grad_norm": 1.1787275075912476,
      "learning_rate": 0.00018382186519047357,
      "loss": 0.3849,
      "step": 18250
    },
    {
      "gate_value": 0.2940574586391449,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 18250
    },
    {
      "grad_norm": 1.0190541744232178,
      "learning_rate": 0.00018370103692045596,
      "loss": 0.3869,
      "step": 18260
    },
    {
      "gate_value": 0.29367583990097046,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 18260
    },
    {
      "grad_norm": 1.1894299983978271,
      "learning_rate": 0.00018358018561609747,
      "loss": 0.3754,
      "step": 18270
    },
    {
      "gate_value": 0.2937644422054291,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 18270
    },
    {
      "grad_norm": 0.9370589256286621,
      "learning_rate": 0.0001834593113599987,
      "loss": 0.3893,
      "step": 18280
    },
    {
      "gate_value": 0.2938789129257202,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 18280
    },
    {
      "grad_norm": 2.4182047843933105,
      "learning_rate": 0.00018333841423477619,
      "loss": 0.4115,
      "step": 18290
    },
    {
      "gate_value": 0.29393965005874634,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 18290
    },
    {
      "grad_norm": 1.9963648319244385,
      "learning_rate": 0.00018321749432306184,
      "loss": 0.3845,
      "step": 18300
    },
    {
      "gate_value": 0.2939322292804718,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 18300
    },
    {
      "grad_norm": 1.7127970457077026,
      "learning_rate": 0.00018309655170750336,
      "loss": 0.3751,
      "step": 18310
    },
    {
      "gate_value": 0.2943023443222046,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 18310
    },
    {
      "grad_norm": 13.848799705505371,
      "learning_rate": 0.0001829755864707639,
      "loss": 0.3866,
      "step": 18320
    },
    {
      "gate_value": 0.2945227324962616,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 18320
    },
    {
      "grad_norm": 0.6617933511734009,
      "learning_rate": 0.00018285459869552199,
      "loss": 0.4022,
      "step": 18330
    },
    {
      "gate_value": 0.2949317991733551,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 18330
    },
    {
      "grad_norm": 3.9121739864349365,
      "learning_rate": 0.00018273358846447168,
      "loss": 0.3735,
      "step": 18340
    },
    {
      "gate_value": 0.29527369141578674,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 18340
    },
    {
      "grad_norm": 5.1793036460876465,
      "learning_rate": 0.00018261255586032234,
      "loss": 0.4012,
      "step": 18350
    },
    {
      "gate_value": 0.295279860496521,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 18350
    },
    {
      "grad_norm": 12.034460067749023,
      "learning_rate": 0.00018249150096579856,
      "loss": 0.3865,
      "step": 18360
    },
    {
      "gate_value": 0.29541224241256714,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 18360
    },
    {
      "grad_norm": 0.7827606797218323,
      "learning_rate": 0.00018237042386364026,
      "loss": 0.376,
      "step": 18370
    },
    {
      "gate_value": 0.295558899641037,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 18370
    },
    {
      "grad_norm": 1.4565706253051758,
      "learning_rate": 0.00018224932463660245,
      "loss": 0.3713,
      "step": 18380
    },
    {
      "gate_value": 0.29634296894073486,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 18380
    },
    {
      "grad_norm": 0.9638983607292175,
      "learning_rate": 0.0001821282033674554,
      "loss": 0.371,
      "step": 18390
    },
    {
      "gate_value": 0.2964816987514496,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 18390
    },
    {
      "grad_norm": 9.79422664642334,
      "learning_rate": 0.0001820070601389843,
      "loss": 0.3794,
      "step": 18400
    },
    {
      "gate_value": 0.29639384150505066,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 18400
    },
    {
      "grad_norm": 2.494586944580078,
      "learning_rate": 0.00018188589503398937,
      "loss": 0.3883,
      "step": 18410
    },
    {
      "gate_value": 0.296492338180542,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 18410
    },
    {
      "grad_norm": 0.950502872467041,
      "learning_rate": 0.00018176470813528585,
      "loss": 0.3799,
      "step": 18420
    },
    {
      "gate_value": 0.29657965898513794,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 18420
    },
    {
      "grad_norm": 1.4491907358169556,
      "learning_rate": 0.0001816434995257038,
      "loss": 0.3954,
      "step": 18430
    },
    {
      "gate_value": 0.2969636619091034,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 18430
    },
    {
      "grad_norm": 1.5316132307052612,
      "learning_rate": 0.0001815222692880883,
      "loss": 0.3973,
      "step": 18440
    },
    {
      "gate_value": 0.29703307151794434,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 18440
    },
    {
      "grad_norm": 1.4191664457321167,
      "learning_rate": 0.00018140101750529895,
      "loss": 0.3792,
      "step": 18450
    },
    {
      "gate_value": 0.2971894145011902,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 18450
    },
    {
      "grad_norm": 1.043149709701538,
      "learning_rate": 0.0001812797442602102,
      "loss": 0.3817,
      "step": 18460
    },
    {
      "gate_value": 0.297396183013916,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 18460
    },
    {
      "grad_norm": 4.110941410064697,
      "learning_rate": 0.0001811584496357112,
      "loss": 0.3768,
      "step": 18470
    },
    {
      "gate_value": 0.2977362275123596,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 18470
    },
    {
      "grad_norm": 0.7887822985649109,
      "learning_rate": 0.00018103713371470564,
      "loss": 0.388,
      "step": 18480
    },
    {
      "gate_value": 0.2975766360759735,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 18480
    },
    {
      "grad_norm": 6.283224582672119,
      "learning_rate": 0.00018091579658011196,
      "loss": 0.3834,
      "step": 18490
    },
    {
      "gate_value": 0.29731979966163635,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 18490
    },
    {
      "grad_norm": 3.4314112663269043,
      "learning_rate": 0.00018079443831486275,
      "loss": 0.3889,
      "step": 18500
    },
    {
      "gate_value": 0.2973383963108063,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 18500
    },
    {
      "grad_norm": 3.1021149158477783,
      "learning_rate": 0.00018067305900190534,
      "loss": 0.3863,
      "step": 18510
    },
    {
      "gate_value": 0.2972663938999176,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 18510
    },
    {
      "grad_norm": 0.9524483680725098,
      "learning_rate": 0.00018055165872420137,
      "loss": 0.3853,
      "step": 18520
    },
    {
      "gate_value": 0.2974157929420471,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 18520
    },
    {
      "grad_norm": 8.135797500610352,
      "learning_rate": 0.0001804302375647267,
      "loss": 0.3788,
      "step": 18530
    },
    {
      "gate_value": 0.2975497543811798,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 18530
    },
    {
      "grad_norm": 0.8587509393692017,
      "learning_rate": 0.00018030879560647164,
      "loss": 0.3761,
      "step": 18540
    },
    {
      "gate_value": 0.2978472411632538,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 18540
    },
    {
      "grad_norm": 3.4958364963531494,
      "learning_rate": 0.00018018733293244054,
      "loss": 0.3779,
      "step": 18550
    },
    {
      "gate_value": 0.2981078624725342,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 18550
    },
    {
      "grad_norm": 0.891386091709137,
      "learning_rate": 0.00018006584962565204,
      "loss": 0.3635,
      "step": 18560
    },
    {
      "gate_value": 0.29821139574050903,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 18560
    },
    {
      "grad_norm": 4.393123149871826,
      "learning_rate": 0.00017994434576913882,
      "loss": 0.3784,
      "step": 18570
    },
    {
      "gate_value": 0.29809004068374634,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 18570
    },
    {
      "grad_norm": 1.5946396589279175,
      "learning_rate": 0.00017982282144594767,
      "loss": 0.3702,
      "step": 18580
    },
    {
      "gate_value": 0.29804927110671997,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 18580
    },
    {
      "grad_norm": 0.6159313917160034,
      "learning_rate": 0.0001797012767391392,
      "loss": 0.3688,
      "step": 18590
    },
    {
      "gate_value": 0.29824721813201904,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 18590
    },
    {
      "grad_norm": 1.7293890714645386,
      "learning_rate": 0.0001795797117317882,
      "loss": 0.3851,
      "step": 18600
    },
    {
      "gate_value": 0.2984820604324341,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 18600
    },
    {
      "grad_norm": 39.279598236083984,
      "learning_rate": 0.0001794581265069831,
      "loss": 0.3844,
      "step": 18610
    },
    {
      "gate_value": 0.2984519302845001,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 18610
    },
    {
      "grad_norm": 2.361945152282715,
      "learning_rate": 0.00017933652114782636,
      "loss": 0.3984,
      "step": 18620
    },
    {
      "gate_value": 0.2985188663005829,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 18620
    },
    {
      "grad_norm": 11.486175537109375,
      "learning_rate": 0.00017921489573743404,
      "loss": 0.384,
      "step": 18630
    },
    {
      "gate_value": 0.29849180579185486,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 18630
    },
    {
      "grad_norm": 2.7124886512756348,
      "learning_rate": 0.0001790932503589359,
      "loss": 0.3773,
      "step": 18640
    },
    {
      "gate_value": 0.29862406849861145,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 18640
    },
    {
      "grad_norm": 14.254828453063965,
      "learning_rate": 0.00017897158509547556,
      "loss": 0.3846,
      "step": 18650
    },
    {
      "gate_value": 0.2986295521259308,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 18650
    },
    {
      "grad_norm": 2.347095012664795,
      "learning_rate": 0.00017884990003020991,
      "loss": 0.3816,
      "step": 18660
    },
    {
      "gate_value": 0.2986813187599182,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 18660
    },
    {
      "grad_norm": 1.8625816106796265,
      "learning_rate": 0.0001787281952463097,
      "loss": 0.3963,
      "step": 18670
    },
    {
      "gate_value": 0.29919448494911194,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 18670
    },
    {
      "grad_norm": 1.2128583192825317,
      "learning_rate": 0.0001786064708269589,
      "loss": 0.4016,
      "step": 18680
    },
    {
      "gate_value": 0.299286812543869,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 18680
    },
    {
      "grad_norm": 1.9459328651428223,
      "learning_rate": 0.000178484726855355,
      "loss": 0.3903,
      "step": 18690
    },
    {
      "gate_value": 0.29897111654281616,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 18690
    },
    {
      "grad_norm": 2.7386348247528076,
      "learning_rate": 0.00017836296341470896,
      "loss": 0.3939,
      "step": 18700
    },
    {
      "gate_value": 0.29894396662712097,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 18700
    },
    {
      "grad_norm": 5.737401485443115,
      "learning_rate": 0.00017824118058824481,
      "loss": 0.3728,
      "step": 18710
    },
    {
      "gate_value": 0.2992154657840729,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 18710
    },
    {
      "grad_norm": 2.06498384475708,
      "learning_rate": 0.00017811937845920006,
      "loss": 0.3961,
      "step": 18720
    },
    {
      "gate_value": 0.2994868755340576,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 18720
    },
    {
      "grad_norm": 1.8841884136199951,
      "learning_rate": 0.0001779975571108253,
      "loss": 0.3934,
      "step": 18730
    },
    {
      "gate_value": 0.2994190454483032,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 18730
    },
    {
      "grad_norm": 9.062392234802246,
      "learning_rate": 0.00017787571662638418,
      "loss": 0.3905,
      "step": 18740
    },
    {
      "gate_value": 0.29949483275413513,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 18740
    },
    {
      "grad_norm": 13.06369400024414,
      "learning_rate": 0.00017775385708915367,
      "loss": 0.3656,
      "step": 18750
    },
    {
      "gate_value": 0.2998977303504944,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 18750
    },
    {
      "grad_norm": 0.7034562230110168,
      "learning_rate": 0.00017763197858242352,
      "loss": 0.3782,
      "step": 18760
    },
    {
      "gate_value": 0.3001224398612976,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 18760
    },
    {
      "grad_norm": 5.080331802368164,
      "learning_rate": 0.00017751008118949653,
      "loss": 0.3833,
      "step": 18770
    },
    {
      "gate_value": 0.30042609572410583,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 18770
    },
    {
      "grad_norm": 0.9425305724143982,
      "learning_rate": 0.00017738816499368853,
      "loss": 0.3898,
      "step": 18780
    },
    {
      "gate_value": 0.3005761206150055,
      "icl_sequence_length": 58,
      "num_contexts": 3,
      "step": 18780
    },
    {
      "grad_norm": 1.4428623914718628,
      "learning_rate": 0.00017726623007832795,
      "loss": 0.3729,
      "step": 18790
    },
    {
      "gate_value": 0.3003407418727875,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 18790
    },
    {
      "grad_norm": 1.1261143684387207,
      "learning_rate": 0.00017714427652675626,
      "loss": 0.377,
      "step": 18800
    },
    {
      "gate_value": 0.3002099096775055,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 18800
    },
    {
      "grad_norm": 3.563581705093384,
      "learning_rate": 0.00017702230442232747,
      "loss": 0.3857,
      "step": 18810
    },
    {
      "gate_value": 0.30062517523765564,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 18810
    },
    {
      "grad_norm": 1.9583052396774292,
      "learning_rate": 0.0001769003138484084,
      "loss": 0.3667,
      "step": 18820
    },
    {
      "gate_value": 0.30108368396759033,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 18820
    },
    {
      "grad_norm": 7.607995510101318,
      "learning_rate": 0.00017677830488837854,
      "loss": 0.3907,
      "step": 18830
    },
    {
      "gate_value": 0.3014366328716278,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 18830
    },
    {
      "grad_norm": 2.672945261001587,
      "learning_rate": 0.00017665627762562973,
      "loss": 0.375,
      "step": 18840
    },
    {
      "gate_value": 0.3011823296546936,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 18840
    },
    {
      "grad_norm": 2.078075408935547,
      "learning_rate": 0.00017653423214356655,
      "loss": 0.3736,
      "step": 18850
    },
    {
      "gate_value": 0.30117878317832947,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 18850
    },
    {
      "grad_norm": 2.0478668212890625,
      "learning_rate": 0.00017641216852560594,
      "loss": 0.3922,
      "step": 18860
    },
    {
      "gate_value": 0.3012368977069855,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 18860
    },
    {
      "grad_norm": 3.1022074222564697,
      "learning_rate": 0.0001762900868551771,
      "loss": 0.3634,
      "step": 18870
    },
    {
      "gate_value": 0.30124127864837646,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 18870
    },
    {
      "grad_norm": 2.3055784702301025,
      "learning_rate": 0.00017616798721572185,
      "loss": 0.3762,
      "step": 18880
    },
    {
      "gate_value": 0.30172550678253174,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 18880
    },
    {
      "grad_norm": 1.4114841222763062,
      "learning_rate": 0.00017604586969069408,
      "loss": 0.3907,
      "step": 18890
    },
    {
      "gate_value": 0.3020336627960205,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 18890
    },
    {
      "grad_norm": 1.7120617628097534,
      "learning_rate": 0.00017592373436355998,
      "loss": 0.3888,
      "step": 18900
    },
    {
      "gate_value": 0.30191630125045776,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 18900
    },
    {
      "grad_norm": 2.659731388092041,
      "learning_rate": 0.00017580158131779791,
      "loss": 0.3621,
      "step": 18910
    },
    {
      "gate_value": 0.30202534794807434,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 18910
    },
    {
      "grad_norm": 2.554063320159912,
      "learning_rate": 0.00017567941063689827,
      "loss": 0.3951,
      "step": 18920
    },
    {
      "gate_value": 0.30230647325515747,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 18920
    },
    {
      "grad_norm": 1.8052372932434082,
      "learning_rate": 0.0001755572224043636,
      "loss": 0.3651,
      "step": 18930
    },
    {
      "gate_value": 0.3022986054420471,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 18930
    },
    {
      "grad_norm": 1.0668402910232544,
      "learning_rate": 0.0001754350167037084,
      "loss": 0.3835,
      "step": 18940
    },
    {
      "gate_value": 0.30235445499420166,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 18940
    },
    {
      "grad_norm": 1.3296093940734863,
      "learning_rate": 0.0001753127936184592,
      "loss": 0.376,
      "step": 18950
    },
    {
      "gate_value": 0.30267301201820374,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 18950
    },
    {
      "grad_norm": 1.3229838609695435,
      "learning_rate": 0.00017519055323215416,
      "loss": 0.3851,
      "step": 18960
    },
    {
      "gate_value": 0.3031296730041504,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 18960
    },
    {
      "grad_norm": 4.205599308013916,
      "learning_rate": 0.0001750682956283435,
      "loss": 0.3895,
      "step": 18970
    },
    {
      "gate_value": 0.3030919134616852,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 18970
    },
    {
      "grad_norm": 1.8597735166549683,
      "learning_rate": 0.00017494602089058924,
      "loss": 0.377,
      "step": 18980
    },
    {
      "gate_value": 0.3035913407802582,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 18980
    },
    {
      "grad_norm": 27.68014907836914,
      "learning_rate": 0.00017482372910246487,
      "loss": 0.3806,
      "step": 18990
    },
    {
      "gate_value": 0.30397024750709534,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 18990
    },
    {
      "grad_norm": 1.0020619630813599,
      "learning_rate": 0.0001747014203475558,
      "loss": 0.3818,
      "step": 19000
    },
    {
      "gate_value": 0.3041447699069977,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 19000
    },
    {
      "grad_norm": 1.5626013278961182,
      "learning_rate": 0.00017457909470945876,
      "loss": 0.3674,
      "step": 19010
    },
    {
      "gate_value": 0.3043736517429352,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 19010
    },
    {
      "grad_norm": 2.424412965774536,
      "learning_rate": 0.00017445675227178227,
      "loss": 0.373,
      "step": 19020
    },
    {
      "gate_value": 0.30459895730018616,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 19020
    },
    {
      "grad_norm": 1.1943349838256836,
      "learning_rate": 0.00017433439311814627,
      "loss": 0.3713,
      "step": 19030
    },
    {
      "gate_value": 0.30527937412261963,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 19030
    },
    {
      "grad_norm": 1.907333254814148,
      "learning_rate": 0.00017421201733218195,
      "loss": 0.3864,
      "step": 19040
    },
    {
      "gate_value": 0.305605947971344,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 19040
    },
    {
      "grad_norm": 5.313255310058594,
      "learning_rate": 0.00017408962499753218,
      "loss": 0.3802,
      "step": 19050
    },
    {
      "gate_value": 0.3056756556034088,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 19050
    },
    {
      "grad_norm": 0.7734915018081665,
      "learning_rate": 0.00017396721619785085,
      "loss": 0.3919,
      "step": 19060
    },
    {
      "gate_value": 0.305307537317276,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 19060
    },
    {
      "grad_norm": 1.0982208251953125,
      "learning_rate": 0.00017384479101680318,
      "loss": 0.3778,
      "step": 19070
    },
    {
      "gate_value": 0.30533578991889954,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 19070
    },
    {
      "grad_norm": 1.1880580186843872,
      "learning_rate": 0.00017372234953806577,
      "loss": 0.3613,
      "step": 19080
    },
    {
      "gate_value": 0.30517280101776123,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 19080
    },
    {
      "grad_norm": 3.1580140590667725,
      "learning_rate": 0.00017359989184532614,
      "loss": 0.3745,
      "step": 19090
    },
    {
      "gate_value": 0.3055513799190521,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 19090
    },
    {
      "grad_norm": 0.8944216370582581,
      "learning_rate": 0.00017347741802228292,
      "loss": 0.3741,
      "step": 19100
    },
    {
      "gate_value": 0.3054380714893341,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 19100
    },
    {
      "grad_norm": 3.154456377029419,
      "learning_rate": 0.00017335492815264588,
      "loss": 0.3932,
      "step": 19110
    },
    {
      "gate_value": 0.30571210384368896,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 19110
    },
    {
      "grad_norm": 1.052626132965088,
      "learning_rate": 0.00017323242232013562,
      "loss": 0.3689,
      "step": 19120
    },
    {
      "gate_value": 0.3059786856174469,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 19120
    },
    {
      "grad_norm": 2.3785758018493652,
      "learning_rate": 0.00017310990060848385,
      "loss": 0.3836,
      "step": 19130
    },
    {
      "gate_value": 0.30624398589134216,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 19130
    },
    {
      "grad_norm": 1.4953151941299438,
      "learning_rate": 0.00017298736310143292,
      "loss": 0.3721,
      "step": 19140
    },
    {
      "gate_value": 0.3064724802970886,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 19140
    },
    {
      "grad_norm": 0.9571640491485596,
      "learning_rate": 0.00017286480988273607,
      "loss": 0.378,
      "step": 19150
    },
    {
      "gate_value": 0.3065114915370941,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 19150
    },
    {
      "grad_norm": 2.1345882415771484,
      "learning_rate": 0.00017274224103615721,
      "loss": 0.3851,
      "step": 19160
    },
    {
      "gate_value": 0.30675628781318665,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 19160
    },
    {
      "grad_norm": 5.007328033447266,
      "learning_rate": 0.0001726196566454711,
      "loss": 0.3687,
      "step": 19170
    },
    {
      "gate_value": 0.3067236840724945,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 19170
    },
    {
      "grad_norm": 2.5974600315093994,
      "learning_rate": 0.00017249705679446296,
      "loss": 0.362,
      "step": 19180
    },
    {
      "gate_value": 0.3066827654838562,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 19180
    },
    {
      "grad_norm": 1.0741844177246094,
      "learning_rate": 0.0001723744415669286,
      "loss": 0.3822,
      "step": 19190
    },
    {
      "gate_value": 0.3067505657672882,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 19190
    },
    {
      "grad_norm": 0.9603002667427063,
      "learning_rate": 0.00017225181104667446,
      "loss": 0.3975,
      "step": 19200
    },
    {
      "gate_value": 0.30703407526016235,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 19200
    },
    {
      "grad_norm": 8.882355690002441,
      "learning_rate": 0.00017212916531751725,
      "loss": 0.3815,
      "step": 19210
    },
    {
      "gate_value": 0.30765148997306824,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 19210
    },
    {
      "grad_norm": 1.7542152404785156,
      "learning_rate": 0.00017200650446328418,
      "loss": 0.3865,
      "step": 19220
    },
    {
      "gate_value": 0.3078744113445282,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 19220
    },
    {
      "grad_norm": 0.6525892019271851,
      "learning_rate": 0.00017188382856781292,
      "loss": 0.3764,
      "step": 19230
    },
    {
      "gate_value": 0.3077229857444763,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 19230
    },
    {
      "grad_norm": 10.071258544921875,
      "learning_rate": 0.0001717611377149511,
      "loss": 0.3827,
      "step": 19240
    },
    {
      "gate_value": 0.3076436221599579,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 19240
    },
    {
      "grad_norm": 0.9834297895431519,
      "learning_rate": 0.00017163843198855685,
      "loss": 0.3681,
      "step": 19250
    },
    {
      "gate_value": 0.3078278601169586,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 19250
    },
    {
      "grad_norm": 1.0656040906906128,
      "learning_rate": 0.00017151571147249844,
      "loss": 0.3908,
      "step": 19260
    },
    {
      "gate_value": 0.3073131740093231,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 19260
    },
    {
      "grad_norm": 1.851256012916565,
      "learning_rate": 0.00017139297625065402,
      "loss": 0.3864,
      "step": 19270
    },
    {
      "gate_value": 0.3072352111339569,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 19270
    },
    {
      "grad_norm": 2.0325307846069336,
      "learning_rate": 0.00017127022640691218,
      "loss": 0.4071,
      "step": 19280
    },
    {
      "gate_value": 0.30702707171440125,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 19280
    },
    {
      "grad_norm": 11.795533180236816,
      "learning_rate": 0.0001711474620251711,
      "loss": 0.3894,
      "step": 19290
    },
    {
      "gate_value": 0.3068554997444153,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 19290
    },
    {
      "grad_norm": 2.296201467514038,
      "learning_rate": 0.0001710246831893391,
      "loss": 0.3815,
      "step": 19300
    },
    {
      "gate_value": 0.3070986568927765,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 19300
    },
    {
      "grad_norm": 3.2662224769592285,
      "learning_rate": 0.00017090188998333442,
      "loss": 0.3754,
      "step": 19310
    },
    {
      "gate_value": 0.30730971693992615,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 19310
    },
    {
      "grad_norm": 1.9833241701126099,
      "learning_rate": 0.000170779082491085,
      "loss": 0.3747,
      "step": 19320
    },
    {
      "gate_value": 0.3075919449329376,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 19320
    },
    {
      "grad_norm": 3.4221928119659424,
      "learning_rate": 0.00017065626079652873,
      "loss": 0.3738,
      "step": 19330
    },
    {
      "gate_value": 0.30789434909820557,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 19330
    },
    {
      "grad_norm": 2.5420238971710205,
      "learning_rate": 0.00017053342498361286,
      "loss": 0.379,
      "step": 19340
    },
    {
      "gate_value": 0.3079376518726349,
      "icl_sequence_length": 58,
      "num_contexts": 3,
      "step": 19340
    },
    {
      "grad_norm": 28.063114166259766,
      "learning_rate": 0.00017041057513629467,
      "loss": 0.3751,
      "step": 19350
    },
    {
      "gate_value": 0.30805227160453796,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 19350
    },
    {
      "grad_norm": 1.6978594064712524,
      "learning_rate": 0.00017028771133854086,
      "loss": 0.4035,
      "step": 19360
    },
    {
      "gate_value": 0.3074844181537628,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 19360
    },
    {
      "grad_norm": 0.8512712121009827,
      "learning_rate": 0.00017016483367432767,
      "loss": 0.378,
      "step": 19370
    },
    {
      "gate_value": 0.3074425458908081,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 19370
    },
    {
      "grad_norm": 1.114200472831726,
      "learning_rate": 0.00017004194222764075,
      "loss": 0.3955,
      "step": 19380
    },
    {
      "gate_value": 0.307817280292511,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 19380
    },
    {
      "grad_norm": 1.6457468271255493,
      "learning_rate": 0.00016991903708247534,
      "loss": 0.3736,
      "step": 19390
    },
    {
      "gate_value": 0.3081192076206207,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 19390
    },
    {
      "grad_norm": 1.3923431634902954,
      "learning_rate": 0.00016979611832283588,
      "loss": 0.3708,
      "step": 19400
    },
    {
      "gate_value": 0.3083207905292511,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 19400
    },
    {
      "grad_norm": 0.729960560798645,
      "learning_rate": 0.00016967318603273624,
      "loss": 0.3699,
      "step": 19410
    },
    {
      "gate_value": 0.3086252808570862,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 19410
    },
    {
      "grad_norm": 2.3151090145111084,
      "learning_rate": 0.00016955024029619944,
      "loss": 0.3798,
      "step": 19420
    },
    {
      "gate_value": 0.30867984890937805,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 19420
    },
    {
      "grad_norm": 2.409742593765259,
      "learning_rate": 0.00016942728119725777,
      "loss": 0.3731,
      "step": 19430
    },
    {
      "gate_value": 0.30868929624557495,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 19430
    },
    {
      "grad_norm": 9.685903549194336,
      "learning_rate": 0.00016930430881995255,
      "loss": 0.38,
      "step": 19440
    },
    {
      "gate_value": 0.3084772229194641,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 19440
    },
    {
      "grad_norm": 3.204035758972168,
      "learning_rate": 0.0001691813232483343,
      "loss": 0.3808,
      "step": 19450
    },
    {
      "gate_value": 0.30828583240509033,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 19450
    },
    {
      "grad_norm": 1.6053801774978638,
      "learning_rate": 0.0001690583245664625,
      "loss": 0.391,
      "step": 19460
    },
    {
      "gate_value": 0.30858343839645386,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 19460
    },
    {
      "grad_norm": 0.9320996999740601,
      "learning_rate": 0.00016893531285840555,
      "loss": 0.3744,
      "step": 19470
    },
    {
      "gate_value": 0.308741956949234,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 19470
    },
    {
      "grad_norm": 1.2532542943954468,
      "learning_rate": 0.0001688122882082408,
      "loss": 0.3756,
      "step": 19480
    },
    {
      "gate_value": 0.3092672824859619,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 19480
    },
    {
      "grad_norm": 1.312092661857605,
      "learning_rate": 0.00016868925070005444,
      "loss": 0.3833,
      "step": 19490
    },
    {
      "gate_value": 0.30962440371513367,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 19490
    },
    {
      "grad_norm": 1.3440266847610474,
      "learning_rate": 0.00016856620041794145,
      "loss": 0.3685,
      "step": 19500
    },
    {
      "gate_value": 0.309724360704422,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 19500
    },
    {
      "grad_norm": 1.1109507083892822,
      "learning_rate": 0.0001684431374460056,
      "loss": 0.3872,
      "step": 19510
    },
    {
      "gate_value": 0.3096853196620941,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 19510
    },
    {
      "grad_norm": 2.074230670928955,
      "learning_rate": 0.00016832006186835916,
      "loss": 0.3962,
      "step": 19520
    },
    {
      "gate_value": 0.30953168869018555,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 19520
    },
    {
      "grad_norm": 1.1803638935089111,
      "learning_rate": 0.0001681969737691232,
      "loss": 0.3739,
      "step": 19530
    },
    {
      "gate_value": 0.30993011593818665,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 19530
    },
    {
      "grad_norm": 11.847143173217773,
      "learning_rate": 0.00016807387323242726,
      "loss": 0.3726,
      "step": 19540
    },
    {
      "gate_value": 0.3103095293045044,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 19540
    },
    {
      "grad_norm": 0.9531516432762146,
      "learning_rate": 0.00016795076034240938,
      "loss": 0.3822,
      "step": 19550
    },
    {
      "gate_value": 0.3104497492313385,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 19550
    },
    {
      "grad_norm": 1.2661352157592773,
      "learning_rate": 0.00016782763518321611,
      "loss": 0.387,
      "step": 19560
    },
    {
      "gate_value": 0.31011050939559937,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 19560
    },
    {
      "grad_norm": 1.1231858730316162,
      "learning_rate": 0.00016770449783900225,
      "loss": 0.3876,
      "step": 19570
    },
    {
      "gate_value": 0.3103155493736267,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 19570
    },
    {
      "grad_norm": 1.1371983289718628,
      "learning_rate": 0.0001675813483939311,
      "loss": 0.395,
      "step": 19580
    },
    {
      "gate_value": 0.3104420304298401,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 19580
    },
    {
      "grad_norm": 1.1629470586776733,
      "learning_rate": 0.00016745818693217405,
      "loss": 0.3586,
      "step": 19590
    },
    {
      "gate_value": 0.31009596586227417,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 19590
    },
    {
      "grad_norm": 2.172532558441162,
      "learning_rate": 0.0001673350135379109,
      "loss": 0.3732,
      "step": 19600
    },
    {
      "gate_value": 0.3102813959121704,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 19600
    },
    {
      "grad_norm": 3.175884246826172,
      "learning_rate": 0.00016721182829532944,
      "loss": 0.3675,
      "step": 19610
    },
    {
      "gate_value": 0.3106948733329773,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 19610
    },
    {
      "grad_norm": 1.5540252923965454,
      "learning_rate": 0.00016708863128862562,
      "loss": 0.3866,
      "step": 19620
    },
    {
      "gate_value": 0.3106013238430023,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 19620
    },
    {
      "grad_norm": 1.5975329875946045,
      "learning_rate": 0.0001669654226020035,
      "loss": 0.3736,
      "step": 19630
    },
    {
      "gate_value": 0.3107665479183197,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 19630
    },
    {
      "grad_norm": 0.9779009222984314,
      "learning_rate": 0.00016684220231967496,
      "loss": 0.3776,
      "step": 19640
    },
    {
      "gate_value": 0.3108340799808502,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 19640
    },
    {
      "grad_norm": 1.3642061948776245,
      "learning_rate": 0.00016671897052585998,
      "loss": 0.3755,
      "step": 19650
    },
    {
      "gate_value": 0.3109137713909149,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 19650
    },
    {
      "grad_norm": 1.699450135231018,
      "learning_rate": 0.0001665957273047863,
      "loss": 0.3786,
      "step": 19660
    },
    {
      "gate_value": 0.3109440505504608,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 19660
    },
    {
      "grad_norm": 1.0772122144699097,
      "learning_rate": 0.00016647247274068945,
      "loss": 0.3843,
      "step": 19670
    },
    {
      "gate_value": 0.3111564517021179,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 19670
    },
    {
      "grad_norm": 0.9683305621147156,
      "learning_rate": 0.00016634920691781282,
      "loss": 0.3721,
      "step": 19680
    },
    {
      "gate_value": 0.311585932970047,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 19680
    },
    {
      "grad_norm": 2.59132719039917,
      "learning_rate": 0.00016622592992040743,
      "loss": 0.3803,
      "step": 19690
    },
    {
      "gate_value": 0.31226620078086853,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 19690
    },
    {
      "grad_norm": 3.2184154987335205,
      "learning_rate": 0.00016610264183273196,
      "loss": 0.378,
      "step": 19700
    },
    {
      "gate_value": 0.3124104142189026,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 19700
    },
    {
      "grad_norm": 1.0623730421066284,
      "learning_rate": 0.00016597934273905262,
      "loss": 0.3832,
      "step": 19710
    },
    {
      "gate_value": 0.3129165470600128,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 19710
    },
    {
      "grad_norm": 2.3293986320495605,
      "learning_rate": 0.00016585603272364322,
      "loss": 0.3877,
      "step": 19720
    },
    {
      "gate_value": 0.31323570013046265,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 19720
    },
    {
      "grad_norm": 7.679978370666504,
      "learning_rate": 0.00016573271187078493,
      "loss": 0.3829,
      "step": 19730
    },
    {
      "gate_value": 0.31327542662620544,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 19730
    },
    {
      "grad_norm": 1.5804600715637207,
      "learning_rate": 0.00016560938026476647,
      "loss": 0.3736,
      "step": 19740
    },
    {
      "gate_value": 0.3131873309612274,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 19740
    },
    {
      "grad_norm": 1.0514229536056519,
      "learning_rate": 0.00016548603798988373,
      "loss": 0.384,
      "step": 19750
    },
    {
      "gate_value": 0.31321951746940613,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 19750
    },
    {
      "grad_norm": 47.53086471557617,
      "learning_rate": 0.0001653626851304401,
      "loss": 0.3661,
      "step": 19760
    },
    {
      "gate_value": 0.313575804233551,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 19760
    },
    {
      "grad_norm": 2.0501813888549805,
      "learning_rate": 0.00016523932177074597,
      "loss": 0.3891,
      "step": 19770
    },
    {
      "gate_value": 0.31374025344848633,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 19770
    },
    {
      "grad_norm": 11.146804809570312,
      "learning_rate": 0.0001651159479951192,
      "loss": 0.387,
      "step": 19780
    },
    {
      "gate_value": 0.31384482979774475,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 19780
    },
    {
      "grad_norm": 1.0424987077713013,
      "learning_rate": 0.00016499256388788447,
      "loss": 0.396,
      "step": 19790
    },
    {
      "gate_value": 0.31350716948509216,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 19790
    },
    {
      "grad_norm": 3.2343297004699707,
      "learning_rate": 0.0001648691695333737,
      "loss": 0.3627,
      "step": 19800
    },
    {
      "gate_value": 0.31341326236724854,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 19800
    },
    {
      "grad_norm": 1.6307929754257202,
      "learning_rate": 0.00016474576501592574,
      "loss": 0.3692,
      "step": 19810
    },
    {
      "gate_value": 0.31364917755126953,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 19810
    },
    {
      "grad_norm": 0.7756251096725464,
      "learning_rate": 0.00016462235041988642,
      "loss": 0.3693,
      "step": 19820
    },
    {
      "gate_value": 0.3137316107749939,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 19820
    },
    {
      "grad_norm": 0.9996859431266785,
      "learning_rate": 0.00016449892582960852,
      "loss": 0.3981,
      "step": 19830
    },
    {
      "gate_value": 0.31357425451278687,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 19830
    },
    {
      "grad_norm": 1.6043193340301514,
      "learning_rate": 0.00016437549132945151,
      "loss": 0.3682,
      "step": 19840
    },
    {
      "gate_value": 0.3138103783130646,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 19840
    },
    {
      "grad_norm": 1.084348201751709,
      "learning_rate": 0.00016425204700378174,
      "loss": 0.3857,
      "step": 19850
    },
    {
      "gate_value": 0.3141813576221466,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 19850
    },
    {
      "grad_norm": 7.152972221374512,
      "learning_rate": 0.00016412859293697224,
      "loss": 0.3833,
      "step": 19860
    },
    {
      "gate_value": 0.31427866220474243,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 19860
    },
    {
      "grad_norm": 0.9384665489196777,
      "learning_rate": 0.00016400512921340265,
      "loss": 0.3848,
      "step": 19870
    },
    {
      "gate_value": 0.3139939606189728,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 19870
    },
    {
      "grad_norm": 0.4689892530441284,
      "learning_rate": 0.00016388165591745934,
      "loss": 0.3623,
      "step": 19880
    },
    {
      "gate_value": 0.31366410851478577,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 19880
    },
    {
      "grad_norm": 1.7431423664093018,
      "learning_rate": 0.0001637581731335351,
      "loss": 0.3971,
      "step": 19890
    },
    {
      "gate_value": 0.3138532340526581,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 19890
    },
    {
      "grad_norm": 0.8681080341339111,
      "learning_rate": 0.00016363468094602923,
      "loss": 0.3749,
      "step": 19900
    },
    {
      "gate_value": 0.3135307729244232,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 19900
    },
    {
      "grad_norm": 0.8464221358299255,
      "learning_rate": 0.00016351117943934755,
      "loss": 0.3772,
      "step": 19910
    },
    {
      "gate_value": 0.313311904668808,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 19910
    },
    {
      "grad_norm": 5.371399879455566,
      "learning_rate": 0.00016338766869790206,
      "loss": 0.3933,
      "step": 19920
    },
    {
      "gate_value": 0.3136154115200043,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 19920
    },
    {
      "grad_norm": 1.4953688383102417,
      "learning_rate": 0.00016326414880611133,
      "loss": 0.3858,
      "step": 19930
    },
    {
      "gate_value": 0.3138081133365631,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 19930
    },
    {
      "grad_norm": 1.1205352544784546,
      "learning_rate": 0.00016314061984839992,
      "loss": 0.3751,
      "step": 19940
    },
    {
      "gate_value": 0.3141990602016449,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 19940
    },
    {
      "grad_norm": 4.253011703491211,
      "learning_rate": 0.00016301708190919872,
      "loss": 0.3851,
      "step": 19950
    },
    {
      "gate_value": 0.3143591582775116,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 19950
    },
    {
      "grad_norm": 5.624688625335693,
      "learning_rate": 0.00016289353507294483,
      "loss": 0.3721,
      "step": 19960
    },
    {
      "gate_value": 0.31418493390083313,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 19960
    },
    {
      "grad_norm": 3.7468695640563965,
      "learning_rate": 0.00016276997942408128,
      "loss": 0.3649,
      "step": 19970
    },
    {
      "gate_value": 0.3142998516559601,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 19970
    },
    {
      "grad_norm": 0.7305354475975037,
      "learning_rate": 0.00016264641504705723,
      "loss": 0.3788,
      "step": 19980
    },
    {
      "gate_value": 0.31467312574386597,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 19980
    },
    {
      "grad_norm": 2.8758230209350586,
      "learning_rate": 0.00016252284202632772,
      "loss": 0.3707,
      "step": 19990
    },
    {
      "gate_value": 0.31497639417648315,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 19990
    },
    {
      "grad_norm": 5.836865425109863,
      "learning_rate": 0.00016239926044635378,
      "loss": 0.3755,
      "step": 20000
    },
    {
      "gate_value": 0.3148883581161499,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 20000
    },
    {
      "grad_norm": 1.1395916938781738,
      "learning_rate": 0.00016227567039160223,
      "loss": 0.3766,
      "step": 20010
    },
    {
      "gate_value": 0.31475260853767395,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 20010
    },
    {
      "grad_norm": 1.6680474281311035,
      "learning_rate": 0.00016215207194654571,
      "loss": 0.393,
      "step": 20020
    },
    {
      "gate_value": 0.314287006855011,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 20020
    },
    {
      "grad_norm": 45.070587158203125,
      "learning_rate": 0.0001620284651956626,
      "loss": 0.3865,
      "step": 20030
    },
    {
      "gate_value": 0.31412649154663086,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 20030
    },
    {
      "grad_norm": 1.6862635612487793,
      "learning_rate": 0.000161904850223437,
      "loss": 0.3744,
      "step": 20040
    },
    {
      "gate_value": 0.3144453465938568,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 20040
    },
    {
      "grad_norm": 1.0638667345046997,
      "learning_rate": 0.0001617812271143585,
      "loss": 0.3745,
      "step": 20050
    },
    {
      "gate_value": 0.314824640750885,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 20050
    },
    {
      "grad_norm": 2.623652219772339,
      "learning_rate": 0.00016165759595292232,
      "loss": 0.3882,
      "step": 20060
    },
    {
      "gate_value": 0.3149595856666565,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 20060
    },
    {
      "grad_norm": 1.5121970176696777,
      "learning_rate": 0.0001615339568236293,
      "loss": 0.3986,
      "step": 20070
    },
    {
      "gate_value": 0.31516796350479126,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 20070
    },
    {
      "grad_norm": 79.21330261230469,
      "learning_rate": 0.0001614103098109855,
      "loss": 0.3744,
      "step": 20080
    },
    {
      "gate_value": 0.3159092962741852,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 20080
    },
    {
      "grad_norm": 1.1513302326202393,
      "learning_rate": 0.00016128665499950254,
      "loss": 0.3859,
      "step": 20090
    },
    {
      "gate_value": 0.31618189811706543,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 20090
    },
    {
      "grad_norm": 3.856543779373169,
      "learning_rate": 0.0001611629924736973,
      "loss": 0.3594,
      "step": 20100
    },
    {
      "gate_value": 0.31636902689933777,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 20100
    },
    {
      "grad_norm": 1.5914669036865234,
      "learning_rate": 0.000161039322318092,
      "loss": 0.3943,
      "step": 20110
    },
    {
      "gate_value": 0.31623247265815735,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 20110
    },
    {
      "grad_norm": 0.7739580869674683,
      "learning_rate": 0.000160915644617214,
      "loss": 0.3857,
      "step": 20120
    },
    {
      "gate_value": 0.3166380524635315,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 20120
    },
    {
      "grad_norm": 2.524143695831299,
      "learning_rate": 0.0001607919594555958,
      "loss": 0.366,
      "step": 20130
    },
    {
      "gate_value": 0.3169264495372772,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 20130
    },
    {
      "grad_norm": 4.026458263397217,
      "learning_rate": 0.0001606682669177751,
      "loss": 0.369,
      "step": 20140
    },
    {
      "gate_value": 0.31718090176582336,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 20140
    },
    {
      "grad_norm": 0.697846531867981,
      "learning_rate": 0.0001605445670882945,
      "loss": 0.3765,
      "step": 20150
    },
    {
      "gate_value": 0.31738293170928955,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 20150
    },
    {
      "grad_norm": 0.518775463104248,
      "learning_rate": 0.0001604208600517018,
      "loss": 0.3669,
      "step": 20160
    },
    {
      "gate_value": 0.3173922598361969,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 20160
    },
    {
      "grad_norm": 3.240859270095825,
      "learning_rate": 0.0001602971458925495,
      "loss": 0.3905,
      "step": 20170
    },
    {
      "gate_value": 0.3173951208591461,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 20170
    },
    {
      "grad_norm": 1.02534019947052,
      "learning_rate": 0.00016017342469539503,
      "loss": 0.361,
      "step": 20180
    },
    {
      "gate_value": 0.31715652346611023,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 20180
    },
    {
      "grad_norm": 2.5602641105651855,
      "learning_rate": 0.00016004969654480079,
      "loss": 0.3677,
      "step": 20190
    },
    {
      "gate_value": 0.3173889219760895,
      "icl_sequence_length": 96,
      "num_contexts": 3,
      "step": 20190
    },
    {
      "grad_norm": 1.1085784435272217,
      "learning_rate": 0.00015992596152533364,
      "loss": 0.3866,
      "step": 20200
    },
    {
      "gate_value": 0.31750160455703735,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 20200
    },
    {
      "grad_norm": 2.646415948867798,
      "learning_rate": 0.00015980221972156542,
      "loss": 0.3617,
      "step": 20210
    },
    {
      "gate_value": 0.3176015019416809,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 20210
    },
    {
      "grad_norm": 1.0765670537948608,
      "learning_rate": 0.00015967847121807247,
      "loss": 0.3898,
      "step": 20220
    },
    {
      "gate_value": 0.317767858505249,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 20220
    },
    {
      "grad_norm": 1.4987940788269043,
      "learning_rate": 0.00015955471609943567,
      "loss": 0.3831,
      "step": 20230
    },
    {
      "gate_value": 0.3180540204048157,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 20230
    },
    {
      "grad_norm": 6.206676959991455,
      "learning_rate": 0.00015943095445024056,
      "loss": 0.3755,
      "step": 20240
    },
    {
      "gate_value": 0.31847628951072693,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 20240
    },
    {
      "grad_norm": 0.9935562610626221,
      "learning_rate": 0.00015930718635507696,
      "loss": 0.3719,
      "step": 20250
    },
    {
      "gate_value": 0.3188856542110443,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 20250
    },
    {
      "grad_norm": 2.0614676475524902,
      "learning_rate": 0.00015918341189853928,
      "loss": 0.3862,
      "step": 20260
    },
    {
      "gate_value": 0.31922855973243713,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 20260
    },
    {
      "grad_norm": 78.66289520263672,
      "learning_rate": 0.00015905963116522617,
      "loss": 0.3763,
      "step": 20270
    },
    {
      "gate_value": 0.31915542483329773,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 20270
    },
    {
      "grad_norm": 0.7880850434303284,
      "learning_rate": 0.00015893584423974056,
      "loss": 0.3764,
      "step": 20280
    },
    {
      "gate_value": 0.31941351294517517,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 20280
    },
    {
      "grad_norm": 2.9438118934631348,
      "learning_rate": 0.0001588120512066897,
      "loss": 0.395,
      "step": 20290
    },
    {
      "gate_value": 0.31935545802116394,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 20290
    },
    {
      "grad_norm": 64.0041732788086,
      "learning_rate": 0.0001586882521506849,
      "loss": 0.3741,
      "step": 20300
    },
    {
      "gate_value": 0.31919434666633606,
      "icl_sequence_length": 58,
      "num_contexts": 3,
      "step": 20300
    },
    {
      "grad_norm": 1.1446757316589355,
      "learning_rate": 0.00015856444715634167,
      "loss": 0.3544,
      "step": 20310
    },
    {
      "gate_value": 0.31936153769493103,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 20310
    },
    {
      "grad_norm": 1.5598700046539307,
      "learning_rate": 0.0001584406363082796,
      "loss": 0.3766,
      "step": 20320
    },
    {
      "gate_value": 0.3194507360458374,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 20320
    },
    {
      "grad_norm": 0.8992844820022583,
      "learning_rate": 0.00015831681969112214,
      "loss": 0.3718,
      "step": 20330
    },
    {
      "gate_value": 0.3197779953479767,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 20330
    },
    {
      "grad_norm": 4.334936618804932,
      "learning_rate": 0.00015819299738949695,
      "loss": 0.3809,
      "step": 20340
    },
    {
      "gate_value": 0.3198601007461548,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 20340
    },
    {
      "grad_norm": 1.361315369606018,
      "learning_rate": 0.00015806916948803525,
      "loss": 0.3833,
      "step": 20350
    },
    {
      "gate_value": 0.319922536611557,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 20350
    },
    {
      "grad_norm": 3.1220877170562744,
      "learning_rate": 0.00015794533607137228,
      "loss": 0.3722,
      "step": 20360
    },
    {
      "gate_value": 0.3200428783893585,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 20360
    },
    {
      "grad_norm": 3.3610692024230957,
      "learning_rate": 0.00015782149722414702,
      "loss": 0.3697,
      "step": 20370
    },
    {
      "gate_value": 0.319830060005188,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 20370
    },
    {
      "grad_norm": 5.070735931396484,
      "learning_rate": 0.00015769765303100215,
      "loss": 0.3562,
      "step": 20380
    },
    {
      "gate_value": 0.31996458768844604,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 20380
    },
    {
      "grad_norm": 9.352119445800781,
      "learning_rate": 0.0001575738035765841,
      "loss": 0.3728,
      "step": 20390
    },
    {
      "gate_value": 0.31997665762901306,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 20390
    },
    {
      "grad_norm": 1.9495102167129517,
      "learning_rate": 0.00015744994894554263,
      "loss": 0.3838,
      "step": 20400
    },
    {
      "gate_value": 0.3198350667953491,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 20400
    },
    {
      "grad_norm": 2.5276057720184326,
      "learning_rate": 0.00015732608922253136,
      "loss": 0.3896,
      "step": 20410
    },
    {
      "gate_value": 0.31992819905281067,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 20410
    },
    {
      "grad_norm": 6.810713768005371,
      "learning_rate": 0.00015720222449220716,
      "loss": 0.3698,
      "step": 20420
    },
    {
      "gate_value": 0.3202584683895111,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 20420
    },
    {
      "grad_norm": 1.0463260412216187,
      "learning_rate": 0.00015707835483923043,
      "loss": 0.3519,
      "step": 20430
    },
    {
      "gate_value": 0.3203528821468353,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 20430
    },
    {
      "grad_norm": 2.1792423725128174,
      "learning_rate": 0.00015695448034826494,
      "loss": 0.3746,
      "step": 20440
    },
    {
      "gate_value": 0.32040420174598694,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 20440
    },
    {
      "grad_norm": 1.2219457626342773,
      "learning_rate": 0.00015683060110397768,
      "loss": 0.3657,
      "step": 20450
    },
    {
      "gate_value": 0.32081183791160583,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 20450
    },
    {
      "grad_norm": 4.518383979797363,
      "learning_rate": 0.00015670671719103898,
      "loss": 0.3767,
      "step": 20460
    },
    {
      "gate_value": 0.321494460105896,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 20460
    },
    {
      "grad_norm": 5.510248184204102,
      "learning_rate": 0.00015658282869412233,
      "loss": 0.3645,
      "step": 20470
    },
    {
      "gate_value": 0.32175546884536743,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 20470
    },
    {
      "grad_norm": 2.041200876235962,
      "learning_rate": 0.00015645893569790428,
      "loss": 0.3933,
      "step": 20480
    },
    {
      "gate_value": 0.3220183849334717,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 20480
    },
    {
      "grad_norm": 9.356095314025879,
      "learning_rate": 0.00015633503828706467,
      "loss": 0.3667,
      "step": 20490
    },
    {
      "gate_value": 0.3222240209579468,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 20490
    },
    {
      "grad_norm": 52.88069152832031,
      "learning_rate": 0.00015621113654628612,
      "loss": 0.3883,
      "step": 20500
    },
    {
      "gate_value": 0.3223245441913605,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 20500
    },
    {
      "grad_norm": 2.949965476989746,
      "learning_rate": 0.00015608723056025425,
      "loss": 0.3662,
      "step": 20510
    },
    {
      "gate_value": 0.3225291073322296,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 20510
    },
    {
      "grad_norm": 31.80426597595215,
      "learning_rate": 0.00015596332041365775,
      "loss": 0.3581,
      "step": 20520
    },
    {
      "gate_value": 0.32278192043304443,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 20520
    },
    {
      "grad_norm": 1.4811211824417114,
      "learning_rate": 0.00015583940619118793,
      "loss": 0.3805,
      "step": 20530
    },
    {
      "gate_value": 0.3228819966316223,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 20530
    },
    {
      "grad_norm": 8.001051902770996,
      "learning_rate": 0.00015571548797753906,
      "loss": 0.3505,
      "step": 20540
    },
    {
      "gate_value": 0.32308506965637207,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 20540
    },
    {
      "grad_norm": 2.4301977157592773,
      "learning_rate": 0.00015559156585740808,
      "loss": 0.3809,
      "step": 20550
    },
    {
      "gate_value": 0.3232133984565735,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 20550
    },
    {
      "grad_norm": 2.834927797317505,
      "learning_rate": 0.00015546763991549452,
      "loss": 0.3784,
      "step": 20560
    },
    {
      "gate_value": 0.3232627809047699,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 20560
    },
    {
      "grad_norm": 11.861950874328613,
      "learning_rate": 0.00015534371023650067,
      "loss": 0.378,
      "step": 20570
    },
    {
      "gate_value": 0.32333946228027344,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 20570
    },
    {
      "grad_norm": 16.24566078186035,
      "learning_rate": 0.00015521977690513124,
      "loss": 0.3764,
      "step": 20580
    },
    {
      "gate_value": 0.32341697812080383,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 20580
    },
    {
      "grad_norm": 3.6404664516448975,
      "learning_rate": 0.0001550958400060935,
      "loss": 0.3775,
      "step": 20590
    },
    {
      "gate_value": 0.32357144355773926,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 20590
    },
    {
      "grad_norm": 1.6172356605529785,
      "learning_rate": 0.0001549718996240972,
      "loss": 0.3794,
      "step": 20600
    },
    {
      "gate_value": 0.323573499917984,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 20600
    },
    {
      "grad_norm": 5.064068794250488,
      "learning_rate": 0.00015484795584385432,
      "loss": 0.3727,
      "step": 20610
    },
    {
      "gate_value": 0.3236609399318695,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 20610
    },
    {
      "grad_norm": 5.703829765319824,
      "learning_rate": 0.00015472400875007943,
      "loss": 0.3676,
      "step": 20620
    },
    {
      "gate_value": 0.32396653294563293,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 20620
    },
    {
      "grad_norm": 4.807945728302002,
      "learning_rate": 0.00015460005842748905,
      "loss": 0.3829,
      "step": 20630
    },
    {
      "gate_value": 0.32434651255607605,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 20630
    },
    {
      "grad_norm": 26.915267944335938,
      "learning_rate": 0.0001544761049608021,
      "loss": 0.3684,
      "step": 20640
    },
    {
      "gate_value": 0.32444456219673157,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 20640
    },
    {
      "grad_norm": 2.3243069648742676,
      "learning_rate": 0.00015435214843473964,
      "loss": 0.3878,
      "step": 20650
    },
    {
      "gate_value": 0.32470571994781494,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 20650
    },
    {
      "grad_norm": 13.799264907836914,
      "learning_rate": 0.00015422818893402477,
      "loss": 0.3961,
      "step": 20660
    },
    {
      "gate_value": 0.3247778117656708,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 20660
    },
    {
      "grad_norm": 54.99798583984375,
      "learning_rate": 0.00015410422654338265,
      "loss": 0.3866,
      "step": 20670
    },
    {
      "gate_value": 0.32493239641189575,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 20670
    },
    {
      "grad_norm": 2.2542836666107178,
      "learning_rate": 0.00015398026134754036,
      "loss": 0.3703,
      "step": 20680
    },
    {
      "gate_value": 0.32522520422935486,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 20680
    },
    {
      "grad_norm": 2.2880892753601074,
      "learning_rate": 0.00015385629343122695,
      "loss": 0.3565,
      "step": 20690
    },
    {
      "gate_value": 0.3254174292087555,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 20690
    },
    {
      "grad_norm": 3.325277090072632,
      "learning_rate": 0.0001537323228791734,
      "loss": 0.3657,
      "step": 20700
    },
    {
      "gate_value": 0.32538145780563354,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 20700
    },
    {
      "grad_norm": 4.995002746582031,
      "learning_rate": 0.00015360834977611227,
      "loss": 0.3799,
      "step": 20710
    },
    {
      "gate_value": 0.3254181742668152,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 20710
    },
    {
      "grad_norm": 2.443607807159424,
      "learning_rate": 0.0001534843742067782,
      "loss": 0.3817,
      "step": 20720
    },
    {
      "gate_value": 0.325557678937912,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 20720
    },
    {
      "grad_norm": 1.345854640007019,
      "learning_rate": 0.00015336039625590714,
      "loss": 0.3697,
      "step": 20730
    },
    {
      "gate_value": 0.3256950080394745,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 20730
    },
    {
      "grad_norm": 249.50662231445312,
      "learning_rate": 0.00015323641600823693,
      "loss": 0.3654,
      "step": 20740
    },
    {
      "gate_value": 0.32584673166275024,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 20740
    },
    {
      "grad_norm": 21.927440643310547,
      "learning_rate": 0.00015311243354850692,
      "loss": 0.3975,
      "step": 20750
    },
    {
      "gate_value": 0.32607078552246094,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 20750
    },
    {
      "grad_norm": 2.4049830436706543,
      "learning_rate": 0.0001529884489614579,
      "loss": 0.365,
      "step": 20760
    },
    {
      "gate_value": 0.3262093663215637,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 20760
    },
    {
      "grad_norm": 149.2460174560547,
      "learning_rate": 0.00015286446233183225,
      "loss": 0.3816,
      "step": 20770
    },
    {
      "gate_value": 0.3263449966907501,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 20770
    },
    {
      "grad_norm": 3.6130568981170654,
      "learning_rate": 0.00015274047374437354,
      "loss": 0.3588,
      "step": 20780
    },
    {
      "gate_value": 0.32662174105644226,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 20780
    },
    {
      "grad_norm": 14.619281768798828,
      "learning_rate": 0.0001526164832838269,
      "loss": 0.3855,
      "step": 20790
    },
    {
      "gate_value": 0.32708847522735596,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 20790
    },
    {
      "grad_norm": 27.90945053100586,
      "learning_rate": 0.0001524924910349386,
      "loss": 0.3747,
      "step": 20800
    },
    {
      "gate_value": 0.32717645168304443,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 20800
    },
    {
      "grad_norm": 2.590574264526367,
      "learning_rate": 0.00015236849708245617,
      "loss": 0.3668,
      "step": 20810
    },
    {
      "gate_value": 0.32714733481407166,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 20810
    },
    {
      "grad_norm": 4.525856018066406,
      "learning_rate": 0.0001522445015111284,
      "loss": 0.3645,
      "step": 20820
    },
    {
      "gate_value": 0.327190101146698,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 20820
    },
    {
      "grad_norm": 36.371551513671875,
      "learning_rate": 0.00015212050440570492,
      "loss": 0.3843,
      "step": 20830
    },
    {
      "gate_value": 0.32727375626564026,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 20830
    },
    {
      "grad_norm": 2.06235671043396,
      "learning_rate": 0.00015199650585093669,
      "loss": 0.3631,
      "step": 20840
    },
    {
      "gate_value": 0.32733866572380066,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 20840
    },
    {
      "grad_norm": 31.483142852783203,
      "learning_rate": 0.00015187250593157552,
      "loss": 0.3772,
      "step": 20850
    },
    {
      "gate_value": 0.32737815380096436,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 20850
    },
    {
      "grad_norm": 2.382258415222168,
      "learning_rate": 0.00015174850473237425,
      "loss": 0.38,
      "step": 20860
    },
    {
      "gate_value": 0.3275003731250763,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 20860
    },
    {
      "grad_norm": 2.1796786785125732,
      "learning_rate": 0.00015162450233808646,
      "loss": 0.3658,
      "step": 20870
    },
    {
      "gate_value": 0.3276655077934265,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 20870
    },
    {
      "grad_norm": 9.256353378295898,
      "learning_rate": 0.0001515004988334666,
      "loss": 0.3795,
      "step": 20880
    },
    {
      "gate_value": 0.3275793790817261,
      "icl_sequence_length": 58,
      "num_contexts": 3,
      "step": 20880
    },
    {
      "grad_norm": 5.035216808319092,
      "learning_rate": 0.00015137649430327,
      "loss": 0.3836,
      "step": 20890
    },
    {
      "gate_value": 0.3275890052318573,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 20890
    },
    {
      "grad_norm": 6.4507527351379395,
      "learning_rate": 0.0001512524888322525,
      "loss": 0.3782,
      "step": 20900
    },
    {
      "gate_value": 0.3277978301048279,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 20900
    },
    {
      "grad_norm": 26.93199920654297,
      "learning_rate": 0.0001511284825051707,
      "loss": 0.3752,
      "step": 20910
    },
    {
      "gate_value": 0.3279309570789337,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 20910
    },
    {
      "grad_norm": 3.2107746601104736,
      "learning_rate": 0.00015100447540678178,
      "loss": 0.3715,
      "step": 20920
    },
    {
      "gate_value": 0.3280923664569855,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 20920
    },
    {
      "grad_norm": 1.704662561416626,
      "learning_rate": 0.0001508804676218433,
      "loss": 0.3785,
      "step": 20930
    },
    {
      "gate_value": 0.3281390368938446,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 20930
    },
    {
      "grad_norm": 6.8710551261901855,
      "learning_rate": 0.00015075645923511355,
      "loss": 0.369,
      "step": 20940
    },
    {
      "gate_value": 0.3282588720321655,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 20940
    },
    {
      "grad_norm": 100.34821319580078,
      "learning_rate": 0.0001506324503313511,
      "loss": 0.3687,
      "step": 20950
    },
    {
      "gate_value": 0.32841092348098755,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 20950
    },
    {
      "grad_norm": 16.857439041137695,
      "learning_rate": 0.00015050844099531474,
      "loss": 0.3728,
      "step": 20960
    },
    {
      "gate_value": 0.3285426199436188,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 20960
    },
    {
      "grad_norm": 7.527184963226318,
      "learning_rate": 0.00015038443131176377,
      "loss": 0.3741,
      "step": 20970
    },
    {
      "gate_value": 0.3288117051124573,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 20970
    },
    {
      "grad_norm": 4.734735488891602,
      "learning_rate": 0.00015026042136545762,
      "loss": 0.3713,
      "step": 20980
    },
    {
      "gate_value": 0.32907599210739136,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 20980
    },
    {
      "grad_norm": 10.151193618774414,
      "learning_rate": 0.00015013641124115592,
      "loss": 0.3578,
      "step": 20990
    },
    {
      "gate_value": 0.3292025029659271,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 20990
    },
    {
      "grad_norm": 41.710880279541016,
      "learning_rate": 0.0001500124010236185,
      "loss": 0.3581,
      "step": 21000
    },
    {
      "gate_value": 0.3292621374130249,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 21000
    },
    {
      "grad_norm": 11.441078186035156,
      "learning_rate": 0.00014988839079760496,
      "loss": 0.3614,
      "step": 21010
    },
    {
      "gate_value": 0.32934167981147766,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 21010
    },
    {
      "grad_norm": 127.89350891113281,
      "learning_rate": 0.00014976438064787537,
      "loss": 0.3897,
      "step": 21020
    },
    {
      "gate_value": 0.329465389251709,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 21020
    },
    {
      "grad_norm": 9.434091567993164,
      "learning_rate": 0.00014964037065918936,
      "loss": 0.3745,
      "step": 21030
    },
    {
      "gate_value": 0.3296959102153778,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 21030
    },
    {
      "grad_norm": 71.84469604492188,
      "learning_rate": 0.0001495163609163066,
      "loss": 0.3718,
      "step": 21040
    },
    {
      "gate_value": 0.3298918604850769,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 21040
    },
    {
      "grad_norm": 8.007318496704102,
      "learning_rate": 0.0001493923515039866,
      "loss": 0.3768,
      "step": 21050
    },
    {
      "gate_value": 0.3300043046474457,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 21050
    },
    {
      "grad_norm": 5.606169700622559,
      "learning_rate": 0.00014926834250698857,
      "loss": 0.3848,
      "step": 21060
    },
    {
      "gate_value": 0.3299660384654999,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 21060
    },
    {
      "grad_norm": 18.77029800415039,
      "learning_rate": 0.00014914433401007162,
      "loss": 0.3851,
      "step": 21070
    },
    {
      "gate_value": 0.3300321102142334,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 21070
    },
    {
      "grad_norm": 59.220279693603516,
      "learning_rate": 0.0001490203260979942,
      "loss": 0.3835,
      "step": 21080
    },
    {
      "gate_value": 0.33016636967658997,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 21080
    },
    {
      "grad_norm": 27.8267765045166,
      "learning_rate": 0.00014889631885551472,
      "loss": 0.3909,
      "step": 21090
    },
    {
      "gate_value": 0.3302639126777649,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 21090
    },
    {
      "grad_norm": 29.264480590820312,
      "learning_rate": 0.0001487723123673909,
      "loss": 0.3811,
      "step": 21100
    },
    {
      "gate_value": 0.3303276598453522,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 21100
    },
    {
      "grad_norm": 3.484534978866577,
      "learning_rate": 0.00014864830671837997,
      "loss": 0.3698,
      "step": 21110
    },
    {
      "gate_value": 0.33042216300964355,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 21110
    },
    {
      "grad_norm": 136.35006713867188,
      "learning_rate": 0.00014852430199323871,
      "loss": 0.3573,
      "step": 21120
    },
    {
      "gate_value": 0.3304680585861206,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 21120
    },
    {
      "grad_norm": 29.405948638916016,
      "learning_rate": 0.00014840029827672312,
      "loss": 0.376,
      "step": 21130
    },
    {
      "gate_value": 0.33045992255210876,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 21130
    },
    {
      "grad_norm": 2.838723659515381,
      "learning_rate": 0.00014827629565358853,
      "loss": 0.3611,
      "step": 21140
    },
    {
      "gate_value": 0.33051925897598267,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 21140
    },
    {
      "grad_norm": 111.33936309814453,
      "learning_rate": 0.00014815229420858962,
      "loss": 0.384,
      "step": 21150
    },
    {
      "gate_value": 0.33056896924972534,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 21150
    },
    {
      "grad_norm": 2.623337745666504,
      "learning_rate": 0.00014802829402648016,
      "loss": 0.3814,
      "step": 21160
    },
    {
      "gate_value": 0.3307247757911682,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 21160
    },
    {
      "grad_norm": 53.253849029541016,
      "learning_rate": 0.00014790429519201322,
      "loss": 0.3619,
      "step": 21170
    },
    {
      "gate_value": 0.330961138010025,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 21170
    },
    {
      "grad_norm": 8.739072799682617,
      "learning_rate": 0.00014778029778994068,
      "loss": 0.3898,
      "step": 21180
    },
    {
      "gate_value": 0.33113235235214233,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 21180
    },
    {
      "grad_norm": 3.783287286758423,
      "learning_rate": 0.00014765630190501368,
      "loss": 0.3705,
      "step": 21190
    },
    {
      "gate_value": 0.3310948610305786,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 21190
    },
    {
      "grad_norm": 2.4001498222351074,
      "learning_rate": 0.00014753230762198227,
      "loss": 0.3639,
      "step": 21200
    },
    {
      "gate_value": 0.33120977878570557,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 21200
    },
    {
      "grad_norm": 4.936431407928467,
      "learning_rate": 0.00014740831502559534,
      "loss": 0.3806,
      "step": 21210
    },
    {
      "gate_value": 0.33138588070869446,
      "icl_sequence_length": 96,
      "num_contexts": 3,
      "step": 21210
    },
    {
      "grad_norm": 14.909619331359863,
      "learning_rate": 0.0001472843242006006,
      "loss": 0.3693,
      "step": 21220
    },
    {
      "gate_value": 0.3314835727214813,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 21220
    },
    {
      "grad_norm": 1.7902761697769165,
      "learning_rate": 0.0001471603352317447,
      "loss": 0.3644,
      "step": 21230
    },
    {
      "gate_value": 0.33160167932510376,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 21230
    },
    {
      "grad_norm": 32.85311508178711,
      "learning_rate": 0.00014703634820377286,
      "loss": 0.3717,
      "step": 21240
    },
    {
      "gate_value": 0.3315005898475647,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 21240
    },
    {
      "grad_norm": 2.701444625854492,
      "learning_rate": 0.0001469123632014291,
      "loss": 0.3538,
      "step": 21250
    },
    {
      "gate_value": 0.33133813738822937,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 21250
    },
    {
      "grad_norm": 31.169008255004883,
      "learning_rate": 0.00014678838030945593,
      "loss": 0.3972,
      "step": 21260
    },
    {
      "gate_value": 0.33133530616760254,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 21260
    },
    {
      "grad_norm": 6.129772186279297,
      "learning_rate": 0.0001466643996125946,
      "loss": 0.4004,
      "step": 21270
    },
    {
      "gate_value": 0.33128389716148376,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 21270
    },
    {
      "grad_norm": 4.4965314865112305,
      "learning_rate": 0.00014654042119558464,
      "loss": 0.3586,
      "step": 21280
    },
    {
      "gate_value": 0.3313887417316437,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 21280
    },
    {
      "grad_norm": 5.3050456047058105,
      "learning_rate": 0.00014641644514316418,
      "loss": 0.3583,
      "step": 21290
    },
    {
      "gate_value": 0.33156993985176086,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 21290
    },
    {
      "grad_norm": 14.692100524902344,
      "learning_rate": 0.00014629247154006973,
      "loss": 0.3723,
      "step": 21300
    },
    {
      "gate_value": 0.3315286934375763,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 21300
    },
    {
      "grad_norm": 3.1017932891845703,
      "learning_rate": 0.00014616850047103597,
      "loss": 0.3956,
      "step": 21310
    },
    {
      "gate_value": 0.3315356373786926,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 21310
    },
    {
      "grad_norm": 9.048178672790527,
      "learning_rate": 0.000146044532020796,
      "loss": 0.376,
      "step": 21320
    },
    {
      "gate_value": 0.331571102142334,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 21320
    },
    {
      "grad_norm": 5.0327630043029785,
      "learning_rate": 0.0001459205662740811,
      "loss": 0.3802,
      "step": 21330
    },
    {
      "gate_value": 0.3317570686340332,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 21330
    },
    {
      "grad_norm": 8.62702751159668,
      "learning_rate": 0.0001457966033156207,
      "loss": 0.374,
      "step": 21340
    },
    {
      "gate_value": 0.3319891095161438,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 21340
    },
    {
      "grad_norm": 3.6938068866729736,
      "learning_rate": 0.00014567264323014228,
      "loss": 0.3571,
      "step": 21350
    },
    {
      "gate_value": 0.332073837518692,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 21350
    },
    {
      "grad_norm": 361.4523620605469,
      "learning_rate": 0.0001455486861023714,
      "loss": 0.3681,
      "step": 21360
    },
    {
      "gate_value": 0.3321523368358612,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 21360
    },
    {
      "grad_norm": 10.629288673400879,
      "learning_rate": 0.00014542473201703163,
      "loss": 0.3674,
      "step": 21370
    },
    {
      "gate_value": 0.3323724567890167,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 21370
    },
    {
      "grad_norm": 145.07919311523438,
      "learning_rate": 0.00014530078105884435,
      "loss": 0.3646,
      "step": 21380
    },
    {
      "gate_value": 0.33256107568740845,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 21380
    },
    {
      "grad_norm": 10.726923942565918,
      "learning_rate": 0.0001451768333125289,
      "loss": 0.3641,
      "step": 21390
    },
    {
      "gate_value": 0.33275166153907776,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 21390
    },
    {
      "grad_norm": 51.94292068481445,
      "learning_rate": 0.0001450528888628025,
      "loss": 0.3949,
      "step": 21400
    },
    {
      "gate_value": 0.3329571485519409,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 21400
    },
    {
      "grad_norm": 8.739526748657227,
      "learning_rate": 0.00014492894779437985,
      "loss": 0.3752,
      "step": 21410
    },
    {
      "gate_value": 0.33292508125305176,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 21410
    },
    {
      "grad_norm": 6.976619720458984,
      "learning_rate": 0.00014480501019197353,
      "loss": 0.3626,
      "step": 21420
    },
    {
      "gate_value": 0.3328370749950409,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 21420
    },
    {
      "grad_norm": 2.2733330726623535,
      "learning_rate": 0.0001446810761402938,
      "loss": 0.3961,
      "step": 21430
    },
    {
      "gate_value": 0.33283504843711853,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 21430
    },
    {
      "grad_norm": 4.6120524406433105,
      "learning_rate": 0.00014455714572404833,
      "loss": 0.3714,
      "step": 21440
    },
    {
      "gate_value": 0.3329114019870758,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 21440
    },
    {
      "grad_norm": 230.9474639892578,
      "learning_rate": 0.00014443321902794248,
      "loss": 0.3815,
      "step": 21450
    },
    {
      "gate_value": 0.3328808546066284,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 21450
    },
    {
      "grad_norm": 1.880590558052063,
      "learning_rate": 0.0001443092961366789,
      "loss": 0.3749,
      "step": 21460
    },
    {
      "gate_value": 0.3330598771572113,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 21460
    },
    {
      "grad_norm": 2.525514841079712,
      "learning_rate": 0.00014418537713495775,
      "loss": 0.3768,
      "step": 21470
    },
    {
      "gate_value": 0.333394318819046,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 21470
    },
    {
      "grad_norm": 1.9454431533813477,
      "learning_rate": 0.0001440614621074765,
      "loss": 0.3853,
      "step": 21480
    },
    {
      "gate_value": 0.3335770070552826,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 21480
    },
    {
      "grad_norm": 1.8111017942428589,
      "learning_rate": 0.00014393755113892997,
      "loss": 0.3682,
      "step": 21490
    },
    {
      "gate_value": 0.33381104469299316,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 21490
    },
    {
      "grad_norm": 4.904592514038086,
      "learning_rate": 0.00014381364431401,
      "loss": 0.3897,
      "step": 21500
    },
    {
      "gate_value": 0.333828330039978,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 21500
    },
    {
      "grad_norm": 2.9814579486846924,
      "learning_rate": 0.00014368974171740585,
      "loss": 0.3776,
      "step": 21510
    },
    {
      "gate_value": 0.333970844745636,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 21510
    },
    {
      "grad_norm": 44.95869064331055,
      "learning_rate": 0.00014356584343380366,
      "loss": 0.365,
      "step": 21520
    },
    {
      "gate_value": 0.3343297243118286,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 21520
    },
    {
      "grad_norm": 14.722635269165039,
      "learning_rate": 0.0001434419495478869,
      "loss": 0.3879,
      "step": 21530
    },
    {
      "gate_value": 0.3345628082752228,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 21530
    },
    {
      "grad_norm": 2.9341938495635986,
      "learning_rate": 0.00014331806014433572,
      "loss": 0.3726,
      "step": 21540
    },
    {
      "gate_value": 0.3346128761768341,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 21540
    },
    {
      "grad_norm": 16.510175704956055,
      "learning_rate": 0.00014319417530782744,
      "loss": 0.3785,
      "step": 21550
    },
    {
      "gate_value": 0.3347155749797821,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 21550
    },
    {
      "grad_norm": 1.5997096300125122,
      "learning_rate": 0.00014307029512303617,
      "loss": 0.3938,
      "step": 21560
    },
    {
      "gate_value": 0.33479592204093933,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 21560
    },
    {
      "grad_norm": 1.6002236604690552,
      "learning_rate": 0.00014294641967463282,
      "loss": 0.3648,
      "step": 21570
    },
    {
      "gate_value": 0.33493053913116455,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 21570
    },
    {
      "grad_norm": 288.05035400390625,
      "learning_rate": 0.0001428225490472852,
      "loss": 0.3557,
      "step": 21580
    },
    {
      "gate_value": 0.3353734314441681,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 21580
    },
    {
      "grad_norm": 2.035752534866333,
      "learning_rate": 0.00014269868332565755,
      "loss": 0.3721,
      "step": 21590
    },
    {
      "gate_value": 0.3355160653591156,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 21590
    },
    {
      "grad_norm": 7.391807556152344,
      "learning_rate": 0.000142574822594411,
      "loss": 0.3924,
      "step": 21600
    },
    {
      "gate_value": 0.33554723858833313,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 21600
    },
    {
      "grad_norm": 5.8543195724487305,
      "learning_rate": 0.00014245096693820322,
      "loss": 0.3779,
      "step": 21610
    },
    {
      "gate_value": 0.3356199860572815,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 21610
    },
    {
      "grad_norm": 2.9621081352233887,
      "learning_rate": 0.0001423271164416883,
      "loss": 0.3571,
      "step": 21620
    },
    {
      "gate_value": 0.33589836955070496,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 21620
    },
    {
      "grad_norm": 4.828012466430664,
      "learning_rate": 0.000142203271189517,
      "loss": 0.3721,
      "step": 21630
    },
    {
      "gate_value": 0.33621811866760254,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 21630
    },
    {
      "grad_norm": 4.43719482421875,
      "learning_rate": 0.0001420794312663363,
      "loss": 0.3671,
      "step": 21640
    },
    {
      "gate_value": 0.3363811671733856,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 21640
    },
    {
      "grad_norm": 12.81857681274414,
      "learning_rate": 0.00014195559675678963,
      "loss": 0.3775,
      "step": 21650
    },
    {
      "gate_value": 0.33632469177246094,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 21650
    },
    {
      "grad_norm": 2.1967718601226807,
      "learning_rate": 0.00014183176774551672,
      "loss": 0.3817,
      "step": 21660
    },
    {
      "gate_value": 0.336483359336853,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 21660
    },
    {
      "grad_norm": 6.7688140869140625,
      "learning_rate": 0.00014170794431715353,
      "loss": 0.3945,
      "step": 21670
    },
    {
      "gate_value": 0.3365987539291382,
      "icl_sequence_length": 96,
      "num_contexts": 3,
      "step": 21670
    },
    {
      "grad_norm": 2.248080015182495,
      "learning_rate": 0.0001415841265563323,
      "loss": 0.3865,
      "step": 21680
    },
    {
      "gate_value": 0.3365277647972107,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 21680
    },
    {
      "grad_norm": 7.746638774871826,
      "learning_rate": 0.00014146031454768113,
      "loss": 0.3612,
      "step": 21690
    },
    {
      "gate_value": 0.3365982472896576,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 21690
    },
    {
      "grad_norm": 56.98549270629883,
      "learning_rate": 0.00014133650837582445,
      "loss": 0.3678,
      "step": 21700
    },
    {
      "gate_value": 0.3367842435836792,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 21700
    },
    {
      "grad_norm": 4.436330795288086,
      "learning_rate": 0.00014121270812538262,
      "loss": 0.398,
      "step": 21710
    },
    {
      "gate_value": 0.3368651270866394,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 21710
    },
    {
      "grad_norm": 3.1496005058288574,
      "learning_rate": 0.0001410889138809719,
      "loss": 0.3745,
      "step": 21720
    },
    {
      "gate_value": 0.33694028854370117,
      "icl_sequence_length": 58,
      "num_contexts": 3,
      "step": 21720
    },
    {
      "grad_norm": 3.2747929096221924,
      "learning_rate": 0.00014096512572720453,
      "loss": 0.3676,
      "step": 21730
    },
    {
      "gate_value": 0.3370567262172699,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 21730
    },
    {
      "grad_norm": 3.3517582416534424,
      "learning_rate": 0.0001408413437486885,
      "loss": 0.374,
      "step": 21740
    },
    {
      "gate_value": 0.3370315134525299,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 21740
    },
    {
      "grad_norm": 9.315972328186035,
      "learning_rate": 0.00014071756803002772,
      "loss": 0.3617,
      "step": 21750
    },
    {
      "gate_value": 0.33682939410209656,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 21750
    },
    {
      "grad_norm": 6.208014011383057,
      "learning_rate": 0.00014059379865582163,
      "loss": 0.3576,
      "step": 21760
    },
    {
      "gate_value": 0.33689308166503906,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 21760
    },
    {
      "grad_norm": 18.423473358154297,
      "learning_rate": 0.0001404700357106655,
      "loss": 0.3841,
      "step": 21770
    },
    {
      "gate_value": 0.3369765877723694,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 21770
    },
    {
      "grad_norm": 5.565635681152344,
      "learning_rate": 0.00014034627927915006,
      "loss": 0.3669,
      "step": 21780
    },
    {
      "gate_value": 0.33682945370674133,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 21780
    },
    {
      "grad_norm": 9.092445373535156,
      "learning_rate": 0.0001402225294458617,
      "loss": 0.3609,
      "step": 21790
    },
    {
      "gate_value": 0.3367845416069031,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 21790
    },
    {
      "grad_norm": 3.235175848007202,
      "learning_rate": 0.00014009878629538225,
      "loss": 0.3716,
      "step": 21800
    },
    {
      "gate_value": 0.33702075481414795,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 21800
    },
    {
      "grad_norm": 2.2112083435058594,
      "learning_rate": 0.00013997504991228906,
      "loss": 0.3923,
      "step": 21810
    },
    {
      "gate_value": 0.3373863995075226,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 21810
    },
    {
      "grad_norm": 1262.691650390625,
      "learning_rate": 0.00013985132038115466,
      "loss": 0.3723,
      "step": 21820
    },
    {
      "gate_value": 0.33774295449256897,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 21820
    },
    {
      "grad_norm": 7.494561672210693,
      "learning_rate": 0.00013972759778654715,
      "loss": 0.3738,
      "step": 21830
    },
    {
      "gate_value": 0.33784347772598267,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 21830
    },
    {
      "grad_norm": 5.770691394805908,
      "learning_rate": 0.00013960388221302962,
      "loss": 0.3621,
      "step": 21840
    },
    {
      "gate_value": 0.33791738748550415,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 21840
    },
    {
      "grad_norm": 318.8439025878906,
      "learning_rate": 0.00013948017374516063,
      "loss": 0.3611,
      "step": 21850
    },
    {
      "gate_value": 0.3379685580730438,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 21850
    },
    {
      "grad_norm": 43.76850509643555,
      "learning_rate": 0.00013935647246749372,
      "loss": 0.3729,
      "step": 21860
    },
    {
      "gate_value": 0.338062584400177,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 21860
    },
    {
      "grad_norm": 88.09666442871094,
      "learning_rate": 0.00013923277846457743,
      "loss": 0.3917,
      "step": 21870
    },
    {
      "gate_value": 0.3383744955062866,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 21870
    },
    {
      "grad_norm": 11.775474548339844,
      "learning_rate": 0.00013910909182095554,
      "loss": 0.3741,
      "step": 21880
    },
    {
      "gate_value": 0.33857882022857666,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 21880
    },
    {
      "grad_norm": 103.6872329711914,
      "learning_rate": 0.00013898541262116675,
      "loss": 0.3704,
      "step": 21890
    },
    {
      "gate_value": 0.3385585844516754,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 21890
    },
    {
      "grad_norm": 1.8290860652923584,
      "learning_rate": 0.0001388617409497445,
      "loss": 0.3702,
      "step": 21900
    },
    {
      "gate_value": 0.33865779638290405,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 21900
    },
    {
      "grad_norm": 1.7455637454986572,
      "learning_rate": 0.00013873807689121736,
      "loss": 0.3851,
      "step": 21910
    },
    {
      "gate_value": 0.33894455432891846,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 21910
    },
    {
      "grad_norm": 19.623218536376953,
      "learning_rate": 0.00013861442053010841,
      "loss": 0.3834,
      "step": 21920
    },
    {
      "gate_value": 0.3391704261302948,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 21920
    },
    {
      "grad_norm": 7.914072036743164,
      "learning_rate": 0.00013849077195093572,
      "loss": 0.3822,
      "step": 21930
    },
    {
      "gate_value": 0.33932042121887207,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 21930
    },
    {
      "grad_norm": 7.193912029266357,
      "learning_rate": 0.0001383671312382118,
      "loss": 0.3685,
      "step": 21940
    },
    {
      "gate_value": 0.3394797742366791,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 21940
    },
    {
      "grad_norm": 6.832879543304443,
      "learning_rate": 0.00013824349847644407,
      "loss": 0.3638,
      "step": 21950
    },
    {
      "gate_value": 0.33957207202911377,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 21950
    },
    {
      "grad_norm": 22.681137084960938,
      "learning_rate": 0.00013811987375013428,
      "loss": 0.3832,
      "step": 21960
    },
    {
      "gate_value": 0.3396352231502533,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 21960
    },
    {
      "grad_norm": 16.05438995361328,
      "learning_rate": 0.0001379962571437787,
      "loss": 0.3688,
      "step": 21970
    },
    {
      "gate_value": 0.33961719274520874,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 21970
    },
    {
      "grad_norm": 205.68336486816406,
      "learning_rate": 0.00013787264874186818,
      "loss": 0.3691,
      "step": 21980
    },
    {
      "gate_value": 0.3396041989326477,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 21980
    },
    {
      "grad_norm": 8.141767501831055,
      "learning_rate": 0.00013774904862888792,
      "loss": 0.3897,
      "step": 21990
    },
    {
      "gate_value": 0.33965280652046204,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 21990
    },
    {
      "grad_norm": 23.583955764770508,
      "learning_rate": 0.00013762545688931737,
      "loss": 0.3658,
      "step": 22000
    },
    {
      "gate_value": 0.3397328555583954,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 22000
    },
    {
      "grad_norm": 7.479121685028076,
      "learning_rate": 0.00013750187360763038,
      "loss": 0.3654,
      "step": 22010
    },
    {
      "gate_value": 0.33978742361068726,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 22010
    },
    {
      "grad_norm": 3.828936815261841,
      "learning_rate": 0.0001373782988682949,
      "loss": 0.3588,
      "step": 22020
    },
    {
      "gate_value": 0.3398081064224243,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 22020
    },
    {
      "grad_norm": 12.60776138305664,
      "learning_rate": 0.00013725473275577314,
      "loss": 0.379,
      "step": 22030
    },
    {
      "gate_value": 0.3398773670196533,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 22030
    },
    {
      "grad_norm": 5.787103652954102,
      "learning_rate": 0.00013713117535452135,
      "loss": 0.3601,
      "step": 22040
    },
    {
      "gate_value": 0.3399393856525421,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 22040
    },
    {
      "grad_norm": 3.2277674674987793,
      "learning_rate": 0.00013700762674898992,
      "loss": 0.3653,
      "step": 22050
    },
    {
      "gate_value": 0.3401627838611603,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 22050
    },
    {
      "grad_norm": 21.32828712463379,
      "learning_rate": 0.00013688408702362308,
      "loss": 0.3684,
      "step": 22060
    },
    {
      "gate_value": 0.34032222628593445,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 22060
    },
    {
      "grad_norm": 9.054245948791504,
      "learning_rate": 0.00013676055626285903,
      "loss": 0.3559,
      "step": 22070
    },
    {
      "gate_value": 0.34050172567367554,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 22070
    },
    {
      "grad_norm": 3.0994348526000977,
      "learning_rate": 0.00013663703455112994,
      "loss": 0.3766,
      "step": 22080
    },
    {
      "gate_value": 0.3407020568847656,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 22080
    },
    {
      "grad_norm": 10.302519798278809,
      "learning_rate": 0.00013651352197286177,
      "loss": 0.3635,
      "step": 22090
    },
    {
      "gate_value": 0.34067562222480774,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 22090
    },
    {
      "grad_norm": 9.207005500793457,
      "learning_rate": 0.0001363900186124741,
      "loss": 0.3746,
      "step": 22100
    },
    {
      "gate_value": 0.34073901176452637,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 22100
    },
    {
      "grad_norm": 18.054229736328125,
      "learning_rate": 0.00013626652455438044,
      "loss": 0.3584,
      "step": 22110
    },
    {
      "gate_value": 0.3407975435256958,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 22110
    },
    {
      "grad_norm": 62.869850158691406,
      "learning_rate": 0.0001361430398829877,
      "loss": 0.3601,
      "step": 22120
    },
    {
      "gate_value": 0.3408588171005249,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 22120
    },
    {
      "grad_norm": 7.5538835525512695,
      "learning_rate": 0.00013601956468269657,
      "loss": 0.3597,
      "step": 22130
    },
    {
      "gate_value": 0.3410506248474121,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 22130
    },
    {
      "grad_norm": 2.9426703453063965,
      "learning_rate": 0.00013589609903790119,
      "loss": 0.3644,
      "step": 22140
    },
    {
      "gate_value": 0.34119319915771484,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 22140
    },
    {
      "grad_norm": 12.853556632995605,
      "learning_rate": 0.00013577264303298907,
      "loss": 0.3637,
      "step": 22150
    },
    {
      "gate_value": 0.34135085344314575,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 22150
    },
    {
      "grad_norm": 4.324552536010742,
      "learning_rate": 0.00013564919675234128,
      "loss": 0.3793,
      "step": 22160
    },
    {
      "gate_value": 0.34146222472190857,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 22160
    },
    {
      "grad_norm": 5.883782386779785,
      "learning_rate": 0.00013552576028033218,
      "loss": 0.3737,
      "step": 22170
    },
    {
      "gate_value": 0.3415074646472931,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 22170
    },
    {
      "grad_norm": 10.931557655334473,
      "learning_rate": 0.00013540233370132944,
      "loss": 0.3589,
      "step": 22180
    },
    {
      "gate_value": 0.3415931165218353,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 22180
    },
    {
      "grad_norm": 17.461891174316406,
      "learning_rate": 0.000135278917099694,
      "loss": 0.3777,
      "step": 22190
    },
    {
      "gate_value": 0.3416183888912201,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 22190
    },
    {
      "grad_norm": 3.816490650177002,
      "learning_rate": 0.00013515551055977987,
      "loss": 0.3762,
      "step": 22200
    },
    {
      "gate_value": 0.341708779335022,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 22200
    },
    {
      "grad_norm": 11.506010055541992,
      "learning_rate": 0.00013503211416593435,
      "loss": 0.3672,
      "step": 22210
    },
    {
      "gate_value": 0.34182965755462646,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 22210
    },
    {
      "grad_norm": 7.808537483215332,
      "learning_rate": 0.00013490872800249763,
      "loss": 0.367,
      "step": 22220
    },
    {
      "gate_value": 0.3420865833759308,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 22220
    },
    {
      "grad_norm": 6.892821788787842,
      "learning_rate": 0.000134785352153803,
      "loss": 0.3692,
      "step": 22230
    },
    {
      "gate_value": 0.34221896529197693,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 22230
    },
    {
      "grad_norm": 30.9285888671875,
      "learning_rate": 0.0001346619867041768,
      "loss": 0.3762,
      "step": 22240
    },
    {
      "gate_value": 0.34225592017173767,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 22240
    },
    {
      "grad_norm": 103.91609191894531,
      "learning_rate": 0.00013453863173793797,
      "loss": 0.3689,
      "step": 22250
    },
    {
      "gate_value": 0.3423255681991577,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 22250
    },
    {
      "grad_norm": 37.15800094604492,
      "learning_rate": 0.0001344152873393986,
      "loss": 0.361,
      "step": 22260
    },
    {
      "gate_value": 0.3423844873905182,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 22260
    },
    {
      "grad_norm": 10.238519668579102,
      "learning_rate": 0.00013429195359286332,
      "loss": 0.3649,
      "step": 22270
    },
    {
      "gate_value": 0.3424571454524994,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 22270
    },
    {
      "grad_norm": 6.922329902648926,
      "learning_rate": 0.00013416863058262967,
      "loss": 0.3755,
      "step": 22280
    },
    {
      "gate_value": 0.34252744913101196,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 22280
    },
    {
      "grad_norm": 4.249993801116943,
      "learning_rate": 0.00013404531839298774,
      "loss": 0.3693,
      "step": 22290
    },
    {
      "gate_value": 0.3425276577472687,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 22290
    },
    {
      "grad_norm": 2.9230473041534424,
      "learning_rate": 0.00013392201710822022,
      "loss": 0.3579,
      "step": 22300
    },
    {
      "gate_value": 0.34264761209487915,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 22300
    },
    {
      "grad_norm": 19.932083129882812,
      "learning_rate": 0.00013379872681260245,
      "loss": 0.3587,
      "step": 22310
    },
    {
      "gate_value": 0.3429376780986786,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 22310
    },
    {
      "grad_norm": 14.836755752563477,
      "learning_rate": 0.0001336754475904021,
      "loss": 0.3836,
      "step": 22320
    },
    {
      "gate_value": 0.3430907726287842,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 22320
    },
    {
      "grad_norm": 8.244352340698242,
      "learning_rate": 0.00013355217952587943,
      "loss": 0.3618,
      "step": 22330
    },
    {
      "gate_value": 0.343147337436676,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 22330
    },
    {
      "grad_norm": 18.00697898864746,
      "learning_rate": 0.00013342892270328696,
      "loss": 0.3703,
      "step": 22340
    },
    {
      "gate_value": 0.3431670665740967,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 22340
    },
    {
      "grad_norm": 8.288820266723633,
      "learning_rate": 0.0001333056772068695,
      "loss": 0.3639,
      "step": 22350
    },
    {
      "gate_value": 0.343341201543808,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 22350
    },
    {
      "grad_norm": 4.5525288581848145,
      "learning_rate": 0.0001331824431208643,
      "loss": 0.3646,
      "step": 22360
    },
    {
      "gate_value": 0.34348997473716736,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 22360
    },
    {
      "grad_norm": 20.53244400024414,
      "learning_rate": 0.00013305922052950063,
      "loss": 0.3857,
      "step": 22370
    },
    {
      "gate_value": 0.3435961902141571,
      "icl_sequence_length": 58,
      "num_contexts": 3,
      "step": 22370
    },
    {
      "grad_norm": 4.72353458404541,
      "learning_rate": 0.00013293600951699996,
      "loss": 0.3781,
      "step": 22380
    },
    {
      "gate_value": 0.3436838984489441,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 22380
    },
    {
      "grad_norm": 5.0068135261535645,
      "learning_rate": 0.00013281281016757593,
      "loss": 0.36,
      "step": 22390
    },
    {
      "gate_value": 0.3438510298728943,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 22390
    },
    {
      "grad_norm": 4.280457496643066,
      "learning_rate": 0.00013268962256543404,
      "loss": 0.3698,
      "step": 22400
    },
    {
      "gate_value": 0.3440343141555786,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 22400
    },
    {
      "grad_norm": 38.63629913330078,
      "learning_rate": 0.00013256644679477195,
      "loss": 0.3626,
      "step": 22410
    },
    {
      "gate_value": 0.3442992568016052,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 22410
    },
    {
      "grad_norm": 16.1657657623291,
      "learning_rate": 0.00013244328293977913,
      "loss": 0.3695,
      "step": 22420
    },
    {
      "gate_value": 0.34445565938949585,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 22420
    },
    {
      "grad_norm": 11.632641792297363,
      "learning_rate": 0.00013232013108463678,
      "loss": 0.3674,
      "step": 22430
    },
    {
      "gate_value": 0.3445255756378174,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 22430
    },
    {
      "grad_norm": 13.374603271484375,
      "learning_rate": 0.00013219699131351815,
      "loss": 0.3747,
      "step": 22440
    },
    {
      "gate_value": 0.34462597966194153,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 22440
    },
    {
      "grad_norm": 35.441802978515625,
      "learning_rate": 0.00013207386371058807,
      "loss": 0.3587,
      "step": 22450
    },
    {
      "gate_value": 0.3447570502758026,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 22450
    },
    {
      "grad_norm": 7.294458389282227,
      "learning_rate": 0.00013195074836000313,
      "loss": 0.3637,
      "step": 22460
    },
    {
      "gate_value": 0.3447839617729187,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 22460
    },
    {
      "grad_norm": 5.761556625366211,
      "learning_rate": 0.00013182764534591147,
      "loss": 0.3711,
      "step": 22470
    },
    {
      "gate_value": 0.34476137161254883,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 22470
    },
    {
      "grad_norm": 10.302973747253418,
      "learning_rate": 0.00013170455475245284,
      "loss": 0.3596,
      "step": 22480
    },
    {
      "gate_value": 0.3446881175041199,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 22480
    },
    {
      "grad_norm": 53.81081771850586,
      "learning_rate": 0.00013158147666375857,
      "loss": 0.3793,
      "step": 22490
    },
    {
      "gate_value": 0.3447395861148834,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 22490
    },
    {
      "grad_norm": 7.093109607696533,
      "learning_rate": 0.00013145841116395132,
      "loss": 0.3738,
      "step": 22500
    },
    {
      "gate_value": 0.3448319733142853,
      "icl_sequence_length": 58,
      "num_contexts": 3,
      "step": 22500
    },
    {
      "grad_norm": 8.175000190734863,
      "learning_rate": 0.00013133535833714522,
      "loss": 0.3624,
      "step": 22510
    },
    {
      "gate_value": 0.3448798656463623,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 22510
    },
    {
      "grad_norm": 8.24658489227295,
      "learning_rate": 0.0001312123182674457,
      "loss": 0.3696,
      "step": 22520
    },
    {
      "gate_value": 0.34493088722229004,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 22520
    },
    {
      "grad_norm": 9.602054595947266,
      "learning_rate": 0.00013108929103894943,
      "loss": 0.3786,
      "step": 22530
    },
    {
      "gate_value": 0.3449787199497223,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 22530
    },
    {
      "grad_norm": 14.278233528137207,
      "learning_rate": 0.00013096627673574445,
      "loss": 0.3816,
      "step": 22540
    },
    {
      "gate_value": 0.34499263763427734,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 22540
    },
    {
      "grad_norm": 19.222492218017578,
      "learning_rate": 0.00013084327544190982,
      "loss": 0.3667,
      "step": 22550
    },
    {
      "gate_value": 0.3449796736240387,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 22550
    },
    {
      "grad_norm": 24.835494995117188,
      "learning_rate": 0.00013072028724151583,
      "loss": 0.3662,
      "step": 22560
    },
    {
      "gate_value": 0.345020592212677,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 22560
    },
    {
      "grad_norm": 7.1763153076171875,
      "learning_rate": 0.00013059731221862366,
      "loss": 0.3691,
      "step": 22570
    },
    {
      "gate_value": 0.3450656235218048,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 22570
    },
    {
      "grad_norm": 4.902071952819824,
      "learning_rate": 0.00013047435045728567,
      "loss": 0.3872,
      "step": 22580
    },
    {
      "gate_value": 0.34517911076545715,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 22580
    },
    {
      "grad_norm": 19.429061889648438,
      "learning_rate": 0.000130351402041545,
      "loss": 0.3782,
      "step": 22590
    },
    {
      "gate_value": 0.3452775776386261,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 22590
    },
    {
      "grad_norm": 30.76957893371582,
      "learning_rate": 0.00013022846705543578,
      "loss": 0.3711,
      "step": 22600
    },
    {
      "gate_value": 0.3454075753688812,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 22600
    },
    {
      "grad_norm": 36.29285430908203,
      "learning_rate": 0.00013010554558298294,
      "loss": 0.3873,
      "step": 22610
    },
    {
      "gate_value": 0.34549373388290405,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 22610
    },
    {
      "grad_norm": 15.497030258178711,
      "learning_rate": 0.00012998263770820206,
      "loss": 0.3655,
      "step": 22620
    },
    {
      "gate_value": 0.3456413447856903,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 22620
    },
    {
      "grad_norm": 9.922740936279297,
      "learning_rate": 0.00012985974351509955,
      "loss": 0.3735,
      "step": 22630
    },
    {
      "gate_value": 0.3457619249820709,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 22630
    },
    {
      "grad_norm": 6.184462070465088,
      "learning_rate": 0.00012973686308767244,
      "loss": 0.3595,
      "step": 22640
    },
    {
      "gate_value": 0.34587883949279785,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 22640
    },
    {
      "grad_norm": 8.652168273925781,
      "learning_rate": 0.0001296139965099083,
      "loss": 0.3737,
      "step": 22650
    },
    {
      "gate_value": 0.34601011872291565,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 22650
    },
    {
      "grad_norm": 4.410395622253418,
      "learning_rate": 0.00012949114386578538,
      "loss": 0.3758,
      "step": 22660
    },
    {
      "gate_value": 0.3462025225162506,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 22660
    },
    {
      "grad_norm": 7.113274574279785,
      "learning_rate": 0.00012936830523927218,
      "loss": 0.3733,
      "step": 22670
    },
    {
      "gate_value": 0.3464626967906952,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 22670
    },
    {
      "grad_norm": 3.876377820968628,
      "learning_rate": 0.00012924548071432783,
      "loss": 0.3706,
      "step": 22680
    },
    {
      "gate_value": 0.34653839468955994,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 22680
    },
    {
      "grad_norm": 24.962556838989258,
      "learning_rate": 0.00012912267037490174,
      "loss": 0.3503,
      "step": 22690
    },
    {
      "gate_value": 0.34656399488449097,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 22690
    },
    {
      "grad_norm": 5.596436023712158,
      "learning_rate": 0.0001289998743049336,
      "loss": 0.3641,
      "step": 22700
    },
    {
      "gate_value": 0.3467022776603699,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 22700
    },
    {
      "grad_norm": 9.493281364440918,
      "learning_rate": 0.00012887709258835328,
      "loss": 0.372,
      "step": 22710
    },
    {
      "gate_value": 0.34704118967056274,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 22710
    },
    {
      "grad_norm": 15.705657005310059,
      "learning_rate": 0.00012875432530908107,
      "loss": 0.3802,
      "step": 22720
    },
    {
      "gate_value": 0.3472428023815155,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 22720
    },
    {
      "grad_norm": 26.376834869384766,
      "learning_rate": 0.0001286315725510271,
      "loss": 0.364,
      "step": 22730
    },
    {
      "gate_value": 0.34734347462654114,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 22730
    },
    {
      "grad_norm": 6.039391040802002,
      "learning_rate": 0.00012850883439809188,
      "loss": 0.3738,
      "step": 22740
    },
    {
      "gate_value": 0.3473823666572571,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 22740
    },
    {
      "grad_norm": 114.58489990234375,
      "learning_rate": 0.00012838611093416564,
      "loss": 0.3636,
      "step": 22750
    },
    {
      "gate_value": 0.3474694490432739,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 22750
    },
    {
      "grad_norm": 347.19915771484375,
      "learning_rate": 0.00012826340224312874,
      "loss": 0.3635,
      "step": 22760
    },
    {
      "gate_value": 0.34752917289733887,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 22760
    },
    {
      "grad_norm": 15.640114784240723,
      "learning_rate": 0.00012814070840885152,
      "loss": 0.347,
      "step": 22770
    },
    {
      "gate_value": 0.34775689244270325,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 22770
    },
    {
      "grad_norm": 11.231976509094238,
      "learning_rate": 0.00012801802951519393,
      "loss": 0.3707,
      "step": 22780
    },
    {
      "gate_value": 0.34792718291282654,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 22780
    },
    {
      "grad_norm": 62.72665023803711,
      "learning_rate": 0.00012789536564600595,
      "loss": 0.3661,
      "step": 22790
    },
    {
      "gate_value": 0.34802743792533875,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 22790
    },
    {
      "grad_norm": 10.408900260925293,
      "learning_rate": 0.0001277727168851271,
      "loss": 0.3529,
      "step": 22800
    },
    {
      "gate_value": 0.34815752506256104,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 22800
    },
    {
      "grad_norm": 8.37360954284668,
      "learning_rate": 0.00012765008331638663,
      "loss": 0.3933,
      "step": 22810
    },
    {
      "gate_value": 0.34828460216522217,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 22810
    },
    {
      "grad_norm": 6.180023193359375,
      "learning_rate": 0.00012752746502360347,
      "loss": 0.3553,
      "step": 22820
    },
    {
      "gate_value": 0.3484150171279907,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 22820
    },
    {
      "grad_norm": 6.112893104553223,
      "learning_rate": 0.00012740486209058608,
      "loss": 0.3482,
      "step": 22830
    },
    {
      "gate_value": 0.3484891951084137,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 22830
    },
    {
      "grad_norm": 104.99455261230469,
      "learning_rate": 0.0001272822746011324,
      "loss": 0.3768,
      "step": 22840
    },
    {
      "gate_value": 0.3485853672027588,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 22840
    },
    {
      "grad_norm": 9.214521408081055,
      "learning_rate": 0.00012715970263902978,
      "loss": 0.3643,
      "step": 22850
    },
    {
      "gate_value": 0.3486069142818451,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 22850
    },
    {
      "grad_norm": 29.462339401245117,
      "learning_rate": 0.00012703714628805503,
      "loss": 0.3769,
      "step": 22860
    },
    {
      "gate_value": 0.34865206480026245,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 22860
    },
    {
      "grad_norm": 24.155452728271484,
      "learning_rate": 0.0001269146056319743,
      "loss": 0.3508,
      "step": 22870
    },
    {
      "gate_value": 0.3487738072872162,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 22870
    },
    {
      "grad_norm": 51.79835891723633,
      "learning_rate": 0.00012679208075454292,
      "loss": 0.3564,
      "step": 22880
    },
    {
      "gate_value": 0.34891486167907715,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 22880
    },
    {
      "grad_norm": 5.804798603057861,
      "learning_rate": 0.00012666957173950558,
      "loss": 0.372,
      "step": 22890
    },
    {
      "gate_value": 0.34904196858406067,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 22890
    },
    {
      "grad_norm": 16.304189682006836,
      "learning_rate": 0.0001265470786705959,
      "loss": 0.3567,
      "step": 22900
    },
    {
      "gate_value": 0.34922027587890625,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 22900
    },
    {
      "grad_norm": 4.497400760650635,
      "learning_rate": 0.00012642460163153678,
      "loss": 0.3704,
      "step": 22910
    },
    {
      "gate_value": 0.34943127632141113,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 22910
    },
    {
      "grad_norm": 17.74970245361328,
      "learning_rate": 0.00012630214070604017,
      "loss": 0.3658,
      "step": 22920
    },
    {
      "gate_value": 0.3495607078075409,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 22920
    },
    {
      "grad_norm": 13.01252269744873,
      "learning_rate": 0.00012617969597780693,
      "loss": 0.3718,
      "step": 22930
    },
    {
      "gate_value": 0.3495199382305145,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 22930
    },
    {
      "grad_norm": 3.868537425994873,
      "learning_rate": 0.00012605726753052687,
      "loss": 0.3695,
      "step": 22940
    },
    {
      "gate_value": 0.3495273292064667,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 22940
    },
    {
      "grad_norm": 487.12237548828125,
      "learning_rate": 0.00012593485544787868,
      "loss": 0.3789,
      "step": 22950
    },
    {
      "gate_value": 0.3496508002281189,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 22950
    },
    {
      "grad_norm": 3.913041830062866,
      "learning_rate": 0.00012581245981352986,
      "loss": 0.3912,
      "step": 22960
    },
    {
      "gate_value": 0.34978440403938293,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 22960
    },
    {
      "grad_norm": 25.10056495666504,
      "learning_rate": 0.00012569008071113672,
      "loss": 0.3656,
      "step": 22970
    },
    {
      "gate_value": 0.34976524114608765,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 22970
    },
    {
      "grad_norm": 39.096656799316406,
      "learning_rate": 0.0001255677182243442,
      "loss": 0.364,
      "step": 22980
    },
    {
      "gate_value": 0.3497866094112396,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 22980
    },
    {
      "grad_norm": 6.127499103546143,
      "learning_rate": 0.00012544537243678583,
      "loss": 0.361,
      "step": 22990
    },
    {
      "gate_value": 0.34986892342567444,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 22990
    },
    {
      "grad_norm": 7.031398296356201,
      "learning_rate": 0.0001253230434320839,
      "loss": 0.3651,
      "step": 23000
    },
    {
      "gate_value": 0.34987005591392517,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 23000
    },
    {
      "grad_norm": 11.885310173034668,
      "learning_rate": 0.00012520073129384908,
      "loss": 0.3806,
      "step": 23010
    },
    {
      "gate_value": 0.34989798069000244,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 23010
    },
    {
      "grad_norm": 10.68913459777832,
      "learning_rate": 0.00012507843610568058,
      "loss": 0.3606,
      "step": 23020
    },
    {
      "gate_value": 0.3498261868953705,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 23020
    },
    {
      "grad_norm": 6.32187032699585,
      "learning_rate": 0.000124956157951166,
      "loss": 0.3536,
      "step": 23030
    },
    {
      "gate_value": 0.349841833114624,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 23030
    },
    {
      "grad_norm": 3.669666051864624,
      "learning_rate": 0.00012483389691388133,
      "loss": 0.366,
      "step": 23040
    },
    {
      "gate_value": 0.3500032126903534,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 23040
    },
    {
      "grad_norm": 4.756706237792969,
      "learning_rate": 0.00012471165307739078,
      "loss": 0.3562,
      "step": 23050
    },
    {
      "gate_value": 0.3502030074596405,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 23050
    },
    {
      "grad_norm": 6.585669994354248,
      "learning_rate": 0.0001245894265252469,
      "loss": 0.358,
      "step": 23060
    },
    {
      "gate_value": 0.35031023621559143,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 23060
    },
    {
      "grad_norm": 4.768988609313965,
      "learning_rate": 0.00012446721734099046,
      "loss": 0.3816,
      "step": 23070
    },
    {
      "gate_value": 0.35037532448768616,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 23070
    },
    {
      "grad_norm": 7.378237724304199,
      "learning_rate": 0.00012434502560815017,
      "loss": 0.3819,
      "step": 23080
    },
    {
      "gate_value": 0.3505246937274933,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 23080
    },
    {
      "grad_norm": 15.408870697021484,
      "learning_rate": 0.00012422285141024293,
      "loss": 0.3707,
      "step": 23090
    },
    {
      "gate_value": 0.35071197152137756,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 23090
    },
    {
      "grad_norm": 6.119144439697266,
      "learning_rate": 0.0001241006948307737,
      "loss": 0.3516,
      "step": 23100
    },
    {
      "gate_value": 0.3509138226509094,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 23100
    },
    {
      "grad_norm": 19.70357894897461,
      "learning_rate": 0.00012397855595323534,
      "loss": 0.3648,
      "step": 23110
    },
    {
      "gate_value": 0.35107842087745667,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 23110
    },
    {
      "grad_norm": 4.001468181610107,
      "learning_rate": 0.00012385643486110864,
      "loss": 0.3521,
      "step": 23120
    },
    {
      "gate_value": 0.3513704538345337,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 23120
    },
    {
      "grad_norm": 6.317371845245361,
      "learning_rate": 0.00012373433163786216,
      "loss": 0.3623,
      "step": 23130
    },
    {
      "gate_value": 0.35159164667129517,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 23130
    },
    {
      "grad_norm": 7.321961879730225,
      "learning_rate": 0.00012361224636695236,
      "loss": 0.3586,
      "step": 23140
    },
    {
      "gate_value": 0.35160747170448303,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 23140
    },
    {
      "grad_norm": 20.925722122192383,
      "learning_rate": 0.0001234901791318233,
      "loss": 0.3752,
      "step": 23150
    },
    {
      "gate_value": 0.3516046404838562,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 23150
    },
    {
      "grad_norm": 11.033568382263184,
      "learning_rate": 0.00012336813001590684,
      "loss": 0.369,
      "step": 23160
    },
    {
      "gate_value": 0.3517070412635803,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 23160
    },
    {
      "grad_norm": 6.5204339027404785,
      "learning_rate": 0.0001232460991026225,
      "loss": 0.3717,
      "step": 23170
    },
    {
      "gate_value": 0.3518829345703125,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 23170
    },
    {
      "grad_norm": 8.151719093322754,
      "learning_rate": 0.00012312408647537705,
      "loss": 0.3689,
      "step": 23180
    },
    {
      "gate_value": 0.35193541646003723,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 23180
    },
    {
      "grad_norm": 4.423820495605469,
      "learning_rate": 0.00012300209221756506,
      "loss": 0.3564,
      "step": 23190
    },
    {
      "gate_value": 0.35213330388069153,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 23190
    },
    {
      "grad_norm": 34.13200378417969,
      "learning_rate": 0.0001228801164125685,
      "loss": 0.3685,
      "step": 23200
    },
    {
      "gate_value": 0.3523845374584198,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 23200
    },
    {
      "grad_norm": 10.104931831359863,
      "learning_rate": 0.00012275815914375662,
      "loss": 0.3572,
      "step": 23210
    },
    {
      "gate_value": 0.3526081144809723,
      "icl_sequence_length": 58,
      "num_contexts": 3,
      "step": 23210
    },
    {
      "grad_norm": 8.901763916015625,
      "learning_rate": 0.00012263622049448614,
      "loss": 0.383,
      "step": 23220
    },
    {
      "gate_value": 0.35284721851348877,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 23220
    },
    {
      "grad_norm": 7.556049346923828,
      "learning_rate": 0.00012251430054810086,
      "loss": 0.3847,
      "step": 23230
    },
    {
      "gate_value": 0.3529858887195587,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 23230
    },
    {
      "grad_norm": 21.75843048095703,
      "learning_rate": 0.00012239239938793204,
      "loss": 0.3719,
      "step": 23240
    },
    {
      "gate_value": 0.3530533015727997,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 23240
    },
    {
      "grad_norm": 80.72978973388672,
      "learning_rate": 0.00012227051709729785,
      "loss": 0.3807,
      "step": 23250
    },
    {
      "gate_value": 0.35311317443847656,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 23250
    },
    {
      "grad_norm": 8.681241989135742,
      "learning_rate": 0.00012214865375950385,
      "loss": 0.3587,
      "step": 23260
    },
    {
      "gate_value": 0.3531028926372528,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 23260
    },
    {
      "grad_norm": 34.17095184326172,
      "learning_rate": 0.0001220268094578423,
      "loss": 0.3773,
      "step": 23270
    },
    {
      "gate_value": 0.3529850244522095,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 23270
    },
    {
      "grad_norm": 9.34839916229248,
      "learning_rate": 0.00012190498427559274,
      "loss": 0.3868,
      "step": 23280
    },
    {
      "gate_value": 0.35284411907196045,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 23280
    },
    {
      "grad_norm": 5.790085315704346,
      "learning_rate": 0.0001217831782960215,
      "loss": 0.3751,
      "step": 23290
    },
    {
      "gate_value": 0.3530334234237671,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 23290
    },
    {
      "grad_norm": 46.09357833862305,
      "learning_rate": 0.00012166139160238184,
      "loss": 0.3757,
      "step": 23300
    },
    {
      "gate_value": 0.35317593812942505,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 23300
    },
    {
      "grad_norm": 5.113748550415039,
      "learning_rate": 0.00012153962427791376,
      "loss": 0.3573,
      "step": 23310
    },
    {
      "gate_value": 0.35320231318473816,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 23310
    },
    {
      "grad_norm": 34.75386047363281,
      "learning_rate": 0.00012141787640584418,
      "loss": 0.368,
      "step": 23320
    },
    {
      "gate_value": 0.35337674617767334,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 23320
    },
    {
      "grad_norm": 7.989880084991455,
      "learning_rate": 0.00012129614806938652,
      "loss": 0.3824,
      "step": 23330
    },
    {
      "gate_value": 0.3536171317100525,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 23330
    },
    {
      "grad_norm": 6.5942840576171875,
      "learning_rate": 0.00012117443935174101,
      "loss": 0.3701,
      "step": 23340
    },
    {
      "gate_value": 0.3538232743740082,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 23340
    },
    {
      "grad_norm": 14.675801277160645,
      "learning_rate": 0.00012105275033609445,
      "loss": 0.3856,
      "step": 23350
    },
    {
      "gate_value": 0.3539339303970337,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 23350
    },
    {
      "grad_norm": 26.552522659301758,
      "learning_rate": 0.00012093108110562001,
      "loss": 0.3625,
      "step": 23360
    },
    {
      "gate_value": 0.35394522547721863,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 23360
    },
    {
      "grad_norm": 5.980011940002441,
      "learning_rate": 0.00012080943174347752,
      "loss": 0.3599,
      "step": 23370
    },
    {
      "gate_value": 0.3539412319660187,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 23370
    },
    {
      "grad_norm": 8.659141540527344,
      "learning_rate": 0.00012068780233281322,
      "loss": 0.3612,
      "step": 23380
    },
    {
      "gate_value": 0.3540070652961731,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 23380
    },
    {
      "grad_norm": 40.22635269165039,
      "learning_rate": 0.00012056619295675959,
      "loss": 0.3601,
      "step": 23390
    },
    {
      "gate_value": 0.35405296087265015,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 23390
    },
    {
      "grad_norm": 6.858108997344971,
      "learning_rate": 0.00012044460369843556,
      "loss": 0.367,
      "step": 23400
    },
    {
      "gate_value": 0.354130357503891,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 23400
    },
    {
      "grad_norm": 7.824497222900391,
      "learning_rate": 0.00012032303464094619,
      "loss": 0.376,
      "step": 23410
    },
    {
      "gate_value": 0.3540987968444824,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 23410
    },
    {
      "grad_norm": 13.613783836364746,
      "learning_rate": 0.00012020148586738284,
      "loss": 0.3676,
      "step": 23420
    },
    {
      "gate_value": 0.3542543947696686,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 23420
    },
    {
      "grad_norm": 10.455016136169434,
      "learning_rate": 0.00012007995746082288,
      "loss": 0.3565,
      "step": 23430
    },
    {
      "gate_value": 0.3542884290218353,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 23430
    },
    {
      "grad_norm": 5.269458770751953,
      "learning_rate": 0.0001199584495043299,
      "loss": 0.3641,
      "step": 23440
    },
    {
      "gate_value": 0.3544134199619293,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 23440
    },
    {
      "grad_norm": 18.11334228515625,
      "learning_rate": 0.00011983696208095342,
      "loss": 0.3571,
      "step": 23450
    },
    {
      "gate_value": 0.35484281182289124,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 23450
    },
    {
      "grad_norm": 33.68678665161133,
      "learning_rate": 0.0001197154952737289,
      "loss": 0.3826,
      "step": 23460
    },
    {
      "gate_value": 0.3551397919654846,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 23460
    },
    {
      "grad_norm": 11.15624713897705,
      "learning_rate": 0.0001195940491656778,
      "loss": 0.3778,
      "step": 23470
    },
    {
      "gate_value": 0.35536882281303406,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 23470
    },
    {
      "grad_norm": 108.3549575805664,
      "learning_rate": 0.00011947262383980739,
      "loss": 0.3563,
      "step": 23480
    },
    {
      "gate_value": 0.35545867681503296,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 23480
    },
    {
      "grad_norm": 12.360928535461426,
      "learning_rate": 0.00011935121937911072,
      "loss": 0.3551,
      "step": 23490
    },
    {
      "gate_value": 0.3555486500263214,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 23490
    },
    {
      "grad_norm": 5.003032684326172,
      "learning_rate": 0.00011922983586656662,
      "loss": 0.3574,
      "step": 23500
    },
    {
      "gate_value": 0.3557662069797516,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 23500
    },
    {
      "grad_norm": 4.80068302154541,
      "learning_rate": 0.00011910847338513953,
      "loss": 0.3841,
      "step": 23510
    },
    {
      "gate_value": 0.35597875714302063,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 23510
    },
    {
      "grad_norm": 6.734328746795654,
      "learning_rate": 0.00011898713201777963,
      "loss": 0.3725,
      "step": 23520
    },
    {
      "gate_value": 0.3559677302837372,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 23520
    },
    {
      "grad_norm": 12.359806060791016,
      "learning_rate": 0.00011886581184742252,
      "loss": 0.3892,
      "step": 23530
    },
    {
      "gate_value": 0.35603559017181396,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 23530
    },
    {
      "grad_norm": 7.419463634490967,
      "learning_rate": 0.00011874451295698951,
      "loss": 0.3613,
      "step": 23540
    },
    {
      "gate_value": 0.35623571276664734,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 23540
    },
    {
      "grad_norm": 11.626155853271484,
      "learning_rate": 0.00011862323542938713,
      "loss": 0.3709,
      "step": 23550
    },
    {
      "gate_value": 0.35649681091308594,
      "icl_sequence_length": 56,
      "num_contexts": 3,
      "step": 23550
    },
    {
      "grad_norm": 4.980889797210693,
      "learning_rate": 0.00011850197934750746,
      "loss": 0.3743,
      "step": 23560
    },
    {
      "gate_value": 0.3565058708190918,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 23560
    },
    {
      "grad_norm": 7.546865940093994,
      "learning_rate": 0.00011838074479422787,
      "loss": 0.3708,
      "step": 23570
    },
    {
      "gate_value": 0.3565681278705597,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 23570
    },
    {
      "grad_norm": 8.491681098937988,
      "learning_rate": 0.0001182595318524111,
      "loss": 0.3679,
      "step": 23580
    },
    {
      "gate_value": 0.35658058524131775,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 23580
    },
    {
      "grad_norm": 10.987086296081543,
      "learning_rate": 0.000118138340604905,
      "loss": 0.3723,
      "step": 23590
    },
    {
      "gate_value": 0.35657426714897156,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 23590
    },
    {
      "grad_norm": 8.058330535888672,
      "learning_rate": 0.00011801717113454266,
      "loss": 0.3835,
      "step": 23600
    },
    {
      "gate_value": 0.35659709572792053,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 23600
    },
    {
      "grad_norm": 9.122367858886719,
      "learning_rate": 0.00011789602352414227,
      "loss": 0.3658,
      "step": 23610
    },
    {
      "gate_value": 0.35666391253471375,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 23610
    },
    {
      "grad_norm": 6.175402641296387,
      "learning_rate": 0.0001177748978565071,
      "loss": 0.3492,
      "step": 23620
    },
    {
      "gate_value": 0.3566833734512329,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 23620
    },
    {
      "grad_norm": 10.220946311950684,
      "learning_rate": 0.0001176537942144254,
      "loss": 0.38,
      "step": 23630
    },
    {
      "gate_value": 0.3567415773868561,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 23630
    },
    {
      "grad_norm": 15.779232025146484,
      "learning_rate": 0.0001175327126806703,
      "loss": 0.3552,
      "step": 23640
    },
    {
      "gate_value": 0.3568035364151001,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 23640
    },
    {
      "grad_norm": 9.174988746643066,
      "learning_rate": 0.00011741165333799996,
      "loss": 0.3688,
      "step": 23650
    },
    {
      "gate_value": 0.35696929693222046,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 23650
    },
    {
      "grad_norm": 11.645748138427734,
      "learning_rate": 0.00011729061626915723,
      "loss": 0.3737,
      "step": 23660
    },
    {
      "gate_value": 0.35708051919937134,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 23660
    },
    {
      "grad_norm": 6.972232818603516,
      "learning_rate": 0.00011716960155686986,
      "loss": 0.3481,
      "step": 23670
    },
    {
      "gate_value": 0.3572835326194763,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 23670
    },
    {
      "grad_norm": 6.440983772277832,
      "learning_rate": 0.00011704860928385028,
      "loss": 0.36,
      "step": 23680
    },
    {
      "gate_value": 0.3574623465538025,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 23680
    },
    {
      "grad_norm": 12.463038444519043,
      "learning_rate": 0.00011692763953279552,
      "loss": 0.3795,
      "step": 23690
    },
    {
      "gate_value": 0.3576089143753052,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 23690
    },
    {
      "grad_norm": 10.757335662841797,
      "learning_rate": 0.00011680669238638731,
      "loss": 0.3795,
      "step": 23700
    },
    {
      "gate_value": 0.3578938841819763,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 23700
    },
    {
      "grad_norm": 8.984125137329102,
      "learning_rate": 0.00011668576792729182,
      "loss": 0.3545,
      "step": 23710
    },
    {
      "gate_value": 0.3580709993839264,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 23710
    },
    {
      "grad_norm": 3.919456958770752,
      "learning_rate": 0.00011656486623815987,
      "loss": 0.3886,
      "step": 23720
    },
    {
      "gate_value": 0.35818031430244446,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 23720
    },
    {
      "grad_norm": 5.882258892059326,
      "learning_rate": 0.00011644398740162659,
      "loss": 0.3733,
      "step": 23730
    },
    {
      "gate_value": 0.3582715690135956,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 23730
    },
    {
      "grad_norm": 7.3552350997924805,
      "learning_rate": 0.00011632313150031144,
      "loss": 0.3645,
      "step": 23740
    },
    {
      "gate_value": 0.35839003324508667,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 23740
    },
    {
      "grad_norm": 5.509767055511475,
      "learning_rate": 0.0001162022986168184,
      "loss": 0.3778,
      "step": 23750
    },
    {
      "gate_value": 0.35857802629470825,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 23750
    },
    {
      "grad_norm": 3.078181743621826,
      "learning_rate": 0.00011608148883373552,
      "loss": 0.3715,
      "step": 23760
    },
    {
      "gate_value": 0.3587779402732849,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 23760
    },
    {
      "grad_norm": 4.447902202606201,
      "learning_rate": 0.00011596070223363518,
      "loss": 0.3671,
      "step": 23770
    },
    {
      "gate_value": 0.3586958646774292,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 23770
    },
    {
      "grad_norm": 171.47337341308594,
      "learning_rate": 0.00011583993889907394,
      "loss": 0.3671,
      "step": 23780
    },
    {
      "gate_value": 0.358342707157135,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 23780
    },
    {
      "grad_norm": 979.877685546875,
      "learning_rate": 0.00011571919891259232,
      "loss": 0.3736,
      "step": 23790
    },
    {
      "gate_value": 0.3582010269165039,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 23790
    },
    {
      "grad_norm": 796.0517578125,
      "learning_rate": 0.00011559848235671502,
      "loss": 0.3792,
      "step": 23800
    },
    {
      "gate_value": 0.3581511974334717,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 23800
    },
    {
      "grad_norm": 504.9575500488281,
      "learning_rate": 0.00011547778931395063,
      "loss": 0.3782,
      "step": 23810
    },
    {
      "gate_value": 0.35812196135520935,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 23810
    },
    {
      "grad_norm": 93.6175537109375,
      "learning_rate": 0.00011535711986679174,
      "loss": 0.3551,
      "step": 23820
    },
    {
      "gate_value": 0.35810139775276184,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 23820
    },
    {
      "grad_norm": 245.25450134277344,
      "learning_rate": 0.00011523647409771476,
      "loss": 0.3864,
      "step": 23830
    },
    {
      "gate_value": 0.35808372497558594,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 23830
    },
    {
      "grad_norm": 345.7886657714844,
      "learning_rate": 0.00011511585208917989,
      "loss": 0.3719,
      "step": 23840
    },
    {
      "gate_value": 0.35808631777763367,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 23840
    },
    {
      "grad_norm": 258.2336730957031,
      "learning_rate": 0.00011499525392363123,
      "loss": 0.3563,
      "step": 23850
    },
    {
      "gate_value": 0.3580664396286011,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 23850
    },
    {
      "grad_norm": 555.8143310546875,
      "learning_rate": 0.00011487467968349639,
      "loss": 0.3903,
      "step": 23860
    },
    {
      "gate_value": 0.3580431044101715,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 23860
    },
    {
      "grad_norm": 139.11825561523438,
      "learning_rate": 0.00011475412945118677,
      "loss": 0.3692,
      "step": 23870
    },
    {
      "gate_value": 0.35803794860839844,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 23870
    },
    {
      "grad_norm": 123.66503143310547,
      "learning_rate": 0.00011463360330909737,
      "loss": 0.3847,
      "step": 23880
    },
    {
      "gate_value": 0.35804980993270874,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 23880
    },
    {
      "grad_norm": 554.3246459960938,
      "learning_rate": 0.00011451310133960658,
      "loss": 0.3758,
      "step": 23890
    },
    {
      "gate_value": 0.35810431838035583,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 23890
    },
    {
      "grad_norm": 50.21940231323242,
      "learning_rate": 0.00011439262362507644,
      "loss": 0.3665,
      "step": 23900
    },
    {
      "gate_value": 0.3581521511077881,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 23900
    },
    {
      "grad_norm": 32.6104736328125,
      "learning_rate": 0.00011427217024785232,
      "loss": 0.3556,
      "step": 23910
    },
    {
      "gate_value": 0.3581998348236084,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 23910
    },
    {
      "grad_norm": 239.99615478515625,
      "learning_rate": 0.00011415174129026288,
      "loss": 0.3566,
      "step": 23920
    },
    {
      "gate_value": 0.3582267165184021,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 23920
    },
    {
      "grad_norm": 405.5616455078125,
      "learning_rate": 0.00011403133683462027,
      "loss": 0.3926,
      "step": 23930
    },
    {
      "gate_value": 0.35823509097099304,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 23930
    },
    {
      "grad_norm": 1989.842529296875,
      "learning_rate": 0.00011391095696321974,
      "loss": 0.3832,
      "step": 23940
    },
    {
      "gate_value": 0.3582380712032318,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 23940
    },
    {
      "grad_norm": 7928.72412109375,
      "learning_rate": 0.00011379060175833986,
      "loss": 0.3617,
      "step": 23950
    },
    {
      "gate_value": 0.3582387864589691,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 23950
    },
    {
      "grad_norm": 1746.5020751953125,
      "learning_rate": 0.0001136702713022422,
      "loss": 0.3707,
      "step": 23960
    },
    {
      "gate_value": 0.3582383990287781,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 23960
    },
    {
      "grad_norm": 175.7129669189453,
      "learning_rate": 0.00011354996567717156,
      "loss": 0.3878,
      "step": 23970
    },
    {
      "gate_value": 0.3582378029823303,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 23970
    },
    {
      "grad_norm": 361.2430419921875,
      "learning_rate": 0.00011342968496535568,
      "loss": 0.3773,
      "step": 23980
    },
    {
      "gate_value": 0.3582334816455841,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 23980
    },
    {
      "grad_norm": 293.09912109375,
      "learning_rate": 0.00011330942924900529,
      "loss": 0.3734,
      "step": 23990
    },
    {
      "gate_value": 0.3582324683666229,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 23990
    },
    {
      "grad_norm": 143.7857666015625,
      "learning_rate": 0.00011318919861031403,
      "loss": 0.3632,
      "step": 24000
    },
    {
      "gate_value": 0.35823488235473633,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 24000
    },
    {
      "grad_norm": 1418.47119140625,
      "learning_rate": 0.00011306899313145848,
      "loss": 0.3703,
      "step": 24010
    },
    {
      "gate_value": 0.3582359850406647,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 24010
    },
    {
      "grad_norm": 150.08474731445312,
      "learning_rate": 0.00011294881289459782,
      "loss": 0.3712,
      "step": 24020
    },
    {
      "gate_value": 0.3582356572151184,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 24020
    },
    {
      "grad_norm": 91.68392944335938,
      "learning_rate": 0.00011282865798187417,
      "loss": 0.3697,
      "step": 24030
    },
    {
      "gate_value": 0.3582378625869751,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 24030
    },
    {
      "grad_norm": 155.62538146972656,
      "learning_rate": 0.00011270852847541228,
      "loss": 0.382,
      "step": 24040
    },
    {
      "gate_value": 0.3582373559474945,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 24040
    },
    {
      "grad_norm": 579.9232788085938,
      "learning_rate": 0.00011258842445731954,
      "loss": 0.352,
      "step": 24050
    },
    {
      "gate_value": 0.3582456409931183,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 24050
    },
    {
      "grad_norm": 5258.4619140625,
      "learning_rate": 0.00011246834600968594,
      "loss": 0.3899,
      "step": 24060
    },
    {
      "gate_value": 0.3582465946674347,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 24060
    },
    {
      "grad_norm": 508.95184326171875,
      "learning_rate": 0.00011234829321458392,
      "loss": 0.3826,
      "step": 24070
    },
    {
      "gate_value": 0.3582475483417511,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 24070
    },
    {
      "grad_norm": 120.9805908203125,
      "learning_rate": 0.00011222826615406848,
      "loss": 0.3579,
      "step": 24080
    },
    {
      "gate_value": 0.3582461178302765,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 24080
    },
    {
      "grad_norm": 90.79205322265625,
      "learning_rate": 0.00011210826491017692,
      "loss": 0.3545,
      "step": 24090
    },
    {
      "gate_value": 0.35825929045677185,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 24090
    },
    {
      "grad_norm": 478.6510925292969,
      "learning_rate": 0.00011198828956492907,
      "loss": 0.3541,
      "step": 24100
    },
    {
      "gate_value": 0.3582655191421509,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 24100
    },
    {
      "grad_norm": 139.04241943359375,
      "learning_rate": 0.00011186834020032682,
      "loss": 0.3758,
      "step": 24110
    },
    {
      "gate_value": 0.3582736849784851,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 24110
    },
    {
      "grad_norm": 107.1037826538086,
      "learning_rate": 0.00011174841689835446,
      "loss": 0.3903,
      "step": 24120
    },
    {
      "gate_value": 0.3582884967327118,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 24120
    },
    {
      "grad_norm": 246.1360626220703,
      "learning_rate": 0.0001116285197409785,
      "loss": 0.3665,
      "step": 24130
    },
    {
      "gate_value": 0.35830172896385193,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 24130
    },
    {
      "grad_norm": 419.01885986328125,
      "learning_rate": 0.00011150864881014744,
      "loss": 0.36,
      "step": 24140
    },
    {
      "gate_value": 0.3583115041255951,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 24140
    },
    {
      "grad_norm": 1557.142333984375,
      "learning_rate": 0.00011138880418779196,
      "loss": 0.354,
      "step": 24150
    },
    {
      "gate_value": 0.35832417011260986,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 24150
    },
    {
      "grad_norm": 193.56314086914062,
      "learning_rate": 0.00011126898595582478,
      "loss": 0.3682,
      "step": 24160
    },
    {
      "gate_value": 0.3583417236804962,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 24160
    },
    {
      "grad_norm": 2367.890625,
      "learning_rate": 0.00011114919419614045,
      "loss": 0.3756,
      "step": 24170
    },
    {
      "gate_value": 0.3583694100379944,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 24170
    },
    {
      "grad_norm": 17580.013671875,
      "learning_rate": 0.00011102942899061557,
      "loss": 0.3674,
      "step": 24180
    },
    {
      "gate_value": 0.3583917021751404,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 24180
    },
    {
      "grad_norm": 766.1683959960938,
      "learning_rate": 0.00011090969042110854,
      "loss": 0.3536,
      "step": 24190
    },
    {
      "gate_value": 0.35841184854507446,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 24190
    },
    {
      "grad_norm": 102.75213623046875,
      "learning_rate": 0.00011078997856945947,
      "loss": 0.3688,
      "step": 24200
    },
    {
      "gate_value": 0.3584311306476593,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 24200
    },
    {
      "grad_norm": 593.046630859375,
      "learning_rate": 0.00011067029351749032,
      "loss": 0.3568,
      "step": 24210
    },
    {
      "gate_value": 0.3584307134151459,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 24210
    },
    {
      "grad_norm": 256.4482116699219,
      "learning_rate": 0.00011055063534700468,
      "loss": 0.358,
      "step": 24220
    },
    {
      "gate_value": 0.3584342896938324,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 24220
    },
    {
      "grad_norm": 180.25892639160156,
      "learning_rate": 0.00011043100413978781,
      "loss": 0.3772,
      "step": 24230
    },
    {
      "gate_value": 0.3584383428096771,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 24230
    },
    {
      "grad_norm": 46.64565658569336,
      "learning_rate": 0.00011031139997760648,
      "loss": 0.3727,
      "step": 24240
    },
    {
      "gate_value": 0.35844966769218445,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 24240
    },
    {
      "grad_norm": 50854.26171875,
      "learning_rate": 0.000110191822942209,
      "loss": 0.3704,
      "step": 24250
    },
    {
      "gate_value": 0.35844776034355164,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 24250
    },
    {
      "grad_norm": 65.85983276367188,
      "learning_rate": 0.00011007227311532522,
      "loss": 0.3737,
      "step": 24260
    },
    {
      "gate_value": 0.3584485948085785,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 24260
    },
    {
      "grad_norm": 1420.6707763671875,
      "learning_rate": 0.00010995275057866624,
      "loss": 0.361,
      "step": 24270
    },
    {
      "gate_value": 0.35847553610801697,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 24270
    },
    {
      "grad_norm": 78.28116607666016,
      "learning_rate": 0.00010983325541392469,
      "loss": 0.3515,
      "step": 24280
    },
    {
      "gate_value": 0.3584812879562378,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 24280
    },
    {
      "grad_norm": 233.1837158203125,
      "learning_rate": 0.00010971378770277426,
      "loss": 0.3608,
      "step": 24290
    },
    {
      "gate_value": 0.3584992289543152,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 24290
    },
    {
      "grad_norm": 728.71630859375,
      "learning_rate": 0.00010959434752687004,
      "loss": 0.3722,
      "step": 24300
    },
    {
      "gate_value": 0.3585113286972046,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 24300
    },
    {
      "grad_norm": 5302.78564453125,
      "learning_rate": 0.00010947493496784829,
      "loss": 0.3637,
      "step": 24310
    },
    {
      "gate_value": 0.35852137207984924,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 24310
    },
    {
      "grad_norm": 34129.52734375,
      "learning_rate": 0.00010935555010732636,
      "loss": 0.358,
      "step": 24320
    },
    {
      "gate_value": 0.3585313558578491,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 24320
    },
    {
      "grad_norm": 298.13037109375,
      "learning_rate": 0.0001092361930269027,
      "loss": 0.3602,
      "step": 24330
    },
    {
      "gate_value": 0.3585515320301056,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 24330
    },
    {
      "grad_norm": 56.191341400146484,
      "learning_rate": 0.00010911686380815671,
      "loss": 0.3825,
      "step": 24340
    },
    {
      "gate_value": 0.35857197642326355,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 24340
    },
    {
      "grad_norm": 186.09190368652344,
      "learning_rate": 0.00010899756253264879,
      "loss": 0.3624,
      "step": 24350
    },
    {
      "gate_value": 0.35859212279319763,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 24350
    },
    {
      "grad_norm": 259.10968017578125,
      "learning_rate": 0.00010887828928192026,
      "loss": 0.366,
      "step": 24360
    },
    {
      "gate_value": 0.35859882831573486,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 24360
    },
    {
      "grad_norm": 705.4097290039062,
      "learning_rate": 0.00010875904413749324,
      "loss": 0.3629,
      "step": 24370
    },
    {
      "gate_value": 0.35860174894332886,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 24370
    },
    {
      "grad_norm": 906.2277221679688,
      "learning_rate": 0.00010863982718087074,
      "loss": 0.3606,
      "step": 24380
    },
    {
      "gate_value": 0.35860127210617065,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 24380
    },
    {
      "grad_norm": 122.35824584960938,
      "learning_rate": 0.0001085206384935363,
      "loss": 0.3591,
      "step": 24390
    },
    {
      "gate_value": 0.3586183786392212,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 24390
    },
    {
      "grad_norm": 111.16336059570312,
      "learning_rate": 0.00010840147815695433,
      "loss": 0.3618,
      "step": 24400
    },
    {
      "gate_value": 0.3586495816707611,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 24400
    },
    {
      "grad_norm": 131.8732147216797,
      "learning_rate": 0.0001082823462525698,
      "loss": 0.3708,
      "step": 24410
    },
    {
      "gate_value": 0.35866519808769226,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 24410
    },
    {
      "grad_norm": 65.82928466796875,
      "learning_rate": 0.0001081632428618082,
      "loss": 0.3634,
      "step": 24420
    },
    {
      "gate_value": 0.3587072193622589,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 24420
    },
    {
      "grad_norm": 30504.029296875,
      "learning_rate": 0.00010804416806607563,
      "loss": 0.372,
      "step": 24430
    },
    {
      "gate_value": 0.35872122645378113,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 24430
    },
    {
      "grad_norm": 50.18363571166992,
      "learning_rate": 0.00010792512194675855,
      "loss": 0.3558,
      "step": 24440
    },
    {
      "gate_value": 0.35872602462768555,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 24440
    },
    {
      "grad_norm": 92.87120056152344,
      "learning_rate": 0.0001078061045852239,
      "loss": 0.3688,
      "step": 24450
    },
    {
      "gate_value": 0.3587706387042999,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 24450
    },
    {
      "grad_norm": 62.7234992980957,
      "learning_rate": 0.00010768711606281889,
      "loss": 0.3725,
      "step": 24460
    },
    {
      "gate_value": 0.3588024973869324,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 24460
    },
    {
      "grad_norm": 70.35800170898438,
      "learning_rate": 0.00010756815646087111,
      "loss": 0.3723,
      "step": 24470
    },
    {
      "gate_value": 0.358820378780365,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 24470
    },
    {
      "grad_norm": 96.2275161743164,
      "learning_rate": 0.00010744922586068823,
      "loss": 0.369,
      "step": 24480
    },
    {
      "gate_value": 0.3588525950908661,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 24480
    },
    {
      "grad_norm": 71.24588775634766,
      "learning_rate": 0.00010733032434355827,
      "loss": 0.361,
      "step": 24490
    },
    {
      "gate_value": 0.35888391733169556,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 24490
    },
    {
      "grad_norm": 69.96412658691406,
      "learning_rate": 0.00010721145199074923,
      "loss": 0.359,
      "step": 24500
    },
    {
      "gate_value": 0.3589116930961609,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 24500
    },
    {
      "grad_norm": 99.85425567626953,
      "learning_rate": 0.00010709260888350931,
      "loss": 0.3724,
      "step": 24510
    },
    {
      "gate_value": 0.3589341938495636,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 24510
    },
    {
      "grad_norm": 118.61331176757812,
      "learning_rate": 0.0001069737951030666,
      "loss": 0.3793,
      "step": 24520
    },
    {
      "gate_value": 0.35894277691841125,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 24520
    },
    {
      "grad_norm": 524.7838134765625,
      "learning_rate": 0.00010685501073062927,
      "loss": 0.3666,
      "step": 24530
    },
    {
      "gate_value": 0.3589576482772827,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 24530
    },
    {
      "grad_norm": 2431.64501953125,
      "learning_rate": 0.00010673625584738523,
      "loss": 0.3836,
      "step": 24540
    },
    {
      "gate_value": 0.3589611351490021,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 24540
    },
    {
      "grad_norm": 881.9794311523438,
      "learning_rate": 0.00010661753053450237,
      "loss": 0.3519,
      "step": 24550
    },
    {
      "gate_value": 0.3589634299278259,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 24550
    },
    {
      "grad_norm": 132.37796020507812,
      "learning_rate": 0.00010649883487312836,
      "loss": 0.371,
      "step": 24560
    },
    {
      "gate_value": 0.3589780330657959,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 24560
    },
    {
      "grad_norm": 98.35554504394531,
      "learning_rate": 0.00010638016894439051,
      "loss": 0.3563,
      "step": 24570
    },
    {
      "gate_value": 0.35899582505226135,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 24570
    },
    {
      "grad_norm": 107.20555114746094,
      "learning_rate": 0.00010626153282939586,
      "loss": 0.3631,
      "step": 24580
    },
    {
      "gate_value": 0.35901913046836853,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 24580
    },
    {
      "grad_norm": 213.93869018554688,
      "learning_rate": 0.00010614292660923108,
      "loss": 0.3753,
      "step": 24590
    },
    {
      "gate_value": 0.3590436279773712,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 24590
    },
    {
      "grad_norm": 93.29708099365234,
      "learning_rate": 0.00010602435036496243,
      "loss": 0.361,
      "step": 24600
    },
    {
      "gate_value": 0.35905933380126953,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 24600
    },
    {
      "grad_norm": 68.2582015991211,
      "learning_rate": 0.00010590580417763564,
      "loss": 0.3781,
      "step": 24610
    },
    {
      "gate_value": 0.3590523898601532,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 24610
    },
    {
      "grad_norm": 176.111572265625,
      "learning_rate": 0.00010578728812827589,
      "loss": 0.3723,
      "step": 24620
    },
    {
      "gate_value": 0.3590545058250427,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 24620
    },
    {
      "grad_norm": 85.26171875,
      "learning_rate": 0.00010566880229788784,
      "loss": 0.3428,
      "step": 24630
    },
    {
      "gate_value": 0.359076589345932,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 24630
    },
    {
      "grad_norm": 114.50090789794922,
      "learning_rate": 0.00010555034676745537,
      "loss": 0.3815,
      "step": 24640
    },
    {
      "gate_value": 0.35911232233047485,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 24640
    },
    {
      "grad_norm": 80.09989166259766,
      "learning_rate": 0.00010543192161794174,
      "loss": 0.3737,
      "step": 24650
    },
    {
      "gate_value": 0.35913920402526855,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 24650
    },
    {
      "grad_norm": 61.79484176635742,
      "learning_rate": 0.00010531352693028951,
      "loss": 0.349,
      "step": 24660
    },
    {
      "gate_value": 0.35916125774383545,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 24660
    },
    {
      "grad_norm": 139.99099731445312,
      "learning_rate": 0.0001051951627854202,
      "loss": 0.3705,
      "step": 24670
    },
    {
      "gate_value": 0.35917332768440247,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 24670
    },
    {
      "grad_norm": 70.59046936035156,
      "learning_rate": 0.00010507682926423463,
      "loss": 0.3577,
      "step": 24680
    },
    {
      "gate_value": 0.3591974973678589,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 24680
    },
    {
      "grad_norm": 50.57143020629883,
      "learning_rate": 0.00010495852644761268,
      "loss": 0.3644,
      "step": 24690
    },
    {
      "gate_value": 0.35923057794570923,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 24690
    },
    {
      "grad_norm": 81.95945739746094,
      "learning_rate": 0.00010484025441641315,
      "loss": 0.3672,
      "step": 24700
    },
    {
      "gate_value": 0.3592517673969269,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 24700
    },
    {
      "grad_norm": 213.39706420898438,
      "learning_rate": 0.00010472201325147395,
      "loss": 0.3712,
      "step": 24710
    },
    {
      "gate_value": 0.35926130414009094,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 24710
    },
    {
      "grad_norm": 67.57164764404297,
      "learning_rate": 0.0001046038030336117,
      "loss": 0.3726,
      "step": 24720
    },
    {
      "gate_value": 0.3592642843723297,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 24720
    },
    {
      "grad_norm": 68.6855697631836,
      "learning_rate": 0.00010448562384362204,
      "loss": 0.3476,
      "step": 24730
    },
    {
      "gate_value": 0.35928410291671753,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 24730
    },
    {
      "grad_norm": 804.9916381835938,
      "learning_rate": 0.00010436747576227928,
      "loss": 0.3575,
      "step": 24740
    },
    {
      "gate_value": 0.3593326210975647,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 24740
    },
    {
      "grad_norm": 251.33657836914062,
      "learning_rate": 0.0001042493588703366,
      "loss": 0.3619,
      "step": 24750
    },
    {
      "gate_value": 0.35940855741500854,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 24750
    },
    {
      "grad_norm": 360.7458190917969,
      "learning_rate": 0.00010413127324852569,
      "loss": 0.3761,
      "step": 24760
    },
    {
      "gate_value": 0.35946398973464966,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 24760
    },
    {
      "grad_norm": 629.7045288085938,
      "learning_rate": 0.00010401321897755703,
      "loss": 0.357,
      "step": 24770
    },
    {
      "gate_value": 0.3594951033592224,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 24770
    },
    {
      "grad_norm": 667.2528686523438,
      "learning_rate": 0.00010389519613811952,
      "loss": 0.3638,
      "step": 24780
    },
    {
      "gate_value": 0.359512060880661,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 24780
    },
    {
      "grad_norm": 322.2542419433594,
      "learning_rate": 0.00010377720481088076,
      "loss": 0.3624,
      "step": 24790
    },
    {
      "gate_value": 0.3595242500305176,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 24790
    },
    {
      "grad_norm": 302.0667724609375,
      "learning_rate": 0.0001036592450764866,
      "loss": 0.3551,
      "step": 24800
    },
    {
      "gate_value": 0.35954952239990234,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 24800
    },
    {
      "grad_norm": 508.2187805175781,
      "learning_rate": 0.00010354131701556152,
      "loss": 0.3738,
      "step": 24810
    },
    {
      "gate_value": 0.3595583140850067,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 24810
    },
    {
      "grad_norm": 240.36524963378906,
      "learning_rate": 0.00010342342070870813,
      "loss": 0.3611,
      "step": 24820
    },
    {
      "gate_value": 0.3595639169216156,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 24820
    },
    {
      "grad_norm": 594.7509155273438,
      "learning_rate": 0.00010330555623650753,
      "loss": 0.3784,
      "step": 24830
    },
    {
      "gate_value": 0.35957181453704834,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 24830
    },
    {
      "grad_norm": 141.8068084716797,
      "learning_rate": 0.00010318772367951898,
      "loss": 0.3478,
      "step": 24840
    },
    {
      "gate_value": 0.359592080116272,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 24840
    },
    {
      "grad_norm": 89.36084747314453,
      "learning_rate": 0.00010306992311827981,
      "loss": 0.37,
      "step": 24850
    },
    {
      "gate_value": 0.35961490869522095,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 24850
    },
    {
      "grad_norm": 53.824249267578125,
      "learning_rate": 0.00010295215463330568,
      "loss": 0.3767,
      "step": 24860
    },
    {
      "gate_value": 0.35963571071624756,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 24860
    },
    {
      "grad_norm": 55.34401321411133,
      "learning_rate": 0.00010283441830509023,
      "loss": 0.3731,
      "step": 24870
    },
    {
      "gate_value": 0.3596397340297699,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 24870
    },
    {
      "grad_norm": 100.6485366821289,
      "learning_rate": 0.0001027167142141051,
      "loss": 0.379,
      "step": 24880
    },
    {
      "gate_value": 0.35964563488960266,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 24880
    },
    {
      "grad_norm": 31.52804946899414,
      "learning_rate": 0.00010259904244079998,
      "loss": 0.364,
      "step": 24890
    },
    {
      "gate_value": 0.359668493270874,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 24890
    },
    {
      "grad_norm": 260.1759338378906,
      "learning_rate": 0.00010248140306560238,
      "loss": 0.3591,
      "step": 24900
    },
    {
      "gate_value": 0.3596940040588379,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 24900
    },
    {
      "grad_norm": 939.1127319335938,
      "learning_rate": 0.00010236379616891772,
      "loss": 0.3494,
      "step": 24910
    },
    {
      "gate_value": 0.35971933603286743,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 24910
    },
    {
      "grad_norm": 137.58267211914062,
      "learning_rate": 0.00010224622183112916,
      "loss": 0.3571,
      "step": 24920
    },
    {
      "gate_value": 0.3597603440284729,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 24920
    },
    {
      "grad_norm": 361.5756530761719,
      "learning_rate": 0.00010212868013259772,
      "loss": 0.3558,
      "step": 24930
    },
    {
      "gate_value": 0.35976681113243103,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 24930
    },
    {
      "grad_norm": 204.4798126220703,
      "learning_rate": 0.00010201117115366207,
      "loss": 0.3776,
      "step": 24940
    },
    {
      "gate_value": 0.35976898670196533,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 24940
    },
    {
      "grad_norm": 919.7255249023438,
      "learning_rate": 0.00010189369497463835,
      "loss": 0.3559,
      "step": 24950
    },
    {
      "gate_value": 0.3597671389579773,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 24950
    },
    {
      "grad_norm": 72.86383056640625,
      "learning_rate": 0.00010177625167582049,
      "loss": 0.3801,
      "step": 24960
    },
    {
      "gate_value": 0.359756737947464,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 24960
    },
    {
      "grad_norm": 110.57682800292969,
      "learning_rate": 0.00010165884133747992,
      "loss": 0.3441,
      "step": 24970
    },
    {
      "gate_value": 0.3597812354564667,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 24970
    },
    {
      "grad_norm": 171.7178497314453,
      "learning_rate": 0.00010154146403986543,
      "loss": 0.3577,
      "step": 24980
    },
    {
      "gate_value": 0.3598121106624603,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 24980
    },
    {
      "grad_norm": 124.2730941772461,
      "learning_rate": 0.00010142411986320337,
      "loss": 0.3652,
      "step": 24990
    },
    {
      "gate_value": 0.35984185338020325,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 24990
    },
    {
      "grad_norm": 577.5195922851562,
      "learning_rate": 0.00010130680888769732,
      "loss": 0.3576,
      "step": 25000
    },
    {
      "gate_value": 0.3598731458187103,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 25000
    },
    {
      "grad_norm": 1175.1915283203125,
      "learning_rate": 0.00010118953119352826,
      "loss": 0.3549,
      "step": 25010
    },
    {
      "gate_value": 0.3598865568637848,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 25010
    },
    {
      "grad_norm": 1535.7991943359375,
      "learning_rate": 0.00010107228686085436,
      "loss": 0.3691,
      "step": 25020
    },
    {
      "gate_value": 0.35989508032798767,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 25020
    },
    {
      "grad_norm": 64.09869384765625,
      "learning_rate": 0.00010095507596981107,
      "loss": 0.3592,
      "step": 25030
    },
    {
      "gate_value": 0.35991427302360535,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 25030
    },
    {
      "grad_norm": 45.077537536621094,
      "learning_rate": 0.00010083789860051089,
      "loss": 0.3735,
      "step": 25040
    },
    {
      "gate_value": 0.35991591215133667,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 25040
    },
    {
      "grad_norm": 28.698278427124023,
      "learning_rate": 0.0001007207548330434,
      "loss": 0.3663,
      "step": 25050
    },
    {
      "gate_value": 0.35997167229652405,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 25050
    },
    {
      "grad_norm": 24.68749237060547,
      "learning_rate": 0.00010060364474747528,
      "loss": 0.3595,
      "step": 25060
    },
    {
      "gate_value": 0.3600045144557953,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 25060
    },
    {
      "grad_norm": 243.453125,
      "learning_rate": 0.00010048656842385024,
      "loss": 0.3663,
      "step": 25070
    },
    {
      "gate_value": 0.3599931597709656,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 25070
    },
    {
      "grad_norm": 103.076904296875,
      "learning_rate": 0.0001003695259421888,
      "loss": 0.3619,
      "step": 25080
    },
    {
      "gate_value": 0.3600130081176758,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 25080
    },
    {
      "grad_norm": 85.31201934814453,
      "learning_rate": 0.00010025251738248838,
      "loss": 0.3681,
      "step": 25090
    },
    {
      "gate_value": 0.36003732681274414,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 25090
    },
    {
      "grad_norm": 267.9420166015625,
      "learning_rate": 0.00010013554282472323,
      "loss": 0.3666,
      "step": 25100
    },
    {
      "gate_value": 0.3600807189941406,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 25100
    },
    {
      "grad_norm": 149.02096557617188,
      "learning_rate": 0.00010001860234884439,
      "loss": 0.3587,
      "step": 25110
    },
    {
      "gate_value": 0.3601251244544983,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 25110
    },
    {
      "grad_norm": 601.15576171875,
      "learning_rate": 9.990169603477957e-05,
      "loss": 0.3519,
      "step": 25120
    },
    {
      "gate_value": 0.360137939453125,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 25120
    },
    {
      "grad_norm": 28.030532836914062,
      "learning_rate": 9.978482396243307e-05,
      "loss": 0.3592,
      "step": 25130
    },
    {
      "gate_value": 0.3601973354816437,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 25130
    },
    {
      "grad_norm": 64.57613372802734,
      "learning_rate": 9.96679862116859e-05,
      "loss": 0.3585,
      "step": 25140
    },
    {
      "gate_value": 0.3602369725704193,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 25140
    },
    {
      "grad_norm": 77.8062515258789,
      "learning_rate": 9.955118286239554e-05,
      "loss": 0.3532,
      "step": 25150
    },
    {
      "gate_value": 0.36026737093925476,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 25150
    },
    {
      "grad_norm": 41.49631118774414,
      "learning_rate": 9.943441399439599e-05,
      "loss": 0.3543,
      "step": 25160
    },
    {
      "gate_value": 0.3602963984012604,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 25160
    },
    {
      "grad_norm": 114.31262969970703,
      "learning_rate": 9.931767968749768e-05,
      "loss": 0.3705,
      "step": 25170
    },
    {
      "gate_value": 0.360359251499176,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 25170
    },
    {
      "grad_norm": 54.857337951660156,
      "learning_rate": 9.920098002148738e-05,
      "loss": 0.3624,
      "step": 25180
    },
    {
      "gate_value": 0.3604171574115753,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 25180
    },
    {
      "grad_norm": 5679.38330078125,
      "learning_rate": 9.908431507612825e-05,
      "loss": 0.3712,
      "step": 25190
    },
    {
      "gate_value": 0.36049506068229675,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 25190
    },
    {
      "grad_norm": 30.065919876098633,
      "learning_rate": 9.896768493115966e-05,
      "loss": 0.3559,
      "step": 25200
    },
    {
      "gate_value": 0.360569566488266,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 25200
    },
    {
      "grad_norm": 34.60041809082031,
      "learning_rate": 9.885108966629721e-05,
      "loss": 0.3562,
      "step": 25210
    },
    {
      "gate_value": 0.3606504797935486,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 25210
    },
    {
      "grad_norm": 291.844482421875,
      "learning_rate": 9.873452936123271e-05,
      "loss": 0.3677,
      "step": 25220
    },
    {
      "gate_value": 0.3607202172279358,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 25220
    },
    {
      "grad_norm": 54.88591384887695,
      "learning_rate": 9.861800409563392e-05,
      "loss": 0.3562,
      "step": 25230
    },
    {
      "gate_value": 0.3607614040374756,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 25230
    },
    {
      "grad_norm": 74439.96875,
      "learning_rate": 9.850151394914485e-05,
      "loss": 0.3719,
      "step": 25240
    },
    {
      "gate_value": 0.36080268025398254,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 25240
    },
    {
      "grad_norm": 111.2549057006836,
      "learning_rate": 9.838505900138539e-05,
      "loss": 0.3777,
      "step": 25250
    },
    {
      "gate_value": 0.36084768176078796,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 25250
    },
    {
      "grad_norm": 723.0596313476562,
      "learning_rate": 9.82686393319514e-05,
      "loss": 0.367,
      "step": 25260
    },
    {
      "gate_value": 0.3609139323234558,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 25260
    },
    {
      "grad_norm": 111.521240234375,
      "learning_rate": 9.815225502041463e-05,
      "loss": 0.3632,
      "step": 25270
    },
    {
      "gate_value": 0.36096981167793274,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 25270
    },
    {
      "grad_norm": 3092.884521484375,
      "learning_rate": 9.803590614632267e-05,
      "loss": 0.3654,
      "step": 25280
    },
    {
      "gate_value": 0.36098742485046387,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 25280
    },
    {
      "grad_norm": 232.3154296875,
      "learning_rate": 9.791959278919887e-05,
      "loss": 0.35,
      "step": 25290
    },
    {
      "gate_value": 0.36098426580429077,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 25290
    },
    {
      "grad_norm": 31.037199020385742,
      "learning_rate": 9.780331502854229e-05,
      "loss": 0.3785,
      "step": 25300
    },
    {
      "gate_value": 0.3610406219959259,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 25300
    },
    {
      "grad_norm": 66.40868377685547,
      "learning_rate": 9.768707294382775e-05,
      "loss": 0.3707,
      "step": 25310
    },
    {
      "gate_value": 0.36106961965560913,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 25310
    },
    {
      "grad_norm": 71.33839416503906,
      "learning_rate": 9.757086661450556e-05,
      "loss": 0.3671,
      "step": 25320
    },
    {
      "gate_value": 0.3610745370388031,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 25320
    },
    {
      "grad_norm": 111.21866607666016,
      "learning_rate": 9.745469612000161e-05,
      "loss": 0.3707,
      "step": 25330
    },
    {
      "gate_value": 0.36109820008277893,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 25330
    },
    {
      "grad_norm": 118.41226959228516,
      "learning_rate": 9.73385615397174e-05,
      "loss": 0.3745,
      "step": 25340
    },
    {
      "gate_value": 0.3611301779747009,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 25340
    },
    {
      "grad_norm": 80.08364868164062,
      "learning_rate": 9.722246295302983e-05,
      "loss": 0.3817,
      "step": 25350
    },
    {
      "gate_value": 0.361128568649292,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 25350
    },
    {
      "grad_norm": 28.752838134765625,
      "learning_rate": 9.710640043929116e-05,
      "loss": 0.3787,
      "step": 25360
    },
    {
      "gate_value": 0.36116471886634827,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 25360
    },
    {
      "grad_norm": 29.80974769592285,
      "learning_rate": 9.699037407782905e-05,
      "loss": 0.3691,
      "step": 25370
    },
    {
      "gate_value": 0.3612363636493683,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 25370
    },
    {
      "grad_norm": 23.709779739379883,
      "learning_rate": 9.687438394794637e-05,
      "loss": 0.353,
      "step": 25380
    },
    {
      "gate_value": 0.36146095395088196,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 25380
    },
    {
      "grad_norm": 22.217329025268555,
      "learning_rate": 9.675843012892136e-05,
      "loss": 0.3595,
      "step": 25390
    },
    {
      "gate_value": 0.36164140701293945,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 25390
    },
    {
      "grad_norm": 16.554452896118164,
      "learning_rate": 9.664251270000735e-05,
      "loss": 0.3723,
      "step": 25400
    },
    {
      "gate_value": 0.3616686165332794,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 25400
    },
    {
      "grad_norm": 39.989200592041016,
      "learning_rate": 9.652663174043273e-05,
      "loss": 0.3671,
      "step": 25410
    },
    {
      "gate_value": 0.36170732975006104,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 25410
    },
    {
      "grad_norm": 29.332035064697266,
      "learning_rate": 9.64107873294011e-05,
      "loss": 0.359,
      "step": 25420
    },
    {
      "gate_value": 0.36175501346588135,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 25420
    },
    {
      "grad_norm": 19.313488006591797,
      "learning_rate": 9.629497954609098e-05,
      "loss": 0.3781,
      "step": 25430
    },
    {
      "gate_value": 0.36181315779685974,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 25430
    },
    {
      "grad_norm": 26.557374954223633,
      "learning_rate": 9.617920846965595e-05,
      "loss": 0.3523,
      "step": 25440
    },
    {
      "gate_value": 0.3619093596935272,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 25440
    },
    {
      "grad_norm": 29.910465240478516,
      "learning_rate": 9.606347417922444e-05,
      "loss": 0.3549,
      "step": 25450
    },
    {
      "gate_value": 0.36200904846191406,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 25450
    },
    {
      "grad_norm": 37.020931243896484,
      "learning_rate": 9.594777675389973e-05,
      "loss": 0.3795,
      "step": 25460
    },
    {
      "gate_value": 0.3620182275772095,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 25460
    },
    {
      "grad_norm": 24.904870986938477,
      "learning_rate": 9.583211627275995e-05,
      "loss": 0.3793,
      "step": 25470
    },
    {
      "gate_value": 0.36203330755233765,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 25470
    },
    {
      "grad_norm": 22.63786506652832,
      "learning_rate": 9.571649281485788e-05,
      "loss": 0.3602,
      "step": 25480
    },
    {
      "gate_value": 0.362061470746994,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 25480
    },
    {
      "grad_norm": 430.0863037109375,
      "learning_rate": 9.560090645922115e-05,
      "loss": 0.3583,
      "step": 25490
    },
    {
      "gate_value": 0.36207452416419983,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 25490
    },
    {
      "grad_norm": 38.97439193725586,
      "learning_rate": 9.548535728485194e-05,
      "loss": 0.3654,
      "step": 25500
    },
    {
      "gate_value": 0.36213162541389465,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 25500
    },
    {
      "grad_norm": 28.70587730407715,
      "learning_rate": 9.536984537072693e-05,
      "loss": 0.3579,
      "step": 25510
    },
    {
      "gate_value": 0.3622555732727051,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 25510
    },
    {
      "grad_norm": 47.876365661621094,
      "learning_rate": 9.525437079579749e-05,
      "loss": 0.3678,
      "step": 25520
    },
    {
      "gate_value": 0.36234137415885925,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 25520
    },
    {
      "grad_norm": 51.66835403442383,
      "learning_rate": 9.513893363898934e-05,
      "loss": 0.3694,
      "step": 25530
    },
    {
      "gate_value": 0.36236605048179626,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 25530
    },
    {
      "grad_norm": 38.49001693725586,
      "learning_rate": 9.502353397920278e-05,
      "loss": 0.367,
      "step": 25540
    },
    {
      "gate_value": 0.36243191361427307,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 25540
    },
    {
      "grad_norm": 26.23487663269043,
      "learning_rate": 9.490817189531236e-05,
      "loss": 0.3601,
      "step": 25550
    },
    {
      "gate_value": 0.3624683618545532,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 25550
    },
    {
      "grad_norm": 74.08128356933594,
      "learning_rate": 9.479284746616693e-05,
      "loss": 0.3824,
      "step": 25560
    },
    {
      "gate_value": 0.3625290095806122,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 25560
    },
    {
      "grad_norm": 134.24107360839844,
      "learning_rate": 9.467756077058973e-05,
      "loss": 0.3648,
      "step": 25570
    },
    {
      "gate_value": 0.3626077473163605,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 25570
    },
    {
      "grad_norm": 49.08687210083008,
      "learning_rate": 9.456231188737805e-05,
      "loss": 0.3611,
      "step": 25580
    },
    {
      "gate_value": 0.36267179250717163,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 25580
    },
    {
      "grad_norm": 76.60491180419922,
      "learning_rate": 9.444710089530349e-05,
      "loss": 0.3755,
      "step": 25590
    },
    {
      "gate_value": 0.36272481083869934,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 25590
    },
    {
      "grad_norm": 38.12582778930664,
      "learning_rate": 9.433192787311161e-05,
      "loss": 0.3705,
      "step": 25600
    },
    {
      "gate_value": 0.36277905106544495,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 25600
    },
    {
      "grad_norm": 74.66680908203125,
      "learning_rate": 9.42167928995221e-05,
      "loss": 0.3744,
      "step": 25610
    },
    {
      "gate_value": 0.36285629868507385,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 25610
    },
    {
      "grad_norm": 92.35078430175781,
      "learning_rate": 9.410169605322864e-05,
      "loss": 0.3821,
      "step": 25620
    },
    {
      "gate_value": 0.36290407180786133,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 25620
    },
    {
      "grad_norm": 352.0303649902344,
      "learning_rate": 9.398663741289883e-05,
      "loss": 0.3659,
      "step": 25630
    },
    {
      "gate_value": 0.36290207505226135,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 25630
    },
    {
      "grad_norm": 266.07586669921875,
      "learning_rate": 9.387161705717418e-05,
      "loss": 0.3652,
      "step": 25640
    },
    {
      "gate_value": 0.36293622851371765,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 25640
    },
    {
      "grad_norm": 235221.78125,
      "learning_rate": 9.375663506467004e-05,
      "loss": 0.353,
      "step": 25650
    },
    {
      "gate_value": 0.36297622323036194,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 25650
    },
    {
      "grad_norm": 734.5687255859375,
      "learning_rate": 9.364169151397546e-05,
      "loss": 0.3647,
      "step": 25660
    },
    {
      "gate_value": 0.3630070984363556,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 25660
    },
    {
      "grad_norm": 72.2925033569336,
      "learning_rate": 9.352678648365332e-05,
      "loss": 0.3741,
      "step": 25670
    },
    {
      "gate_value": 0.3630395531654358,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 25670
    },
    {
      "grad_norm": 1104.2977294921875,
      "learning_rate": 9.341192005224013e-05,
      "loss": 0.356,
      "step": 25680
    },
    {
      "gate_value": 0.36307305097579956,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 25680
    },
    {
      "grad_norm": 68.21739196777344,
      "learning_rate": 9.329709229824595e-05,
      "loss": 0.3667,
      "step": 25690
    },
    {
      "gate_value": 0.3630922734737396,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 25690
    },
    {
      "grad_norm": 168.8650665283203,
      "learning_rate": 9.318230330015453e-05,
      "loss": 0.3719,
      "step": 25700
    },
    {
      "gate_value": 0.36309897899627686,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 25700
    },
    {
      "grad_norm": 81.41837310791016,
      "learning_rate": 9.306755313642301e-05,
      "loss": 0.353,
      "step": 25710
    },
    {
      "gate_value": 0.3631479740142822,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 25710
    },
    {
      "grad_norm": 167.31732177734375,
      "learning_rate": 9.295284188548212e-05,
      "loss": 0.3636,
      "step": 25720
    },
    {
      "gate_value": 0.3632040023803711,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 25720
    },
    {
      "grad_norm": 62.20192337036133,
      "learning_rate": 9.283816962573586e-05,
      "loss": 0.3711,
      "step": 25730
    },
    {
      "gate_value": 0.36324024200439453,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 25730
    },
    {
      "grad_norm": 167.32057189941406,
      "learning_rate": 9.272353643556163e-05,
      "loss": 0.3564,
      "step": 25740
    },
    {
      "gate_value": 0.3632681965827942,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 25740
    },
    {
      "grad_norm": 121.6691665649414,
      "learning_rate": 9.260894239331023e-05,
      "loss": 0.3642,
      "step": 25750
    },
    {
      "gate_value": 0.36329635977745056,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 25750
    },
    {
      "grad_norm": 71.03226470947266,
      "learning_rate": 9.249438757730547e-05,
      "loss": 0.3722,
      "step": 25760
    },
    {
      "gate_value": 0.36331915855407715,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 25760
    },
    {
      "grad_norm": 73.70002746582031,
      "learning_rate": 9.237987206584462e-05,
      "loss": 0.3526,
      "step": 25770
    },
    {
      "gate_value": 0.363363116979599,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 25770
    },
    {
      "grad_norm": 119.9651870727539,
      "learning_rate": 9.226539593719789e-05,
      "loss": 0.3611,
      "step": 25780
    },
    {
      "gate_value": 0.36337050795555115,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 25780
    },
    {
      "grad_norm": 136.77578735351562,
      "learning_rate": 9.215095926960856e-05,
      "loss": 0.3763,
      "step": 25790
    },
    {
      "gate_value": 0.3633963465690613,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 25790
    },
    {
      "grad_norm": 77.38294982910156,
      "learning_rate": 9.203656214129313e-05,
      "loss": 0.3457,
      "step": 25800
    },
    {
      "gate_value": 0.36343178153038025,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 25800
    },
    {
      "grad_norm": 70.8493423461914,
      "learning_rate": 9.192220463044089e-05,
      "loss": 0.3741,
      "step": 25810
    },
    {
      "gate_value": 0.3634765148162842,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 25810
    },
    {
      "grad_norm": 103.03206634521484,
      "learning_rate": 9.180788681521418e-05,
      "loss": 0.3752,
      "step": 25820
    },
    {
      "gate_value": 0.3635123074054718,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 25820
    },
    {
      "grad_norm": 109.7791519165039,
      "learning_rate": 9.169360877374808e-05,
      "loss": 0.361,
      "step": 25830
    },
    {
      "gate_value": 0.3635486662387848,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 25830
    },
    {
      "grad_norm": 38.19389724731445,
      "learning_rate": 9.157937058415058e-05,
      "loss": 0.3738,
      "step": 25840
    },
    {
      "gate_value": 0.3635652959346771,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 25840
    },
    {
      "grad_norm": 112.97542572021484,
      "learning_rate": 9.146517232450244e-05,
      "loss": 0.3551,
      "step": 25850
    },
    {
      "gate_value": 0.3635725975036621,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 25850
    },
    {
      "grad_norm": 108.82026672363281,
      "learning_rate": 9.135101407285704e-05,
      "loss": 0.3769,
      "step": 25860
    },
    {
      "gate_value": 0.3635868430137634,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 25860
    },
    {
      "grad_norm": 1242.8558349609375,
      "learning_rate": 9.123689590724056e-05,
      "loss": 0.381,
      "step": 25870
    },
    {
      "gate_value": 0.36360442638397217,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 25870
    },
    {
      "grad_norm": 216.93637084960938,
      "learning_rate": 9.112281790565159e-05,
      "loss": 0.3533,
      "step": 25880
    },
    {
      "gate_value": 0.3636187016963959,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 25880
    },
    {
      "grad_norm": 205.6892547607422,
      "learning_rate": 9.100878014606137e-05,
      "loss": 0.3564,
      "step": 25890
    },
    {
      "gate_value": 0.36363720893859863,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 25890
    },
    {
      "grad_norm": 3794.68994140625,
      "learning_rate": 9.089478270641368e-05,
      "loss": 0.3688,
      "step": 25900
    },
    {
      "gate_value": 0.36364829540252686,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 25900
    },
    {
      "grad_norm": 2179.69482421875,
      "learning_rate": 9.078082566462469e-05,
      "loss": 0.3596,
      "step": 25910
    },
    {
      "gate_value": 0.3636602759361267,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 25910
    },
    {
      "grad_norm": 2942.58740234375,
      "learning_rate": 9.066690909858296e-05,
      "loss": 0.3491,
      "step": 25920
    },
    {
      "gate_value": 0.3636675775051117,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 25920
    },
    {
      "grad_norm": 31502.810546875,
      "learning_rate": 9.055303308614935e-05,
      "loss": 0.3668,
      "step": 25930
    },
    {
      "gate_value": 0.36367008090019226,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 25930
    },
    {
      "grad_norm": 12904.177734375,
      "learning_rate": 9.04391977051571e-05,
      "loss": 0.3676,
      "step": 25940
    },
    {
      "gate_value": 0.36367350816726685,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 25940
    },
    {
      "grad_norm": 1463.4390869140625,
      "learning_rate": 9.032540303341158e-05,
      "loss": 0.3664,
      "step": 25950
    },
    {
      "gate_value": 0.36367329955101013,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 25950
    },
    {
      "grad_norm": 100.67179870605469,
      "learning_rate": 9.021164914869046e-05,
      "loss": 0.3667,
      "step": 25960
    },
    {
      "gate_value": 0.36367326974868774,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 25960
    },
    {
      "grad_norm": 25260.05078125,
      "learning_rate": 9.00979361287433e-05,
      "loss": 0.3707,
      "step": 25970
    },
    {
      "gate_value": 0.36368072032928467,
      "icl_sequence_length": 96,
      "num_contexts": 3,
      "step": 25970
    },
    {
      "grad_norm": 126501.125,
      "learning_rate": 8.998426405129198e-05,
      "loss": 0.3662,
      "step": 25980
    },
    {
      "gate_value": 0.3636826276779175,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 25980
    },
    {
      "grad_norm": 3729.87744140625,
      "learning_rate": 8.987063299403024e-05,
      "loss": 0.36,
      "step": 25990
    },
    {
      "gate_value": 0.36368346214294434,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 25990
    },
    {
      "grad_norm": 408.446533203125,
      "learning_rate": 8.97570430346239e-05,
      "loss": 0.3716,
      "step": 26000
    },
    {
      "gate_value": 0.36368486285209656,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 26000
    },
    {
      "grad_norm": 819.3681640625,
      "learning_rate": 8.964349425071056e-05,
      "loss": 0.3739,
      "step": 26010
    },
    {
      "gate_value": 0.36369675397872925,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 26010
    },
    {
      "grad_norm": 33350.34375,
      "learning_rate": 8.952998671989977e-05,
      "loss": 0.3791,
      "step": 26020
    },
    {
      "gate_value": 0.3637213706970215,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 26020
    },
    {
      "grad_norm": 1744.698486328125,
      "learning_rate": 8.941652051977286e-05,
      "loss": 0.3665,
      "step": 26030
    },
    {
      "gate_value": 0.36372846364974976,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 26030
    },
    {
      "grad_norm": 151.7212371826172,
      "learning_rate": 8.930309572788289e-05,
      "loss": 0.3709,
      "step": 26040
    },
    {
      "gate_value": 0.36373579502105713,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 26040
    },
    {
      "grad_norm": 119.85762023925781,
      "learning_rate": 8.918971242175473e-05,
      "loss": 0.3653,
      "step": 26050
    },
    {
      "gate_value": 0.36377817392349243,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 26050
    },
    {
      "grad_norm": 741.9052734375,
      "learning_rate": 8.907637067888468e-05,
      "loss": 0.3534,
      "step": 26060
    },
    {
      "gate_value": 0.363788366317749,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 26060
    },
    {
      "grad_norm": 1784.32470703125,
      "learning_rate": 8.896307057674078e-05,
      "loss": 0.3607,
      "step": 26070
    },
    {
      "gate_value": 0.363790899515152,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 26070
    },
    {
      "grad_norm": 98.61390686035156,
      "learning_rate": 8.88498121927626e-05,
      "loss": 0.3609,
      "step": 26080
    },
    {
      "gate_value": 0.36381369829177856,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 26080
    },
    {
      "grad_norm": 67.09032440185547,
      "learning_rate": 8.873659560436119e-05,
      "loss": 0.3603,
      "step": 26090
    },
    {
      "gate_value": 0.3638378381729126,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 26090
    },
    {
      "grad_norm": 454.86260986328125,
      "learning_rate": 8.8623420888919e-05,
      "loss": 0.3503,
      "step": 26100
    },
    {
      "gate_value": 0.36386924982070923,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 26100
    },
    {
      "grad_norm": 12161.5546875,
      "learning_rate": 8.851028812378986e-05,
      "loss": 0.3569,
      "step": 26110
    },
    {
      "gate_value": 0.36391758918762207,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 26110
    },
    {
      "grad_norm": 117.7813949584961,
      "learning_rate": 8.8397197386299e-05,
      "loss": 0.3522,
      "step": 26120
    },
    {
      "gate_value": 0.36397236585617065,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 26120
    },
    {
      "grad_norm": 106.986083984375,
      "learning_rate": 8.828414875374281e-05,
      "loss": 0.3651,
      "step": 26130
    },
    {
      "gate_value": 0.3640117347240448,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 26130
    },
    {
      "grad_norm": 74.00639343261719,
      "learning_rate": 8.817114230338902e-05,
      "loss": 0.3615,
      "step": 26140
    },
    {
      "gate_value": 0.3640309274196625,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 26140
    },
    {
      "grad_norm": 2831.8740234375,
      "learning_rate": 8.805817811247651e-05,
      "loss": 0.3649,
      "step": 26150
    },
    {
      "gate_value": 0.36402857303619385,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 26150
    },
    {
      "grad_norm": 287.6517028808594,
      "learning_rate": 8.794525625821514e-05,
      "loss": 0.3715,
      "step": 26160
    },
    {
      "gate_value": 0.36404845118522644,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 26160
    },
    {
      "grad_norm": 98.52863311767578,
      "learning_rate": 8.783237681778597e-05,
      "loss": 0.3434,
      "step": 26170
    },
    {
      "gate_value": 0.3640698492527008,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 26170
    },
    {
      "grad_norm": 109.10649108886719,
      "learning_rate": 8.771953986834106e-05,
      "loss": 0.37,
      "step": 26180
    },
    {
      "gate_value": 0.3640887439250946,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 26180
    },
    {
      "grad_norm": 763.0786743164062,
      "learning_rate": 8.760674548700336e-05,
      "loss": 0.3443,
      "step": 26190
    },
    {
      "gate_value": 0.36416587233543396,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 26190
    },
    {
      "grad_norm": 243.73666381835938,
      "learning_rate": 8.74939937508668e-05,
      "loss": 0.3742,
      "step": 26200
    },
    {
      "gate_value": 0.3642013967037201,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 26200
    },
    {
      "grad_norm": 77.64506530761719,
      "learning_rate": 8.738128473699609e-05,
      "loss": 0.3718,
      "step": 26210
    },
    {
      "gate_value": 0.364237517118454,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 26210
    },
    {
      "grad_norm": 80.43402099609375,
      "learning_rate": 8.72686185224268e-05,
      "loss": 0.3706,
      "step": 26220
    },
    {
      "gate_value": 0.3642655909061432,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 26220
    },
    {
      "grad_norm": 717.6832275390625,
      "learning_rate": 8.715599518416523e-05,
      "loss": 0.3657,
      "step": 26230
    },
    {
      "gate_value": 0.3643067181110382,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 26230
    },
    {
      "grad_norm": 155.02511596679688,
      "learning_rate": 8.704341479918843e-05,
      "loss": 0.3736,
      "step": 26240
    },
    {
      "gate_value": 0.3643386662006378,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 26240
    },
    {
      "grad_norm": 333.3760681152344,
      "learning_rate": 8.693087744444398e-05,
      "loss": 0.3678,
      "step": 26250
    },
    {
      "gate_value": 0.3643483519554138,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 26250
    },
    {
      "grad_norm": 112.97294616699219,
      "learning_rate": 8.681838319685e-05,
      "loss": 0.3718,
      "step": 26260
    },
    {
      "gate_value": 0.36436960101127625,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 26260
    },
    {
      "grad_norm": 129.95945739746094,
      "learning_rate": 8.670593213329537e-05,
      "loss": 0.3698,
      "step": 26270
    },
    {
      "gate_value": 0.3643975853919983,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 26270
    },
    {
      "grad_norm": 58.365657806396484,
      "learning_rate": 8.659352433063928e-05,
      "loss": 0.3561,
      "step": 26280
    },
    {
      "gate_value": 0.3644281029701233,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 26280
    },
    {
      "grad_norm": 114.4413833618164,
      "learning_rate": 8.64811598657115e-05,
      "loss": 0.3587,
      "step": 26290
    },
    {
      "gate_value": 0.3644927740097046,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 26290
    },
    {
      "grad_norm": 58.796695709228516,
      "learning_rate": 8.636883881531194e-05,
      "loss": 0.3701,
      "step": 26300
    },
    {
      "gate_value": 0.3645520508289337,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 26300
    },
    {
      "grad_norm": 121.88422393798828,
      "learning_rate": 8.625656125621103e-05,
      "loss": 0.3662,
      "step": 26310
    },
    {
      "gate_value": 0.36459311842918396,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 26310
    },
    {
      "grad_norm": 76.7275619506836,
      "learning_rate": 8.614432726514944e-05,
      "loss": 0.3492,
      "step": 26320
    },
    {
      "gate_value": 0.3646358847618103,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 26320
    },
    {
      "grad_norm": 76.93498229980469,
      "learning_rate": 8.60321369188381e-05,
      "loss": 0.3613,
      "step": 26330
    },
    {
      "gate_value": 0.3646531105041504,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 26330
    },
    {
      "grad_norm": 42.61845779418945,
      "learning_rate": 8.591999029395795e-05,
      "loss": 0.3684,
      "step": 26340
    },
    {
      "gate_value": 0.3646797239780426,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 26340
    },
    {
      "grad_norm": 109.5250015258789,
      "learning_rate": 8.580788746716024e-05,
      "loss": 0.3689,
      "step": 26350
    },
    {
      "gate_value": 0.3647087514400482,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 26350
    },
    {
      "grad_norm": 99.25482940673828,
      "learning_rate": 8.56958285150661e-05,
      "loss": 0.3449,
      "step": 26360
    },
    {
      "gate_value": 0.36472317576408386,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 26360
    },
    {
      "grad_norm": 54.96424102783203,
      "learning_rate": 8.558381351426681e-05,
      "loss": 0.3416,
      "step": 26370
    },
    {
      "gate_value": 0.3647831678390503,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 26370
    },
    {
      "grad_norm": 72.82083129882812,
      "learning_rate": 8.547184254132358e-05,
      "loss": 0.3728,
      "step": 26380
    },
    {
      "gate_value": 0.36487022042274475,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 26380
    },
    {
      "grad_norm": 103.92755889892578,
      "learning_rate": 8.535991567276758e-05,
      "loss": 0.3583,
      "step": 26390
    },
    {
      "gate_value": 0.3649507462978363,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 26390
    },
    {
      "grad_norm": 129.11090087890625,
      "learning_rate": 8.524803298509963e-05,
      "loss": 0.371,
      "step": 26400
    },
    {
      "gate_value": 0.364972323179245,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 26400
    },
    {
      "grad_norm": 61.047943115234375,
      "learning_rate": 8.513619455479056e-05,
      "loss": 0.3592,
      "step": 26410
    },
    {
      "gate_value": 0.3650073707103729,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 26410
    },
    {
      "grad_norm": 808.7735595703125,
      "learning_rate": 8.502440045828087e-05,
      "loss": 0.3536,
      "step": 26420
    },
    {
      "gate_value": 0.36508437991142273,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 26420
    },
    {
      "grad_norm": 65.29513549804688,
      "learning_rate": 8.491265077198085e-05,
      "loss": 0.3723,
      "step": 26430
    },
    {
      "gate_value": 0.3651318848133087,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 26430
    },
    {
      "grad_norm": 45.95816421508789,
      "learning_rate": 8.480094557227022e-05,
      "loss": 0.3473,
      "step": 26440
    },
    {
      "gate_value": 0.3651773929595947,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 26440
    },
    {
      "grad_norm": 62.62614822387695,
      "learning_rate": 8.468928493549858e-05,
      "loss": 0.3681,
      "step": 26450
    },
    {
      "gate_value": 0.3652338981628418,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 26450
    },
    {
      "grad_norm": 57.45429229736328,
      "learning_rate": 8.457766893798478e-05,
      "loss": 0.3672,
      "step": 26460
    },
    {
      "gate_value": 0.3652498722076416,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 26460
    },
    {
      "grad_norm": 89.05564880371094,
      "learning_rate": 8.446609765601736e-05,
      "loss": 0.3666,
      "step": 26470
    },
    {
      "gate_value": 0.3652762174606323,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 26470
    },
    {
      "grad_norm": 177.4752655029297,
      "learning_rate": 8.435457116585426e-05,
      "loss": 0.3681,
      "step": 26480
    },
    {
      "gate_value": 0.365335613489151,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 26480
    },
    {
      "grad_norm": 618.664306640625,
      "learning_rate": 8.424308954372282e-05,
      "loss": 0.3676,
      "step": 26490
    },
    {
      "gate_value": 0.3653985857963562,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 26490
    },
    {
      "grad_norm": 649.4828491210938,
      "learning_rate": 8.413165286581956e-05,
      "loss": 0.3701,
      "step": 26500
    },
    {
      "gate_value": 0.3654293417930603,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 26500
    },
    {
      "grad_norm": 119.72760772705078,
      "learning_rate": 8.402026120831047e-05,
      "loss": 0.3764,
      "step": 26510
    },
    {
      "gate_value": 0.36545631289482117,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 26510
    },
    {
      "grad_norm": 58.20319747924805,
      "learning_rate": 8.390891464733074e-05,
      "loss": 0.3595,
      "step": 26520
    },
    {
      "gate_value": 0.365516722202301,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 26520
    },
    {
      "grad_norm": 68.46391296386719,
      "learning_rate": 8.37976132589846e-05,
      "loss": 0.3445,
      "step": 26530
    },
    {
      "gate_value": 0.36553603410720825,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 26530
    },
    {
      "grad_norm": 48.550350189208984,
      "learning_rate": 8.368635711934554e-05,
      "loss": 0.3589,
      "step": 26540
    },
    {
      "gate_value": 0.3655546009540558,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 26540
    },
    {
      "grad_norm": 99.2035140991211,
      "learning_rate": 8.357514630445617e-05,
      "loss": 0.342,
      "step": 26550
    },
    {
      "gate_value": 0.36558255553245544,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 26550
    },
    {
      "grad_norm": 108.59980773925781,
      "learning_rate": 8.346398089032788e-05,
      "loss": 0.3757,
      "step": 26560
    },
    {
      "gate_value": 0.36561352014541626,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 26560
    },
    {
      "grad_norm": 74.10098266601562,
      "learning_rate": 8.335286095294122e-05,
      "loss": 0.3563,
      "step": 26570
    },
    {
      "gate_value": 0.3656443953514099,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 26570
    },
    {
      "grad_norm": 107.44097137451172,
      "learning_rate": 8.324178656824569e-05,
      "loss": 0.3707,
      "step": 26580
    },
    {
      "gate_value": 0.3657311499118805,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 26580
    },
    {
      "grad_norm": 43.990760803222656,
      "learning_rate": 8.313075781215961e-05,
      "loss": 0.3525,
      "step": 26590
    },
    {
      "gate_value": 0.36581742763519287,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 26590
    },
    {
      "grad_norm": 93.85136413574219,
      "learning_rate": 8.301977476056995e-05,
      "loss": 0.3456,
      "step": 26600
    },
    {
      "gate_value": 0.36589711904525757,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 26600
    },
    {
      "grad_norm": 79.03585052490234,
      "learning_rate": 8.290883748933273e-05,
      "loss": 0.3726,
      "step": 26610
    },
    {
      "gate_value": 0.3659816086292267,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 26610
    },
    {
      "grad_norm": 112.67575073242188,
      "learning_rate": 8.279794607427236e-05,
      "loss": 0.3512,
      "step": 26620
    },
    {
      "gate_value": 0.3660266697406769,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 26620
    },
    {
      "grad_norm": 66.10786437988281,
      "learning_rate": 8.268710059118221e-05,
      "loss": 0.3603,
      "step": 26630
    },
    {
      "gate_value": 0.3660682737827301,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 26630
    },
    {
      "grad_norm": 112.40705108642578,
      "learning_rate": 8.257630111582408e-05,
      "loss": 0.3679,
      "step": 26640
    },
    {
      "gate_value": 0.36610719561576843,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 26640
    },
    {
      "grad_norm": 102.23544311523438,
      "learning_rate": 8.246554772392842e-05,
      "loss": 0.3632,
      "step": 26650
    },
    {
      "gate_value": 0.36619168519973755,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 26650
    },
    {
      "grad_norm": 90.14778900146484,
      "learning_rate": 8.235484049119402e-05,
      "loss": 0.3548,
      "step": 26660
    },
    {
      "gate_value": 0.3662635087966919,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 26660
    },
    {
      "grad_norm": 88.61628723144531,
      "learning_rate": 8.224417949328828e-05,
      "loss": 0.354,
      "step": 26670
    },
    {
      "gate_value": 0.36625897884368896,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 26670
    },
    {
      "grad_norm": 97.69658660888672,
      "learning_rate": 8.213356480584696e-05,
      "loss": 0.3518,
      "step": 26680
    },
    {
      "gate_value": 0.3662429451942444,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 26680
    },
    {
      "grad_norm": 215.29074096679688,
      "learning_rate": 8.202299650447422e-05,
      "loss": 0.3617,
      "step": 26690
    },
    {
      "gate_value": 0.3662889301776886,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 26690
    },
    {
      "grad_norm": 3471.608642578125,
      "learning_rate": 8.191247466474232e-05,
      "loss": 0.3551,
      "step": 26700
    },
    {
      "gate_value": 0.3663380444049835,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 26700
    },
    {
      "grad_norm": 131.4551544189453,
      "learning_rate": 8.180199936219201e-05,
      "loss": 0.3416,
      "step": 26710
    },
    {
      "gate_value": 0.36638250946998596,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 26710
    },
    {
      "grad_norm": 116.79961395263672,
      "learning_rate": 8.169157067233204e-05,
      "loss": 0.3812,
      "step": 26720
    },
    {
      "gate_value": 0.36638423800468445,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 26720
    },
    {
      "grad_norm": 115.24409484863281,
      "learning_rate": 8.158118867063939e-05,
      "loss": 0.3785,
      "step": 26730
    },
    {
      "gate_value": 0.36644190549850464,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 26730
    },
    {
      "grad_norm": 103.70458221435547,
      "learning_rate": 8.14708534325591e-05,
      "loss": 0.3611,
      "step": 26740
    },
    {
      "gate_value": 0.36655059456825256,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 26740
    },
    {
      "grad_norm": 101.19654083251953,
      "learning_rate": 8.136056503350441e-05,
      "loss": 0.3733,
      "step": 26750
    },
    {
      "gate_value": 0.36665982007980347,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 26750
    },
    {
      "grad_norm": 134.09664916992188,
      "learning_rate": 8.12503235488562e-05,
      "loss": 0.3731,
      "step": 26760
    },
    {
      "gate_value": 0.3667478561401367,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 26760
    },
    {
      "grad_norm": 113.2645492553711,
      "learning_rate": 8.114012905396356e-05,
      "loss": 0.3596,
      "step": 26770
    },
    {
      "gate_value": 0.3667839467525482,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 26770
    },
    {
      "grad_norm": 400.9145202636719,
      "learning_rate": 8.102998162414342e-05,
      "loss": 0.3667,
      "step": 26780
    },
    {
      "gate_value": 0.3667834401130676,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 26780
    },
    {
      "grad_norm": 60948.7890625,
      "learning_rate": 8.091988133468056e-05,
      "loss": 0.3567,
      "step": 26790
    },
    {
      "gate_value": 0.3667881488800049,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 26790
    },
    {
      "grad_norm": 91.94977569580078,
      "learning_rate": 8.08098282608274e-05,
      "loss": 0.3669,
      "step": 26800
    },
    {
      "gate_value": 0.36683157086372375,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 26800
    },
    {
      "grad_norm": 136.93983459472656,
      "learning_rate": 8.069982247780416e-05,
      "loss": 0.3668,
      "step": 26810
    },
    {
      "gate_value": 0.366877943277359,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 26810
    },
    {
      "grad_norm": 3072.138671875,
      "learning_rate": 8.058986406079878e-05,
      "loss": 0.3674,
      "step": 26820
    },
    {
      "gate_value": 0.36689361929893494,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 26820
    },
    {
      "grad_norm": 4344.18505859375,
      "learning_rate": 8.047995308496684e-05,
      "loss": 0.3614,
      "step": 26830
    },
    {
      "gate_value": 0.3669084906578064,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 26830
    },
    {
      "grad_norm": 2586.574462890625,
      "learning_rate": 8.037008962543139e-05,
      "loss": 0.3793,
      "step": 26840
    },
    {
      "gate_value": 0.3669230043888092,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 26840
    },
    {
      "grad_norm": 254.2022705078125,
      "learning_rate": 8.02602737572832e-05,
      "loss": 0.3756,
      "step": 26850
    },
    {
      "gate_value": 0.3669438064098358,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 26850
    },
    {
      "grad_norm": 5953.83935546875,
      "learning_rate": 8.015050555558022e-05,
      "loss": 0.3569,
      "step": 26860
    },
    {
      "gate_value": 0.3669627010822296,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 26860
    },
    {
      "grad_norm": 683.7999267578125,
      "learning_rate": 8.004078509534807e-05,
      "loss": 0.3645,
      "step": 26870
    },
    {
      "gate_value": 0.3669717013835907,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 26870
    },
    {
      "grad_norm": 13406.55859375,
      "learning_rate": 7.99311124515796e-05,
      "loss": 0.3823,
      "step": 26880
    },
    {
      "gate_value": 0.3669739067554474,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 26880
    },
    {
      "grad_norm": 28604.58203125,
      "learning_rate": 7.982148769923513e-05,
      "loss": 0.358,
      "step": 26890
    },
    {
      "gate_value": 0.366974800825119,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 26890
    },
    {
      "grad_norm": 23105.599609375,
      "learning_rate": 7.971191091324209e-05,
      "loss": 0.3573,
      "step": 26900
    },
    {
      "gate_value": 0.3669755160808563,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 26900
    },
    {
      "grad_norm": 8306.990234375,
      "learning_rate": 7.960238216849508e-05,
      "loss": 0.3597,
      "step": 26910
    },
    {
      "gate_value": 0.3669755756855011,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 26910
    },
    {
      "grad_norm": 2806.118408203125,
      "learning_rate": 7.949290153985608e-05,
      "loss": 0.363,
      "step": 26920
    },
    {
      "gate_value": 0.36697500944137573,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 26920
    },
    {
      "grad_norm": 1918.9229736328125,
      "learning_rate": 7.938346910215402e-05,
      "loss": 0.3462,
      "step": 26930
    },
    {
      "gate_value": 0.36697521805763245,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 26930
    },
    {
      "grad_norm": 6161.009765625,
      "learning_rate": 7.927408493018493e-05,
      "loss": 0.3719,
      "step": 26940
    },
    {
      "gate_value": 0.36697572469711304,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 26940
    },
    {
      "grad_norm": 3241.89599609375,
      "learning_rate": 7.916474909871199e-05,
      "loss": 0.3612,
      "step": 26950
    },
    {
      "gate_value": 0.36697638034820557,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 26950
    },
    {
      "grad_norm": 18069.666015625,
      "learning_rate": 7.9055461682465e-05,
      "loss": 0.3615,
      "step": 26960
    },
    {
      "gate_value": 0.36697715520858765,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 26960
    },
    {
      "grad_norm": 32112.46484375,
      "learning_rate": 7.894622275614102e-05,
      "loss": 0.3598,
      "step": 26970
    },
    {
      "gate_value": 0.36697834730148315,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 26970
    },
    {
      "grad_norm": 9550.240234375,
      "learning_rate": 7.883703239440377e-05,
      "loss": 0.3606,
      "step": 26980
    },
    {
      "gate_value": 0.36697885394096375,
      "icl_sequence_length": 56,
      "num_contexts": 3,
      "step": 26980
    },
    {
      "grad_norm": 19501.4921875,
      "learning_rate": 7.872789067188391e-05,
      "loss": 0.3556,
      "step": 26990
    },
    {
      "gate_value": 0.36697813868522644,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 26990
    },
    {
      "grad_norm": 8118.97607421875,
      "learning_rate": 7.861879766317873e-05,
      "loss": 0.3732,
      "step": 27000
    },
    {
      "gate_value": 0.36697742342948914,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 27000
    },
    {
      "grad_norm": 17374.55859375,
      "learning_rate": 7.850975344285219e-05,
      "loss": 0.3678,
      "step": 27010
    },
    {
      "gate_value": 0.3669770359992981,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 27010
    },
    {
      "grad_norm": 69148.9453125,
      "learning_rate": 7.840075808543508e-05,
      "loss": 0.3632,
      "step": 27020
    },
    {
      "gate_value": 0.3669770061969757,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 27020
    },
    {
      "grad_norm": 14847.521484375,
      "learning_rate": 7.829181166542464e-05,
      "loss": 0.366,
      "step": 27030
    },
    {
      "gate_value": 0.36697685718536377,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 27030
    },
    {
      "grad_norm": 24615.54296875,
      "learning_rate": 7.81829142572848e-05,
      "loss": 0.3668,
      "step": 27040
    },
    {
      "gate_value": 0.36697646975517273,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 27040
    },
    {
      "grad_norm": 36081.73828125,
      "learning_rate": 7.807406593544592e-05,
      "loss": 0.3775,
      "step": 27050
    },
    {
      "gate_value": 0.3669762909412384,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 27050
    },
    {
      "grad_norm": 19997.75,
      "learning_rate": 7.796526677430468e-05,
      "loss": 0.3613,
      "step": 27060
    },
    {
      "gate_value": 0.3669760525226593,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 27060
    },
    {
      "grad_norm": 18589.345703125,
      "learning_rate": 7.785651684822436e-05,
      "loss": 0.377,
      "step": 27070
    },
    {
      "gate_value": 0.3669760227203369,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 27070
    },
    {
      "grad_norm": 30868.748046875,
      "learning_rate": 7.774781623153455e-05,
      "loss": 0.348,
      "step": 27080
    },
    {
      "gate_value": 0.36697590351104736,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 27080
    },
    {
      "grad_norm": 16647.330078125,
      "learning_rate": 7.7639164998531e-05,
      "loss": 0.3709,
      "step": 27090
    },
    {
      "gate_value": 0.366975873708725,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 27090
    },
    {
      "grad_norm": 35517.39453125,
      "learning_rate": 7.75305632234759e-05,
      "loss": 0.3702,
      "step": 27100
    },
    {
      "gate_value": 0.3669756054878235,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 27100
    },
    {
      "grad_norm": 31658.29296875,
      "learning_rate": 7.742201098059746e-05,
      "loss": 0.3734,
      "step": 27110
    },
    {
      "gate_value": 0.36697515845298767,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 27110
    },
    {
      "grad_norm": 13528.32421875,
      "learning_rate": 7.731350834409011e-05,
      "loss": 0.3628,
      "step": 27120
    },
    {
      "gate_value": 0.36697474122047424,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 27120
    },
    {
      "grad_norm": 17795.712890625,
      "learning_rate": 7.720505538811444e-05,
      "loss": 0.3656,
      "step": 27130
    },
    {
      "gate_value": 0.366974413394928,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 27130
    },
    {
      "grad_norm": 13977.12109375,
      "learning_rate": 7.709665218679698e-05,
      "loss": 0.3694,
      "step": 27140
    },
    {
      "gate_value": 0.36697402596473694,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 27140
    },
    {
      "grad_norm": 26547.1484375,
      "learning_rate": 7.698829881423039e-05,
      "loss": 0.3581,
      "step": 27150
    },
    {
      "gate_value": 0.3669734299182892,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 27150
    },
    {
      "grad_norm": 9316.2724609375,
      "learning_rate": 7.687999534447303e-05,
      "loss": 0.3642,
      "step": 27160
    },
    {
      "gate_value": 0.366972953081131,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 27160
    },
    {
      "grad_norm": 12746.78515625,
      "learning_rate": 7.677174185154943e-05,
      "loss": 0.3871,
      "step": 27170
    },
    {
      "gate_value": 0.36697226762771606,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 27170
    },
    {
      "grad_norm": 11482.865234375,
      "learning_rate": 7.666353840944972e-05,
      "loss": 0.3691,
      "step": 27180
    },
    {
      "gate_value": 0.36697155237197876,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 27180
    },
    {
      "grad_norm": 14710.9580078125,
      "learning_rate": 7.655538509212998e-05,
      "loss": 0.3585,
      "step": 27190
    },
    {
      "gate_value": 0.36697089672088623,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 27190
    },
    {
      "grad_norm": 8220.4599609375,
      "learning_rate": 7.644728197351205e-05,
      "loss": 0.363,
      "step": 27200
    },
    {
      "gate_value": 0.36697012186050415,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 27200
    },
    {
      "grad_norm": 2625.1962890625,
      "learning_rate": 7.633922912748328e-05,
      "loss": 0.3613,
      "step": 27210
    },
    {
      "gate_value": 0.36697015166282654,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 27210
    },
    {
      "grad_norm": 662.7630004882812,
      "learning_rate": 7.623122662789681e-05,
      "loss": 0.3677,
      "step": 27220
    },
    {
      "gate_value": 0.3669702708721161,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 27220
    },
    {
      "grad_norm": 1302.579345703125,
      "learning_rate": 7.612327454857134e-05,
      "loss": 0.3699,
      "step": 27230
    },
    {
      "gate_value": 0.3669745922088623,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 27230
    },
    {
      "grad_norm": 712.6267700195312,
      "learning_rate": 7.601537296329109e-05,
      "loss": 0.3561,
      "step": 27240
    },
    {
      "gate_value": 0.36698177456855774,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 27240
    },
    {
      "grad_norm": 1077.0098876953125,
      "learning_rate": 7.590752194580589e-05,
      "loss": 0.3671,
      "step": 27250
    },
    {
      "gate_value": 0.3669895827770233,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 27250
    },
    {
      "grad_norm": 1208.8837890625,
      "learning_rate": 7.579972156983075e-05,
      "loss": 0.3763,
      "step": 27260
    },
    {
      "gate_value": 0.36700811982154846,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 27260
    },
    {
      "grad_norm": 390.91094970703125,
      "learning_rate": 7.56919719090462e-05,
      "loss": 0.3694,
      "step": 27270
    },
    {
      "gate_value": 0.3670542538166046,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 27270
    },
    {
      "grad_norm": 269.8956604003906,
      "learning_rate": 7.558427303709817e-05,
      "loss": 0.3803,
      "step": 27280
    },
    {
      "gate_value": 0.36706268787384033,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 27280
    },
    {
      "grad_norm": 39.60105895996094,
      "learning_rate": 7.547662502759783e-05,
      "loss": 0.3695,
      "step": 27290
    },
    {
      "gate_value": 0.36706313490867615,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 27290
    },
    {
      "grad_norm": 1831.5201416015625,
      "learning_rate": 7.536902795412159e-05,
      "loss": 0.3556,
      "step": 27300
    },
    {
      "gate_value": 0.366983562707901,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 27300
    },
    {
      "grad_norm": 1605.810302734375,
      "learning_rate": 7.5261481890211e-05,
      "loss": 0.3733,
      "step": 27310
    },
    {
      "gate_value": 0.3669770359992981,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 27310
    },
    {
      "grad_norm": 68.8116683959961,
      "learning_rate": 7.515398690937279e-05,
      "loss": 0.3808,
      "step": 27320
    },
    {
      "gate_value": 0.36701732873916626,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 27320
    },
    {
      "grad_norm": 18.84232521057129,
      "learning_rate": 7.504654308507875e-05,
      "loss": 0.3664,
      "step": 27330
    },
    {
      "gate_value": 0.36712032556533813,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 27330
    },
    {
      "grad_norm": 41.52174377441406,
      "learning_rate": 7.493915049076576e-05,
      "loss": 0.3641,
      "step": 27340
    },
    {
      "gate_value": 0.36718326807022095,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 27340
    },
    {
      "grad_norm": 24.148374557495117,
      "learning_rate": 7.48318091998357e-05,
      "loss": 0.3496,
      "step": 27350
    },
    {
      "gate_value": 0.3673146963119507,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 27350
    },
    {
      "grad_norm": 14.730865478515625,
      "learning_rate": 7.472451928565523e-05,
      "loss": 0.3738,
      "step": 27360
    },
    {
      "gate_value": 0.3674127161502838,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 27360
    },
    {
      "grad_norm": 63.99334716796875,
      "learning_rate": 7.461728082155597e-05,
      "loss": 0.3546,
      "step": 27370
    },
    {
      "gate_value": 0.36764848232269287,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 27370
    },
    {
      "grad_norm": 42.657657623291016,
      "learning_rate": 7.451009388083445e-05,
      "loss": 0.3724,
      "step": 27380
    },
    {
      "gate_value": 0.3677850663661957,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 27380
    },
    {
      "grad_norm": 61.866371154785156,
      "learning_rate": 7.440295853675195e-05,
      "loss": 0.3768,
      "step": 27390
    },
    {
      "gate_value": 0.3676343560218811,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 27390
    },
    {
      "grad_norm": 7950.65771484375,
      "learning_rate": 7.42958748625345e-05,
      "loss": 0.3605,
      "step": 27400
    },
    {
      "gate_value": 0.3675735890865326,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 27400
    },
    {
      "grad_norm": 22.300975799560547,
      "learning_rate": 7.418884293137267e-05,
      "loss": 0.363,
      "step": 27410
    },
    {
      "gate_value": 0.3675788938999176,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 27410
    },
    {
      "grad_norm": 2110.2626953125,
      "learning_rate": 7.408186281642186e-05,
      "loss": 0.3656,
      "step": 27420
    },
    {
      "gate_value": 0.3675514757633209,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 27420
    },
    {
      "grad_norm": 32832.37890625,
      "learning_rate": 7.397493459080193e-05,
      "loss": 0.3811,
      "step": 27430
    },
    {
      "gate_value": 0.36745667457580566,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 27430
    },
    {
      "grad_norm": 111.27005004882812,
      "learning_rate": 7.386805832759735e-05,
      "loss": 0.3723,
      "step": 27440
    },
    {
      "gate_value": 0.36754679679870605,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 27440
    },
    {
      "grad_norm": 67.27931213378906,
      "learning_rate": 7.376123409985707e-05,
      "loss": 0.3629,
      "step": 27450
    },
    {
      "gate_value": 0.3676038086414337,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 27450
    },
    {
      "grad_norm": 74.29624938964844,
      "learning_rate": 7.36544619805944e-05,
      "loss": 0.3606,
      "step": 27460
    },
    {
      "gate_value": 0.36755993962287903,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 27460
    },
    {
      "grad_norm": 14.337113380432129,
      "learning_rate": 7.3547742042787e-05,
      "loss": 0.364,
      "step": 27470
    },
    {
      "gate_value": 0.36757075786590576,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 27470
    },
    {
      "grad_norm": 26.758407592773438,
      "learning_rate": 7.344107435937703e-05,
      "loss": 0.3781,
      "step": 27480
    },
    {
      "gate_value": 0.3676011860370636,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 27480
    },
    {
      "grad_norm": 250.94358825683594,
      "learning_rate": 7.333445900327082e-05,
      "loss": 0.3616,
      "step": 27490
    },
    {
      "gate_value": 0.3674643933773041,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 27490
    },
    {
      "grad_norm": 16.786155700683594,
      "learning_rate": 7.322789604733902e-05,
      "loss": 0.3666,
      "step": 27500
    },
    {
      "gate_value": 0.36760422587394714,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 27500
    },
    {
      "grad_norm": 35.89059066772461,
      "learning_rate": 7.31213855644163e-05,
      "loss": 0.3469,
      "step": 27510
    },
    {
      "gate_value": 0.3677339553833008,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 27510
    },
    {
      "grad_norm": 19.958139419555664,
      "learning_rate": 7.301492762730162e-05,
      "loss": 0.3761,
      "step": 27520
    },
    {
      "gate_value": 0.36780139803886414,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 27520
    },
    {
      "grad_norm": 78.30398559570312,
      "learning_rate": 7.2908522308758e-05,
      "loss": 0.3981,
      "step": 27530
    },
    {
      "gate_value": 0.36783716082572937,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 27530
    },
    {
      "grad_norm": 608.5565795898438,
      "learning_rate": 7.280216968151249e-05,
      "loss": 0.3739,
      "step": 27540
    },
    {
      "gate_value": 0.3678564429283142,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 27540
    },
    {
      "grad_norm": 15003.1396484375,
      "learning_rate": 7.269586981825602e-05,
      "loss": 0.3671,
      "step": 27550
    },
    {
      "gate_value": 0.36783695220947266,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 27550
    },
    {
      "grad_norm": 92.22562408447266,
      "learning_rate": 7.258962279164366e-05,
      "loss": 0.3814,
      "step": 27560
    },
    {
      "gate_value": 0.3677425980567932,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 27560
    },
    {
      "grad_norm": 19.08238410949707,
      "learning_rate": 7.248342867429412e-05,
      "loss": 0.3819,
      "step": 27570
    },
    {
      "gate_value": 0.3676200211048126,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 27570
    },
    {
      "grad_norm": 33.2855224609375,
      "learning_rate": 7.237728753879014e-05,
      "loss": 0.3657,
      "step": 27580
    },
    {
      "gate_value": 0.36787912249565125,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 27580
    },
    {
      "grad_norm": 50.328094482421875,
      "learning_rate": 7.22711994576782e-05,
      "loss": 0.3521,
      "step": 27590
    },
    {
      "gate_value": 0.3680151700973511,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 27590
    },
    {
      "grad_norm": 24.794818878173828,
      "learning_rate": 7.216516450346853e-05,
      "loss": 0.3669,
      "step": 27600
    },
    {
      "gate_value": 0.36789053678512573,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 27600
    },
    {
      "grad_norm": 58.86216735839844,
      "learning_rate": 7.205918274863495e-05,
      "loss": 0.371,
      "step": 27610
    },
    {
      "gate_value": 0.3678717315196991,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 27610
    },
    {
      "grad_norm": 21.35430145263672,
      "learning_rate": 7.195325426561501e-05,
      "loss": 0.353,
      "step": 27620
    },
    {
      "gate_value": 0.3678174912929535,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 27620
    },
    {
      "grad_norm": 18.842321395874023,
      "learning_rate": 7.184737912680985e-05,
      "loss": 0.3591,
      "step": 27630
    },
    {
      "gate_value": 0.3679717481136322,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 27630
    },
    {
      "grad_norm": 13.276396751403809,
      "learning_rate": 7.174155740458416e-05,
      "loss": 0.3645,
      "step": 27640
    },
    {
      "gate_value": 0.36811956763267517,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 27640
    },
    {
      "grad_norm": 7.989704608917236,
      "learning_rate": 7.163578917126602e-05,
      "loss": 0.3766,
      "step": 27650
    },
    {
      "gate_value": 0.3683876693248749,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 27650
    },
    {
      "grad_norm": 12.756558418273926,
      "learning_rate": 7.15300744991471e-05,
      "loss": 0.35,
      "step": 27660
    },
    {
      "gate_value": 0.3684402108192444,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 27660
    },
    {
      "grad_norm": 20.83132553100586,
      "learning_rate": 7.142441346048227e-05,
      "loss": 0.3865,
      "step": 27670
    },
    {
      "gate_value": 0.36845216155052185,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 27670
    },
    {
      "grad_norm": 19.569744110107422,
      "learning_rate": 7.131880612748991e-05,
      "loss": 0.3727,
      "step": 27680
    },
    {
      "gate_value": 0.3685762584209442,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 27680
    },
    {
      "grad_norm": 185.0944366455078,
      "learning_rate": 7.121325257235165e-05,
      "loss": 0.366,
      "step": 27690
    },
    {
      "gate_value": 0.3684345483779907,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 27690
    },
    {
      "grad_norm": 700.8203125,
      "learning_rate": 7.110775286721242e-05,
      "loss": 0.362,
      "step": 27700
    },
    {
      "gate_value": 0.3685183525085449,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 27700
    },
    {
      "grad_norm": 26.56828498840332,
      "learning_rate": 7.10023070841801e-05,
      "loss": 0.3669,
      "step": 27710
    },
    {
      "gate_value": 0.3687787652015686,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 27710
    },
    {
      "grad_norm": 26.16849136352539,
      "learning_rate": 7.0896915295326e-05,
      "loss": 0.3551,
      "step": 27720
    },
    {
      "gate_value": 0.36871036887168884,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 27720
    },
    {
      "grad_norm": 28.85477066040039,
      "learning_rate": 7.079157757268446e-05,
      "loss": 0.3546,
      "step": 27730
    },
    {
      "gate_value": 0.3683936297893524,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 27730
    },
    {
      "grad_norm": 54.32163619995117,
      "learning_rate": 7.06862939882527e-05,
      "loss": 0.3639,
      "step": 27740
    },
    {
      "gate_value": 0.3683428466320038,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 27740
    },
    {
      "grad_norm": 954.1820068359375,
      "learning_rate": 7.058106461399111e-05,
      "loss": 0.3498,
      "step": 27750
    },
    {
      "gate_value": 0.3682975769042969,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 27750
    },
    {
      "grad_norm": 58.116432189941406,
      "learning_rate": 7.047588952182304e-05,
      "loss": 0.3796,
      "step": 27760
    },
    {
      "gate_value": 0.36843323707580566,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 27760
    },
    {
      "grad_norm": 24.86961555480957,
      "learning_rate": 7.037076878363458e-05,
      "loss": 0.3611,
      "step": 27770
    },
    {
      "gate_value": 0.36853909492492676,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 27770
    },
    {
      "grad_norm": 23.842926025390625,
      "learning_rate": 7.026570247127476e-05,
      "loss": 0.3666,
      "step": 27780
    },
    {
      "gate_value": 0.3685752749443054,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 27780
    },
    {
      "grad_norm": 81.14399719238281,
      "learning_rate": 7.016069065655547e-05,
      "loss": 0.3641,
      "step": 27790
    },
    {
      "gate_value": 0.3687325716018677,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 27790
    },
    {
      "grad_norm": 14.452661514282227,
      "learning_rate": 7.005573341125133e-05,
      "loss": 0.3661,
      "step": 27800
    },
    {
      "gate_value": 0.36890509724617004,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 27800
    },
    {
      "grad_norm": 12.318946838378906,
      "learning_rate": 6.995083080709951e-05,
      "loss": 0.3793,
      "step": 27810
    },
    {
      "gate_value": 0.36927101016044617,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 27810
    },
    {
      "grad_norm": 9.148600578308105,
      "learning_rate": 6.98459829158001e-05,
      "loss": 0.3592,
      "step": 27820
    },
    {
      "gate_value": 0.36944547295570374,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 27820
    },
    {
      "grad_norm": 11.932467460632324,
      "learning_rate": 6.974118980901546e-05,
      "loss": 0.3548,
      "step": 27830
    },
    {
      "gate_value": 0.3691197633743286,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 27830
    },
    {
      "grad_norm": 13.40058422088623,
      "learning_rate": 6.963645155837084e-05,
      "loss": 0.3606,
      "step": 27840
    },
    {
      "gate_value": 0.3690016567707062,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 27840
    },
    {
      "grad_norm": 9.346571922302246,
      "learning_rate": 6.953176823545375e-05,
      "loss": 0.381,
      "step": 27850
    },
    {
      "gate_value": 0.3696131706237793,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 27850
    },
    {
      "grad_norm": 16.809316635131836,
      "learning_rate": 6.942713991181439e-05,
      "loss": 0.3705,
      "step": 27860
    },
    {
      "gate_value": 0.36983686685562134,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 27860
    },
    {
      "grad_norm": 28.75189971923828,
      "learning_rate": 6.932256665896507e-05,
      "loss": 0.358,
      "step": 27870
    },
    {
      "gate_value": 0.3699440658092499,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 27870
    },
    {
      "grad_norm": 6.663064002990723,
      "learning_rate": 6.92180485483807e-05,
      "loss": 0.3628,
      "step": 27880
    },
    {
      "gate_value": 0.3699015974998474,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 27880
    },
    {
      "grad_norm": 16.0593204498291,
      "learning_rate": 6.911358565149842e-05,
      "loss": 0.3758,
      "step": 27890
    },
    {
      "gate_value": 0.36944106221199036,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 27890
    },
    {
      "grad_norm": 8.876472473144531,
      "learning_rate": 6.90091780397177e-05,
      "loss": 0.373,
      "step": 27900
    },
    {
      "gate_value": 0.3694072961807251,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 27900
    },
    {
      "grad_norm": 38.90917205810547,
      "learning_rate": 6.890482578440002e-05,
      "loss": 0.3733,
      "step": 27910
    },
    {
      "gate_value": 0.3693521320819855,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 27910
    },
    {
      "grad_norm": 16.983854293823242,
      "learning_rate": 6.88005289568693e-05,
      "loss": 0.3763,
      "step": 27920
    },
    {
      "gate_value": 0.36918890476226807,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 27920
    },
    {
      "grad_norm": 11.926179885864258,
      "learning_rate": 6.869628762841132e-05,
      "loss": 0.3716,
      "step": 27930
    },
    {
      "gate_value": 0.3690800368785858,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 27930
    },
    {
      "grad_norm": 47.29314041137695,
      "learning_rate": 6.859210187027408e-05,
      "loss": 0.3642,
      "step": 27940
    },
    {
      "gate_value": 0.36900752782821655,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 27940
    },
    {
      "grad_norm": 13.692339897155762,
      "learning_rate": 6.848797175366759e-05,
      "loss": 0.365,
      "step": 27950
    },
    {
      "gate_value": 0.36898988485336304,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 27950
    },
    {
      "grad_norm": 15.379136085510254,
      "learning_rate": 6.838389734976386e-05,
      "loss": 0.3793,
      "step": 27960
    },
    {
      "gate_value": 0.36915260553359985,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 27960
    },
    {
      "grad_norm": 34.57461929321289,
      "learning_rate": 6.827987872969663e-05,
      "loss": 0.3821,
      "step": 27970
    },
    {
      "gate_value": 0.3692905008792877,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 27970
    },
    {
      "grad_norm": 23.75289535522461,
      "learning_rate": 6.817591596456173e-05,
      "loss": 0.3673,
      "step": 27980
    },
    {
      "gate_value": 0.36970794200897217,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 27980
    },
    {
      "grad_norm": 43.34590148925781,
      "learning_rate": 6.80720091254167e-05,
      "loss": 0.3745,
      "step": 27990
    },
    {
      "gate_value": 0.36998069286346436,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 27990
    },
    {
      "grad_norm": 9.07591438293457,
      "learning_rate": 6.796815828328096e-05,
      "loss": 0.3683,
      "step": 28000
    },
    {
      "gate_value": 0.37007784843444824,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 28000
    },
    {
      "grad_norm": 15.812060356140137,
      "learning_rate": 6.78643635091355e-05,
      "loss": 0.368,
      "step": 28010
    },
    {
      "gate_value": 0.3701871633529663,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 28010
    },
    {
      "grad_norm": 753.4508056640625,
      "learning_rate": 6.776062487392305e-05,
      "loss": 0.3807,
      "step": 28020
    },
    {
      "gate_value": 0.3702496886253357,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 28020
    },
    {
      "grad_norm": 16.918176651000977,
      "learning_rate": 6.765694244854803e-05,
      "loss": 0.3642,
      "step": 28030
    },
    {
      "gate_value": 0.3702527582645416,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 28030
    },
    {
      "grad_norm": 28.58259391784668,
      "learning_rate": 6.75533163038764e-05,
      "loss": 0.3822,
      "step": 28040
    },
    {
      "gate_value": 0.3701658546924591,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 28040
    },
    {
      "grad_norm": 16.40875244140625,
      "learning_rate": 6.744974651073563e-05,
      "loss": 0.362,
      "step": 28050
    },
    {
      "gate_value": 0.37029552459716797,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 28050
    },
    {
      "grad_norm": 6.374593734741211,
      "learning_rate": 6.734623313991478e-05,
      "loss": 0.384,
      "step": 28060
    },
    {
      "gate_value": 0.3704751133918762,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 28060
    },
    {
      "grad_norm": 18.405296325683594,
      "learning_rate": 6.724277626216416e-05,
      "loss": 0.3776,
      "step": 28070
    },
    {
      "gate_value": 0.37067288160324097,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 28070
    },
    {
      "grad_norm": 12.541691780090332,
      "learning_rate": 6.71393759481956e-05,
      "loss": 0.3754,
      "step": 28080
    },
    {
      "gate_value": 0.37075117230415344,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 28080
    },
    {
      "grad_norm": 29.952152252197266,
      "learning_rate": 6.703603226868226e-05,
      "loss": 0.3633,
      "step": 28090
    },
    {
      "gate_value": 0.3708937466144562,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 28090
    },
    {
      "grad_norm": 13.219239234924316,
      "learning_rate": 6.69327452942586e-05,
      "loss": 0.3807,
      "step": 28100
    },
    {
      "gate_value": 0.3709869384765625,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 28100
    },
    {
      "grad_norm": 28.397701263427734,
      "learning_rate": 6.682951509552025e-05,
      "loss": 0.3772,
      "step": 28110
    },
    {
      "gate_value": 0.3711105287075043,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 28110
    },
    {
      "grad_norm": 40.45359420776367,
      "learning_rate": 6.672634174302405e-05,
      "loss": 0.3859,
      "step": 28120
    },
    {
      "gate_value": 0.3712112307548523,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 28120
    },
    {
      "grad_norm": 24.582040786743164,
      "learning_rate": 6.662322530728805e-05,
      "loss": 0.3726,
      "step": 28130
    },
    {
      "gate_value": 0.3713975250720978,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 28130
    },
    {
      "grad_norm": 39473.47265625,
      "learning_rate": 6.652016585879133e-05,
      "loss": 0.376,
      "step": 28140
    },
    {
      "gate_value": 0.37156111001968384,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 28140
    },
    {
      "grad_norm": 38.606075286865234,
      "learning_rate": 6.64171634679741e-05,
      "loss": 0.3557,
      "step": 28150
    },
    {
      "gate_value": 0.37173065543174744,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 28150
    },
    {
      "grad_norm": 25.573253631591797,
      "learning_rate": 6.631421820523755e-05,
      "loss": 0.3831,
      "step": 28160
    },
    {
      "gate_value": 0.3719416856765747,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 28160
    },
    {
      "grad_norm": 10.938640594482422,
      "learning_rate": 6.621133014094367e-05,
      "loss": 0.3608,
      "step": 28170
    },
    {
      "gate_value": 0.3720950484275818,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 28170
    },
    {
      "grad_norm": 7.006185054779053,
      "learning_rate": 6.610849934541557e-05,
      "loss": 0.3519,
      "step": 28180
    },
    {
      "gate_value": 0.3721492886543274,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 28180
    },
    {
      "grad_norm": 10.840231895446777,
      "learning_rate": 6.600572588893712e-05,
      "loss": 0.3721,
      "step": 28190
    },
    {
      "gate_value": 0.3722116947174072,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 28190
    },
    {
      "grad_norm": 11.135544776916504,
      "learning_rate": 6.590300984175306e-05,
      "loss": 0.3454,
      "step": 28200
    },
    {
      "gate_value": 0.37256065011024475,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 28200
    },
    {
      "grad_norm": 116.5069580078125,
      "learning_rate": 6.580035127406874e-05,
      "loss": 0.3552,
      "step": 28210
    },
    {
      "gate_value": 0.37304627895355225,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 28210
    },
    {
      "grad_norm": 11.667895317077637,
      "learning_rate": 6.569775025605042e-05,
      "loss": 0.3636,
      "step": 28220
    },
    {
      "gate_value": 0.3732167184352875,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 28220
    },
    {
      "grad_norm": 8.537809371948242,
      "learning_rate": 6.559520685782481e-05,
      "loss": 0.3553,
      "step": 28230
    },
    {
      "gate_value": 0.373494029045105,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 28230
    },
    {
      "grad_norm": 13.881553649902344,
      "learning_rate": 6.549272114947945e-05,
      "loss": 0.3553,
      "step": 28240
    },
    {
      "gate_value": 0.37379199266433716,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 28240
    },
    {
      "grad_norm": 16.013992309570312,
      "learning_rate": 6.539029320106232e-05,
      "loss": 0.3729,
      "step": 28250
    },
    {
      "gate_value": 0.3742316961288452,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 28250
    },
    {
      "grad_norm": 831.845703125,
      "learning_rate": 6.5287923082582e-05,
      "loss": 0.3863,
      "step": 28260
    },
    {
      "gate_value": 0.3742944896221161,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 28260
    },
    {
      "grad_norm": 21.637969970703125,
      "learning_rate": 6.518561086400742e-05,
      "loss": 0.3844,
      "step": 28270
    },
    {
      "gate_value": 0.3739367127418518,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 28270
    },
    {
      "grad_norm": 46.8300895690918,
      "learning_rate": 6.508335661526808e-05,
      "loss": 0.3752,
      "step": 28280
    },
    {
      "gate_value": 0.3740937113761902,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 28280
    },
    {
      "grad_norm": 19.60430145263672,
      "learning_rate": 6.498116040625382e-05,
      "loss": 0.3773,
      "step": 28290
    },
    {
      "gate_value": 0.3742987811565399,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 28290
    },
    {
      "grad_norm": 13.661314010620117,
      "learning_rate": 6.487902230681468e-05,
      "loss": 0.3614,
      "step": 28300
    },
    {
      "gate_value": 0.3744341731071472,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 28300
    },
    {
      "grad_norm": 103.14254760742188,
      "learning_rate": 6.477694238676116e-05,
      "loss": 0.3723,
      "step": 28310
    },
    {
      "gate_value": 0.37441954016685486,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 28310
    },
    {
      "grad_norm": 2326.118896484375,
      "learning_rate": 6.467492071586395e-05,
      "loss": 0.3766,
      "step": 28320
    },
    {
      "gate_value": 0.37444210052490234,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 28320
    },
    {
      "grad_norm": 409.30987548828125,
      "learning_rate": 6.457295736385383e-05,
      "loss": 0.3519,
      "step": 28330
    },
    {
      "gate_value": 0.37455645203590393,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 28330
    },
    {
      "grad_norm": 36.271575927734375,
      "learning_rate": 6.447105240042181e-05,
      "loss": 0.3723,
      "step": 28340
    },
    {
      "gate_value": 0.3745886981487274,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 28340
    },
    {
      "grad_norm": 9.298629760742188,
      "learning_rate": 6.4369205895219e-05,
      "loss": 0.3602,
      "step": 28350
    },
    {
      "gate_value": 0.37470346689224243,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 28350
    },
    {
      "grad_norm": 19.39296531677246,
      "learning_rate": 6.426741791785656e-05,
      "loss": 0.3644,
      "step": 28360
    },
    {
      "gate_value": 0.37486281991004944,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 28360
    },
    {
      "grad_norm": 15.315899848937988,
      "learning_rate": 6.416568853790549e-05,
      "loss": 0.3484,
      "step": 28370
    },
    {
      "gate_value": 0.3749953508377075,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 28370
    },
    {
      "grad_norm": 19.064598083496094,
      "learning_rate": 6.406401782489702e-05,
      "loss": 0.3667,
      "step": 28380
    },
    {
      "gate_value": 0.37507161498069763,
      "icl_sequence_length": 96,
      "num_contexts": 3,
      "step": 28380
    },
    {
      "grad_norm": 31.075756072998047,
      "learning_rate": 6.396240584832196e-05,
      "loss": 0.3682,
      "step": 28390
    },
    {
      "gate_value": 0.37511423230171204,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 28390
    },
    {
      "grad_norm": 23.362079620361328,
      "learning_rate": 6.386085267763122e-05,
      "loss": 0.3665,
      "step": 28400
    },
    {
      "gate_value": 0.3751569092273712,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 28400
    },
    {
      "grad_norm": 125.1460189819336,
      "learning_rate": 6.375935838223545e-05,
      "loss": 0.3623,
      "step": 28410
    },
    {
      "gate_value": 0.37524327635765076,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 28410
    },
    {
      "grad_norm": 112.93570709228516,
      "learning_rate": 6.365792303150505e-05,
      "loss": 0.3568,
      "step": 28420
    },
    {
      "gate_value": 0.3753010928630829,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 28420
    },
    {
      "grad_norm": 41.79950714111328,
      "learning_rate": 6.355654669477006e-05,
      "loss": 0.3566,
      "step": 28430
    },
    {
      "gate_value": 0.37531712651252747,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 28430
    },
    {
      "grad_norm": 49.83927917480469,
      "learning_rate": 6.345522944132031e-05,
      "loss": 0.3622,
      "step": 28440
    },
    {
      "gate_value": 0.37537771463394165,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 28440
    },
    {
      "grad_norm": 21.510791778564453,
      "learning_rate": 6.335397134040515e-05,
      "loss": 0.3845,
      "step": 28450
    },
    {
      "gate_value": 0.3755156397819519,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 28450
    },
    {
      "grad_norm": 161.8980255126953,
      "learning_rate": 6.325277246123362e-05,
      "loss": 0.3502,
      "step": 28460
    },
    {
      "gate_value": 0.3758998513221741,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 28460
    },
    {
      "grad_norm": 12.291893005371094,
      "learning_rate": 6.315163287297407e-05,
      "loss": 0.3496,
      "step": 28470
    },
    {
      "gate_value": 0.3761850595474243,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 28470
    },
    {
      "grad_norm": 58.854331970214844,
      "learning_rate": 6.305055264475457e-05,
      "loss": 0.3675,
      "step": 28480
    },
    {
      "gate_value": 0.3762570023536682,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 28480
    },
    {
      "grad_norm": 75.17032623291016,
      "learning_rate": 6.294953184566241e-05,
      "loss": 0.3631,
      "step": 28490
    },
    {
      "gate_value": 0.3762133717536926,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 28490
    },
    {
      "grad_norm": 26.945207595825195,
      "learning_rate": 6.284857054474439e-05,
      "loss": 0.3608,
      "step": 28500
    },
    {
      "gate_value": 0.3762674033641815,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 28500
    },
    {
      "grad_norm": 4600.4619140625,
      "learning_rate": 6.274766881100662e-05,
      "loss": 0.3755,
      "step": 28510
    },
    {
      "gate_value": 0.37631040811538696,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 28510
    },
    {
      "grad_norm": 8.499515533447266,
      "learning_rate": 6.264682671341452e-05,
      "loss": 0.3622,
      "step": 28520
    },
    {
      "gate_value": 0.3764779567718506,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 28520
    },
    {
      "grad_norm": 24.589887619018555,
      "learning_rate": 6.254604432089263e-05,
      "loss": 0.3621,
      "step": 28530
    },
    {
      "gate_value": 0.3765389025211334,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 28530
    },
    {
      "grad_norm": 1568.8673095703125,
      "learning_rate": 6.24453217023248e-05,
      "loss": 0.3611,
      "step": 28540
    },
    {
      "gate_value": 0.37654218077659607,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 28540
    },
    {
      "grad_norm": 22.204059600830078,
      "learning_rate": 6.2344658926554e-05,
      "loss": 0.3413,
      "step": 28550
    },
    {
      "gate_value": 0.3768579065799713,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 28550
    },
    {
      "grad_norm": 27.134193420410156,
      "learning_rate": 6.224405606238233e-05,
      "loss": 0.3641,
      "step": 28560
    },
    {
      "gate_value": 0.3769153356552124,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 28560
    },
    {
      "grad_norm": 23.41484260559082,
      "learning_rate": 6.214351317857085e-05,
      "loss": 0.3559,
      "step": 28570
    },
    {
      "gate_value": 0.3770342767238617,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 28570
    },
    {
      "grad_norm": 16.700103759765625,
      "learning_rate": 6.204303034383964e-05,
      "loss": 0.3532,
      "step": 28580
    },
    {
      "gate_value": 0.3770424425601959,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 28580
    },
    {
      "grad_norm": 25.210561752319336,
      "learning_rate": 6.194260762686779e-05,
      "loss": 0.3631,
      "step": 28590
    },
    {
      "gate_value": 0.37705928087234497,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 28590
    },
    {
      "grad_norm": 7.759881019592285,
      "learning_rate": 6.184224509629329e-05,
      "loss": 0.3506,
      "step": 28600
    },
    {
      "gate_value": 0.3770466148853302,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 28600
    },
    {
      "grad_norm": 23.259002685546875,
      "learning_rate": 6.1741942820713e-05,
      "loss": 0.3729,
      "step": 28610
    },
    {
      "gate_value": 0.37719234824180603,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 28610
    },
    {
      "grad_norm": 8.094666481018066,
      "learning_rate": 6.164170086868262e-05,
      "loss": 0.3661,
      "step": 28620
    },
    {
      "gate_value": 0.37746044993400574,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 28620
    },
    {
      "grad_norm": 12.716617584228516,
      "learning_rate": 6.154151930871646e-05,
      "loss": 0.3748,
      "step": 28630
    },
    {
      "gate_value": 0.3775232434272766,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 28630
    },
    {
      "grad_norm": 28.23755645751953,
      "learning_rate": 6.144139820928774e-05,
      "loss": 0.3596,
      "step": 28640
    },
    {
      "gate_value": 0.37761175632476807,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 28640
    },
    {
      "grad_norm": 20.320539474487305,
      "learning_rate": 6.134133763882831e-05,
      "loss": 0.3725,
      "step": 28650
    },
    {
      "gate_value": 0.37779858708381653,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 28650
    },
    {
      "grad_norm": 13.410603523254395,
      "learning_rate": 6.124133766572864e-05,
      "loss": 0.3821,
      "step": 28660
    },
    {
      "gate_value": 0.37793204188346863,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 28660
    },
    {
      "grad_norm": 19.16880989074707,
      "learning_rate": 6.114139835833773e-05,
      "loss": 0.377,
      "step": 28670
    },
    {
      "gate_value": 0.3780597150325775,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 28670
    },
    {
      "grad_norm": 23.84910774230957,
      "learning_rate": 6.10415197849631e-05,
      "loss": 0.3646,
      "step": 28680
    },
    {
      "gate_value": 0.37815070152282715,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 28680
    },
    {
      "grad_norm": 13.212060928344727,
      "learning_rate": 6.094170201387089e-05,
      "loss": 0.3655,
      "step": 28690
    },
    {
      "gate_value": 0.3782776892185211,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 28690
    },
    {
      "grad_norm": 45.62289810180664,
      "learning_rate": 6.084194511328556e-05,
      "loss": 0.3452,
      "step": 28700
    },
    {
      "gate_value": 0.3785243332386017,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 28700
    },
    {
      "grad_norm": 36.22829818725586,
      "learning_rate": 6.0742249151390046e-05,
      "loss": 0.3604,
      "step": 28710
    },
    {
      "gate_value": 0.3787309229373932,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 28710
    },
    {
      "grad_norm": 35.166202545166016,
      "learning_rate": 6.064261419632564e-05,
      "loss": 0.3781,
      "step": 28720
    },
    {
      "gate_value": 0.3788740932941437,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 28720
    },
    {
      "grad_norm": 20.661985397338867,
      "learning_rate": 6.054304031619178e-05,
      "loss": 0.3627,
      "step": 28730
    },
    {
      "gate_value": 0.3791327178478241,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 28730
    },
    {
      "grad_norm": 23.819217681884766,
      "learning_rate": 6.044352757904634e-05,
      "loss": 0.3651,
      "step": 28740
    },
    {
      "gate_value": 0.379306435585022,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 28740
    },
    {
      "grad_norm": 276.656982421875,
      "learning_rate": 6.0344076052905324e-05,
      "loss": 0.3534,
      "step": 28750
    },
    {
      "gate_value": 0.37942585349082947,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 28750
    },
    {
      "grad_norm": 19.64164924621582,
      "learning_rate": 6.024468580574299e-05,
      "loss": 0.3464,
      "step": 28760
    },
    {
      "gate_value": 0.3795641362667084,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 28760
    },
    {
      "grad_norm": 38.871768951416016,
      "learning_rate": 6.014535690549156e-05,
      "loss": 0.3599,
      "step": 28770
    },
    {
      "gate_value": 0.3797716200351715,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 28770
    },
    {
      "grad_norm": 26.767852783203125,
      "learning_rate": 6.004608942004135e-05,
      "loss": 0.3468,
      "step": 28780
    },
    {
      "gate_value": 0.37991610169410706,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 28780
    },
    {
      "grad_norm": 17.371288299560547,
      "learning_rate": 5.994688341724081e-05,
      "loss": 0.3654,
      "step": 28790
    },
    {
      "gate_value": 0.3800429701805115,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 28790
    },
    {
      "grad_norm": 44.015480041503906,
      "learning_rate": 5.9847738964896305e-05,
      "loss": 0.3749,
      "step": 28800
    },
    {
      "gate_value": 0.3801386058330536,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 28800
    },
    {
      "grad_norm": 20.374393463134766,
      "learning_rate": 5.974865613077213e-05,
      "loss": 0.3558,
      "step": 28810
    },
    {
      "gate_value": 0.38034605979919434,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 28810
    },
    {
      "grad_norm": 56.06901168823242,
      "learning_rate": 5.964963498259052e-05,
      "loss": 0.3534,
      "step": 28820
    },
    {
      "gate_value": 0.3804914653301239,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 28820
    },
    {
      "grad_norm": 38.3012580871582,
      "learning_rate": 5.95506755880314e-05,
      "loss": 0.3617,
      "step": 28830
    },
    {
      "gate_value": 0.38055482506752014,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 28830
    },
    {
      "grad_norm": 13.337339401245117,
      "learning_rate": 5.945177801473262e-05,
      "loss": 0.3702,
      "step": 28840
    },
    {
      "gate_value": 0.38060054183006287,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 28840
    },
    {
      "grad_norm": 10.94632339477539,
      "learning_rate": 5.935294233028982e-05,
      "loss": 0.3725,
      "step": 28850
    },
    {
      "gate_value": 0.3807608187198639,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 28850
    },
    {
      "grad_norm": 130.21417236328125,
      "learning_rate": 5.925416860225611e-05,
      "loss": 0.3602,
      "step": 28860
    },
    {
      "gate_value": 0.38078853487968445,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 28860
    },
    {
      "grad_norm": 18.017894744873047,
      "learning_rate": 5.915545689814254e-05,
      "loss": 0.3474,
      "step": 28870
    },
    {
      "gate_value": 0.38082894682884216,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 28870
    },
    {
      "grad_norm": 547.781494140625,
      "learning_rate": 5.905680728541752e-05,
      "loss": 0.3553,
      "step": 28880
    },
    {
      "gate_value": 0.3808595836162567,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 28880
    },
    {
      "grad_norm": 23.99994468688965,
      "learning_rate": 5.895821983150718e-05,
      "loss": 0.3673,
      "step": 28890
    },
    {
      "gate_value": 0.38086777925491333,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 28890
    },
    {
      "grad_norm": 29.337862014770508,
      "learning_rate": 5.8859694603795116e-05,
      "loss": 0.3717,
      "step": 28900
    },
    {
      "gate_value": 0.3809985816478729,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 28900
    },
    {
      "grad_norm": 36.2806510925293,
      "learning_rate": 5.876123166962238e-05,
      "loss": 0.3684,
      "step": 28910
    },
    {
      "gate_value": 0.38116392493247986,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 28910
    },
    {
      "grad_norm": 11.751944541931152,
      "learning_rate": 5.8662831096287515e-05,
      "loss": 0.3695,
      "step": 28920
    },
    {
      "gate_value": 0.38121894001960754,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 28920
    },
    {
      "grad_norm": 16.78483772277832,
      "learning_rate": 5.8564492951046285e-05,
      "loss": 0.365,
      "step": 28930
    },
    {
      "gate_value": 0.38121622800827026,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 28930
    },
    {
      "grad_norm": 11.2169828414917,
      "learning_rate": 5.846621730111199e-05,
      "loss": 0.366,
      "step": 28940
    },
    {
      "gate_value": 0.3812275826931,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 28940
    },
    {
      "grad_norm": 14.724105834960938,
      "learning_rate": 5.836800421365502e-05,
      "loss": 0.3633,
      "step": 28950
    },
    {
      "gate_value": 0.3812483251094818,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 28950
    },
    {
      "grad_norm": 21.191669464111328,
      "learning_rate": 5.826985375580312e-05,
      "loss": 0.3573,
      "step": 28960
    },
    {
      "gate_value": 0.38138577342033386,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 28960
    },
    {
      "grad_norm": 20.08879852294922,
      "learning_rate": 5.8171765994641274e-05,
      "loss": 0.3494,
      "step": 28970
    },
    {
      "gate_value": 0.38147062063217163,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 28970
    },
    {
      "grad_norm": 3683.35107421875,
      "learning_rate": 5.807374099721142e-05,
      "loss": 0.3737,
      "step": 28980
    },
    {
      "gate_value": 0.38168245553970337,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 28980
    },
    {
      "grad_norm": 42.52603530883789,
      "learning_rate": 5.7975778830512784e-05,
      "loss": 0.3509,
      "step": 28990
    },
    {
      "gate_value": 0.38184642791748047,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 28990
    },
    {
      "grad_norm": 34.126991271972656,
      "learning_rate": 5.7877879561501596e-05,
      "loss": 0.3557,
      "step": 29000
    },
    {
      "gate_value": 0.38198453187942505,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 29000
    },
    {
      "grad_norm": 45.88672637939453,
      "learning_rate": 5.778004325709105e-05,
      "loss": 0.3735,
      "step": 29010
    },
    {
      "gate_value": 0.38210225105285645,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 29010
    },
    {
      "grad_norm": 358.89947509765625,
      "learning_rate": 5.768226998415142e-05,
      "loss": 0.3599,
      "step": 29020
    },
    {
      "gate_value": 0.3822075128555298,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 29020
    },
    {
      "grad_norm": 31.956314086914062,
      "learning_rate": 5.758455980950974e-05,
      "loss": 0.3661,
      "step": 29030
    },
    {
      "gate_value": 0.38228172063827515,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 29030
    },
    {
      "grad_norm": 35.25785446166992,
      "learning_rate": 5.7486912799949956e-05,
      "loss": 0.3525,
      "step": 29040
    },
    {
      "gate_value": 0.3823954463005066,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 29040
    },
    {
      "grad_norm": 57.509822845458984,
      "learning_rate": 5.738932902221294e-05,
      "loss": 0.3745,
      "step": 29050
    },
    {
      "gate_value": 0.38250795006752014,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 29050
    },
    {
      "grad_norm": 16.257240295410156,
      "learning_rate": 5.7291808542996245e-05,
      "loss": 0.3593,
      "step": 29060
    },
    {
      "gate_value": 0.38265126943588257,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 29060
    },
    {
      "grad_norm": 556.2877197265625,
      "learning_rate": 5.719435142895429e-05,
      "loss": 0.3626,
      "step": 29070
    },
    {
      "gate_value": 0.3827889859676361,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 29070
    },
    {
      "grad_norm": 618.673583984375,
      "learning_rate": 5.709695774669799e-05,
      "loss": 0.3487,
      "step": 29080
    },
    {
      "gate_value": 0.3828449547290802,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 29080
    },
    {
      "grad_norm": 479.97174072265625,
      "learning_rate": 5.699962756279504e-05,
      "loss": 0.3614,
      "step": 29090
    },
    {
      "gate_value": 0.38286563754081726,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 29090
    },
    {
      "grad_norm": 36.629093170166016,
      "learning_rate": 5.690236094376969e-05,
      "loss": 0.3593,
      "step": 29100
    },
    {
      "gate_value": 0.38298630714416504,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 29100
    },
    {
      "grad_norm": 30.256851196289062,
      "learning_rate": 5.68051579561028e-05,
      "loss": 0.3752,
      "step": 29110
    },
    {
      "gate_value": 0.38303476572036743,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 29110
    },
    {
      "grad_norm": 31.483182907104492,
      "learning_rate": 5.670801866623171e-05,
      "loss": 0.3649,
      "step": 29120
    },
    {
      "gate_value": 0.38311877846717834,
      "icl_sequence_length": 58,
      "num_contexts": 3,
      "step": 29120
    },
    {
      "grad_norm": 9.485666275024414,
      "learning_rate": 5.661094314055018e-05,
      "loss": 0.3615,
      "step": 29130
    },
    {
      "gate_value": 0.3831852078437805,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 29130
    },
    {
      "grad_norm": 12.52168083190918,
      "learning_rate": 5.651393144540834e-05,
      "loss": 0.3737,
      "step": 29140
    },
    {
      "gate_value": 0.3834601640701294,
      "icl_sequence_length": 58,
      "num_contexts": 3,
      "step": 29140
    },
    {
      "grad_norm": 245.93165588378906,
      "learning_rate": 5.641698364711286e-05,
      "loss": 0.3661,
      "step": 29150
    },
    {
      "gate_value": 0.3837195336818695,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 29150
    },
    {
      "grad_norm": 12.20414924621582,
      "learning_rate": 5.632009981192661e-05,
      "loss": 0.375,
      "step": 29160
    },
    {
      "gate_value": 0.3836742043495178,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 29160
    },
    {
      "grad_norm": 117.4782485961914,
      "learning_rate": 5.6223280006068835e-05,
      "loss": 0.3521,
      "step": 29170
    },
    {
      "gate_value": 0.3838871717453003,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 29170
    },
    {
      "grad_norm": 23.043319702148438,
      "learning_rate": 5.612652429571487e-05,
      "loss": 0.3451,
      "step": 29180
    },
    {
      "gate_value": 0.38392624258995056,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 29180
    },
    {
      "grad_norm": 22.326448440551758,
      "learning_rate": 5.6029832746996375e-05,
      "loss": 0.3823,
      "step": 29190
    },
    {
      "gate_value": 0.3842223286628723,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 29190
    },
    {
      "grad_norm": 42.24995040893555,
      "learning_rate": 5.593320542600111e-05,
      "loss": 0.3669,
      "step": 29200
    },
    {
      "gate_value": 0.38453155755996704,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 29200
    },
    {
      "grad_norm": 14.570834159851074,
      "learning_rate": 5.583664239877294e-05,
      "loss": 0.3474,
      "step": 29210
    },
    {
      "gate_value": 0.3847702741622925,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 29210
    },
    {
      "grad_norm": 14.977248191833496,
      "learning_rate": 5.574014373131184e-05,
      "loss": 0.3551,
      "step": 29220
    },
    {
      "gate_value": 0.3847298324108124,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 29220
    },
    {
      "grad_norm": 18.472740173339844,
      "learning_rate": 5.5643709489573675e-05,
      "loss": 0.3746,
      "step": 29230
    },
    {
      "gate_value": 0.3847012519836426,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 29230
    },
    {
      "grad_norm": 23.11821937561035,
      "learning_rate": 5.554733973947029e-05,
      "loss": 0.374,
      "step": 29240
    },
    {
      "gate_value": 0.38457468152046204,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 29240
    },
    {
      "grad_norm": 19.69847869873047,
      "learning_rate": 5.545103454686957e-05,
      "loss": 0.3472,
      "step": 29250
    },
    {
      "gate_value": 0.38453060388565063,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 29250
    },
    {
      "grad_norm": 11.556638717651367,
      "learning_rate": 5.535479397759519e-05,
      "loss": 0.3721,
      "step": 29260
    },
    {
      "gate_value": 0.38458558917045593,
      "icl_sequence_length": 96,
      "num_contexts": 3,
      "step": 29260
    },
    {
      "grad_norm": 15350.7255859375,
      "learning_rate": 5.5258618097426735e-05,
      "loss": 0.3611,
      "step": 29270
    },
    {
      "gate_value": 0.3847283720970154,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 29270
    },
    {
      "grad_norm": 26.339685440063477,
      "learning_rate": 5.516250697209938e-05,
      "loss": 0.3536,
      "step": 29280
    },
    {
      "gate_value": 0.38488730788230896,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 29280
    },
    {
      "grad_norm": 34.276817321777344,
      "learning_rate": 5.5066460667304254e-05,
      "loss": 0.3833,
      "step": 29290
    },
    {
      "gate_value": 0.3849720358848572,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 29290
    },
    {
      "grad_norm": 21.733354568481445,
      "learning_rate": 5.49704792486881e-05,
      "loss": 0.3801,
      "step": 29300
    },
    {
      "gate_value": 0.3850296437740326,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 29300
    },
    {
      "grad_norm": 13.726507186889648,
      "learning_rate": 5.4874562781853356e-05,
      "loss": 0.3572,
      "step": 29310
    },
    {
      "gate_value": 0.3850591778755188,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 29310
    },
    {
      "grad_norm": 5749.76025390625,
      "learning_rate": 5.477871133235791e-05,
      "loss": 0.3633,
      "step": 29320
    },
    {
      "gate_value": 0.38509833812713623,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 29320
    },
    {
      "grad_norm": 772.3057861328125,
      "learning_rate": 5.468292496571545e-05,
      "loss": 0.3667,
      "step": 29330
    },
    {
      "gate_value": 0.38516709208488464,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 29330
    },
    {
      "grad_norm": 20.018020629882812,
      "learning_rate": 5.458720374739493e-05,
      "loss": 0.3831,
      "step": 29340
    },
    {
      "gate_value": 0.38528576493263245,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 29340
    },
    {
      "grad_norm": 13.0118989944458,
      "learning_rate": 5.449154774282096e-05,
      "loss": 0.3817,
      "step": 29350
    },
    {
      "gate_value": 0.38539865612983704,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 29350
    },
    {
      "grad_norm": 18.307941436767578,
      "learning_rate": 5.4395957017373514e-05,
      "loss": 0.3649,
      "step": 29360
    },
    {
      "gate_value": 0.38549789786338806,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 29360
    },
    {
      "grad_norm": 3333.202880859375,
      "learning_rate": 5.430043163638801e-05,
      "loss": 0.3654,
      "step": 29370
    },
    {
      "gate_value": 0.3855592608451843,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 29370
    },
    {
      "grad_norm": 35.40414047241211,
      "learning_rate": 5.420497166515503e-05,
      "loss": 0.3538,
      "step": 29380
    },
    {
      "gate_value": 0.38564735651016235,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 29380
    },
    {
      "grad_norm": 2308.856201171875,
      "learning_rate": 5.410957716892065e-05,
      "loss": 0.3699,
      "step": 29390
    },
    {
      "gate_value": 0.3856613337993622,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 29390
    },
    {
      "grad_norm": 15.066496849060059,
      "learning_rate": 5.4014248212886044e-05,
      "loss": 0.3832,
      "step": 29400
    },
    {
      "gate_value": 0.3857213258743286,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 29400
    },
    {
      "grad_norm": 244.73097229003906,
      "learning_rate": 5.391898486220778e-05,
      "loss": 0.3729,
      "step": 29410
    },
    {
      "gate_value": 0.38576260209083557,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 29410
    },
    {
      "grad_norm": 16.249357223510742,
      "learning_rate": 5.38237871819973e-05,
      "loss": 0.3749,
      "step": 29420
    },
    {
      "gate_value": 0.38585683703422546,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 29420
    },
    {
      "grad_norm": 82.86548614501953,
      "learning_rate": 5.3728655237321443e-05,
      "loss": 0.3643,
      "step": 29430
    },
    {
      "gate_value": 0.38592880964279175,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 29430
    },
    {
      "grad_norm": 18.706005096435547,
      "learning_rate": 5.3633589093201906e-05,
      "loss": 0.3716,
      "step": 29440
    },
    {
      "gate_value": 0.3860245943069458,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 29440
    },
    {
      "grad_norm": 14.105803489685059,
      "learning_rate": 5.353858881461555e-05,
      "loss": 0.3493,
      "step": 29450
    },
    {
      "gate_value": 0.3861307203769684,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 29450
    },
    {
      "grad_norm": 53.31535720825195,
      "learning_rate": 5.344365446649414e-05,
      "loss": 0.383,
      "step": 29460
    },
    {
      "gate_value": 0.38630276918411255,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 29460
    },
    {
      "grad_norm": 17.413074493408203,
      "learning_rate": 5.33487861137245e-05,
      "loss": 0.359,
      "step": 29470
    },
    {
      "gate_value": 0.3865601420402527,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 29470
    },
    {
      "grad_norm": 262.77484130859375,
      "learning_rate": 5.3253983821148124e-05,
      "loss": 0.3752,
      "step": 29480
    },
    {
      "gate_value": 0.3866910934448242,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 29480
    },
    {
      "grad_norm": 16.89266586303711,
      "learning_rate": 5.3159247653561555e-05,
      "loss": 0.3593,
      "step": 29490
    },
    {
      "gate_value": 0.3867685794830322,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 29490
    },
    {
      "grad_norm": 14.136067390441895,
      "learning_rate": 5.30645776757161e-05,
      "loss": 0.3647,
      "step": 29500
    },
    {
      "gate_value": 0.3868660032749176,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 29500
    },
    {
      "grad_norm": 14.746859550476074,
      "learning_rate": 5.2969973952317715e-05,
      "loss": 0.3463,
      "step": 29510
    },
    {
      "gate_value": 0.38711073994636536,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 29510
    },
    {
      "grad_norm": 1038.1717529296875,
      "learning_rate": 5.28754365480272e-05,
      "loss": 0.3662,
      "step": 29520
    },
    {
      "gate_value": 0.3872504234313965,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 29520
    },
    {
      "grad_norm": 6.718716621398926,
      "learning_rate": 5.278096552746001e-05,
      "loss": 0.3723,
      "step": 29530
    },
    {
      "gate_value": 0.3872128129005432,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 29530
    },
    {
      "grad_norm": 27.14375877380371,
      "learning_rate": 5.268656095518613e-05,
      "loss": 0.3776,
      "step": 29540
    },
    {
      "gate_value": 0.3871647119522095,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 29540
    },
    {
      "grad_norm": 24.749032974243164,
      "learning_rate": 5.25922228957302e-05,
      "loss": 0.3672,
      "step": 29550
    },
    {
      "gate_value": 0.3872618079185486,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 29550
    },
    {
      "grad_norm": 21.76228141784668,
      "learning_rate": 5.249795141357145e-05,
      "loss": 0.371,
      "step": 29560
    },
    {
      "gate_value": 0.3873235583305359,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 29560
    },
    {
      "grad_norm": 45.05788803100586,
      "learning_rate": 5.240374657314354e-05,
      "loss": 0.3514,
      "step": 29570
    },
    {
      "gate_value": 0.387417197227478,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 29570
    },
    {
      "grad_norm": 19.219005584716797,
      "learning_rate": 5.2309608438834536e-05,
      "loss": 0.3775,
      "step": 29580
    },
    {
      "gate_value": 0.3874436318874359,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 29580
    },
    {
      "grad_norm": 12.430703163146973,
      "learning_rate": 5.221553707498706e-05,
      "loss": 0.3608,
      "step": 29590
    },
    {
      "gate_value": 0.38745900988578796,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 29590
    },
    {
      "grad_norm": 3910.81787109375,
      "learning_rate": 5.212153254589787e-05,
      "loss": 0.3574,
      "step": 29600
    },
    {
      "gate_value": 0.38752108812332153,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 29600
    },
    {
      "grad_norm": 63.541481018066406,
      "learning_rate": 5.2027594915818263e-05,
      "loss": 0.3794,
      "step": 29610
    },
    {
      "gate_value": 0.3875715732574463,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 29610
    },
    {
      "grad_norm": 47.07277297973633,
      "learning_rate": 5.193372424895368e-05,
      "loss": 0.367,
      "step": 29620
    },
    {
      "gate_value": 0.38757985830307007,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 29620
    },
    {
      "grad_norm": 12.926732063293457,
      "learning_rate": 5.1839920609463936e-05,
      "loss": 0.3679,
      "step": 29630
    },
    {
      "gate_value": 0.38763460516929626,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 29630
    },
    {
      "grad_norm": 28.26099395751953,
      "learning_rate": 5.174618406146282e-05,
      "loss": 0.3602,
      "step": 29640
    },
    {
      "gate_value": 0.387708455324173,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 29640
    },
    {
      "grad_norm": 43.7656364440918,
      "learning_rate": 5.16525146690184e-05,
      "loss": 0.3784,
      "step": 29650
    },
    {
      "gate_value": 0.38778313994407654,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 29650
    },
    {
      "grad_norm": 60.07194900512695,
      "learning_rate": 5.1558912496152854e-05,
      "loss": 0.349,
      "step": 29660
    },
    {
      "gate_value": 0.3878571689128876,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 29660
    },
    {
      "grad_norm": 122.97605895996094,
      "learning_rate": 5.146537760684242e-05,
      "loss": 0.3726,
      "step": 29670
    },
    {
      "gate_value": 0.387951523065567,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 29670
    },
    {
      "grad_norm": 20.661191940307617,
      "learning_rate": 5.13719100650172e-05,
      "loss": 0.3778,
      "step": 29680
    },
    {
      "gate_value": 0.388042151927948,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 29680
    },
    {
      "grad_norm": 25.058385848999023,
      "learning_rate": 5.127850993456151e-05,
      "loss": 0.3575,
      "step": 29690
    },
    {
      "gate_value": 0.38809892535209656,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 29690
    },
    {
      "grad_norm": 23.43122100830078,
      "learning_rate": 5.118517727931333e-05,
      "loss": 0.3552,
      "step": 29700
    },
    {
      "gate_value": 0.388154000043869,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 29700
    },
    {
      "grad_norm": 387.5428161621094,
      "learning_rate": 5.1091912163064736e-05,
      "loss": 0.3781,
      "step": 29710
    },
    {
      "gate_value": 0.3881308436393738,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 29710
    },
    {
      "grad_norm": 105.68052673339844,
      "learning_rate": 5.099871464956151e-05,
      "loss": 0.3556,
      "step": 29720
    },
    {
      "gate_value": 0.3881755471229553,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 29720
    },
    {
      "grad_norm": 24.62873077392578,
      "learning_rate": 5.090558480250336e-05,
      "loss": 0.3699,
      "step": 29730
    },
    {
      "gate_value": 0.3882880210876465,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 29730
    },
    {
      "grad_norm": 454.452880859375,
      "learning_rate": 5.081252268554352e-05,
      "loss": 0.3692,
      "step": 29740
    },
    {
      "gate_value": 0.38832375407218933,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 29740
    },
    {
      "grad_norm": 42.048065185546875,
      "learning_rate": 5.0719528362289156e-05,
      "loss": 0.3536,
      "step": 29750
    },
    {
      "gate_value": 0.38841527700424194,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 29750
    },
    {
      "grad_norm": 140.90931701660156,
      "learning_rate": 5.062660189630101e-05,
      "loss": 0.3695,
      "step": 29760
    },
    {
      "gate_value": 0.3884676992893219,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 29760
    },
    {
      "grad_norm": 65.93880462646484,
      "learning_rate": 5.053374335109346e-05,
      "loss": 0.3732,
      "step": 29770
    },
    {
      "gate_value": 0.38848721981048584,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 29770
    },
    {
      "grad_norm": 823.0439453125,
      "learning_rate": 5.0440952790134426e-05,
      "loss": 0.3649,
      "step": 29780
    },
    {
      "gate_value": 0.3884487748146057,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 29780
    },
    {
      "grad_norm": 339.5392761230469,
      "learning_rate": 5.034823027684533e-05,
      "loss": 0.346,
      "step": 29790
    },
    {
      "gate_value": 0.388444185256958,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 29790
    },
    {
      "grad_norm": 92.40780639648438,
      "learning_rate": 5.025557587460118e-05,
      "loss": 0.3621,
      "step": 29800
    },
    {
      "gate_value": 0.38856759667396545,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 29800
    },
    {
      "grad_norm": 23.564212799072266,
      "learning_rate": 5.016298964673038e-05,
      "loss": 0.366,
      "step": 29810
    },
    {
      "gate_value": 0.38881248235702515,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 29810
    },
    {
      "grad_norm": 30.643938064575195,
      "learning_rate": 5.007047165651474e-05,
      "loss": 0.3718,
      "step": 29820
    },
    {
      "gate_value": 0.38913866877555847,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 29820
    },
    {
      "grad_norm": 2424.154052734375,
      "learning_rate": 4.997802196718951e-05,
      "loss": 0.3672,
      "step": 29830
    },
    {
      "gate_value": 0.3892226219177246,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 29830
    },
    {
      "grad_norm": 80.18303680419922,
      "learning_rate": 4.988564064194306e-05,
      "loss": 0.3458,
      "step": 29840
    },
    {
      "gate_value": 0.3894360065460205,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 29840
    },
    {
      "grad_norm": 21.752769470214844,
      "learning_rate": 4.979332774391721e-05,
      "loss": 0.367,
      "step": 29850
    },
    {
      "gate_value": 0.3895873725414276,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 29850
    },
    {
      "grad_norm": 19.119279861450195,
      "learning_rate": 4.970108333620696e-05,
      "loss": 0.37,
      "step": 29860
    },
    {
      "gate_value": 0.3896397650241852,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 29860
    },
    {
      "grad_norm": 20.726579666137695,
      "learning_rate": 4.960890748186052e-05,
      "loss": 0.3599,
      "step": 29870
    },
    {
      "gate_value": 0.3897121250629425,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 29870
    },
    {
      "grad_norm": 24.870521545410156,
      "learning_rate": 4.95168002438792e-05,
      "loss": 0.3601,
      "step": 29880
    },
    {
      "gate_value": 0.3897797763347626,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 29880
    },
    {
      "grad_norm": 73.91206359863281,
      "learning_rate": 4.9424761685217353e-05,
      "loss": 0.3679,
      "step": 29890
    },
    {
      "gate_value": 0.38991779088974,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 29890
    },
    {
      "grad_norm": 52.439510345458984,
      "learning_rate": 4.933279186878255e-05,
      "loss": 0.3735,
      "step": 29900
    },
    {
      "gate_value": 0.3900202810764313,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 29900
    },
    {
      "grad_norm": 47.01316833496094,
      "learning_rate": 4.924089085743524e-05,
      "loss": 0.3725,
      "step": 29910
    },
    {
      "gate_value": 0.3900734782218933,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 29910
    },
    {
      "grad_norm": 90.96566009521484,
      "learning_rate": 4.9149058713988945e-05,
      "loss": 0.3616,
      "step": 29920
    },
    {
      "gate_value": 0.3901205062866211,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 29920
    },
    {
      "grad_norm": 8.128710746765137,
      "learning_rate": 4.9057295501210105e-05,
      "loss": 0.3827,
      "step": 29930
    },
    {
      "gate_value": 0.39021652936935425,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 29930
    },
    {
      "grad_norm": 208.60455322265625,
      "learning_rate": 4.8965601281817884e-05,
      "loss": 0.3523,
      "step": 29940
    },
    {
      "gate_value": 0.39037322998046875,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 29940
    },
    {
      "grad_norm": 47.41590118408203,
      "learning_rate": 4.88739761184845e-05,
      "loss": 0.3634,
      "step": 29950
    },
    {
      "gate_value": 0.39055919647216797,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 29950
    },
    {
      "grad_norm": 32.718605041503906,
      "learning_rate": 4.87824200738349e-05,
      "loss": 0.3642,
      "step": 29960
    },
    {
      "gate_value": 0.39067938923835754,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 29960
    },
    {
      "grad_norm": 32.24725341796875,
      "learning_rate": 4.869093321044678e-05,
      "loss": 0.3687,
      "step": 29970
    },
    {
      "gate_value": 0.3905954658985138,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 29970
    },
    {
      "grad_norm": 27.35053062438965,
      "learning_rate": 4.859951559085053e-05,
      "loss": 0.3506,
      "step": 29980
    },
    {
      "gate_value": 0.3906969130039215,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 29980
    },
    {
      "grad_norm": 18.87845230102539,
      "learning_rate": 4.850816727752917e-05,
      "loss": 0.3724,
      "step": 29990
    },
    {
      "gate_value": 0.3908305764198303,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 29990
    },
    {
      "grad_norm": 13.033040046691895,
      "learning_rate": 4.8416888332918474e-05,
      "loss": 0.3616,
      "step": 30000
    },
    {
      "gate_value": 0.3909852206707001,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 30000
    },
    {
      "grad_norm": 10490.21875,
      "learning_rate": 4.832567881940672e-05,
      "loss": 0.3711,
      "step": 30010
    },
    {
      "gate_value": 0.3910848796367645,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 30010
    },
    {
      "grad_norm": 153.7618408203125,
      "learning_rate": 4.823453879933477e-05,
      "loss": 0.3687,
      "step": 30020
    },
    {
      "gate_value": 0.3911765217781067,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 30020
    },
    {
      "grad_norm": 28.18154525756836,
      "learning_rate": 4.814346833499601e-05,
      "loss": 0.3604,
      "step": 30030
    },
    {
      "gate_value": 0.39127495884895325,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 30030
    },
    {
      "grad_norm": 16.653223037719727,
      "learning_rate": 4.8052467488636134e-05,
      "loss": 0.3747,
      "step": 30040
    },
    {
      "gate_value": 0.3913731276988983,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 30040
    },
    {
      "grad_norm": 17.24098777770996,
      "learning_rate": 4.796153632245343e-05,
      "loss": 0.3581,
      "step": 30050
    },
    {
      "gate_value": 0.39150476455688477,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 30050
    },
    {
      "grad_norm": 16.153825759887695,
      "learning_rate": 4.787067489859854e-05,
      "loss": 0.3816,
      "step": 30060
    },
    {
      "gate_value": 0.39168140292167664,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 30060
    },
    {
      "grad_norm": 22.805763244628906,
      "learning_rate": 4.777988327917427e-05,
      "loss": 0.3484,
      "step": 30070
    },
    {
      "gate_value": 0.39185601472854614,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 30070
    },
    {
      "grad_norm": 14.978759765625,
      "learning_rate": 4.768916152623595e-05,
      "loss": 0.3715,
      "step": 30080
    },
    {
      "gate_value": 0.39189791679382324,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 30080
    },
    {
      "grad_norm": 40.283058166503906,
      "learning_rate": 4.759850970179096e-05,
      "loss": 0.3555,
      "step": 30090
    },
    {
      "gate_value": 0.3919513523578644,
      "icl_sequence_length": 50,
      "num_contexts": 3,
      "step": 30090
    },
    {
      "grad_norm": 63.80585861206055,
      "learning_rate": 4.7507927867799004e-05,
      "loss": 0.3506,
      "step": 30100
    },
    {
      "gate_value": 0.3920625150203705,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 30100
    },
    {
      "grad_norm": 11.960375785827637,
      "learning_rate": 4.741741608617188e-05,
      "loss": 0.3532,
      "step": 30110
    },
    {
      "gate_value": 0.3922727406024933,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 30110
    },
    {
      "grad_norm": 56.46166229248047,
      "learning_rate": 4.732697441877359e-05,
      "loss": 0.3511,
      "step": 30120
    },
    {
      "gate_value": 0.39238861203193665,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 30120
    },
    {
      "grad_norm": 179.3865203857422,
      "learning_rate": 4.723660292742017e-05,
      "loss": 0.3617,
      "step": 30130
    },
    {
      "gate_value": 0.3924447000026703,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 30130
    },
    {
      "grad_norm": 44.34315490722656,
      "learning_rate": 4.7146301673879615e-05,
      "loss": 0.3425,
      "step": 30140
    },
    {
      "gate_value": 0.3926564157009125,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 30140
    },
    {
      "grad_norm": 237.02188110351562,
      "learning_rate": 4.705607071987204e-05,
      "loss": 0.3739,
      "step": 30150
    },
    {
      "gate_value": 0.392910361289978,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 30150
    },
    {
      "grad_norm": 21.830909729003906,
      "learning_rate": 4.6965910127069394e-05,
      "loss": 0.3441,
      "step": 30160
    },
    {
      "gate_value": 0.39306795597076416,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 30160
    },
    {
      "grad_norm": 32.624427795410156,
      "learning_rate": 4.687581995709562e-05,
      "loss": 0.3646,
      "step": 30170
    },
    {
      "gate_value": 0.3931129276752472,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 30170
    },
    {
      "grad_norm": 17.580211639404297,
      "learning_rate": 4.678580027152655e-05,
      "loss": 0.3533,
      "step": 30180
    },
    {
      "gate_value": 0.3931795060634613,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 30180
    },
    {
      "grad_norm": 84.48542022705078,
      "learning_rate": 4.66958511318897e-05,
      "loss": 0.3736,
      "step": 30190
    },
    {
      "gate_value": 0.3933153450489044,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 30190
    },
    {
      "grad_norm": 23.2664794921875,
      "learning_rate": 4.660597259966448e-05,
      "loss": 0.3454,
      "step": 30200
    },
    {
      "gate_value": 0.3933710753917694,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 30200
    },
    {
      "grad_norm": 28.477397918701172,
      "learning_rate": 4.6516164736282056e-05,
      "loss": 0.3743,
      "step": 30210
    },
    {
      "gate_value": 0.3934413492679596,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 30210
    },
    {
      "grad_norm": 128.1862335205078,
      "learning_rate": 4.642642760312524e-05,
      "loss": 0.3719,
      "step": 30220
    },
    {
      "gate_value": 0.39349788427352905,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 30220
    },
    {
      "grad_norm": 14.590642929077148,
      "learning_rate": 4.633676126152858e-05,
      "loss": 0.3627,
      "step": 30230
    },
    {
      "gate_value": 0.39354342222213745,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 30230
    },
    {
      "grad_norm": 18.493053436279297,
      "learning_rate": 4.624716577277803e-05,
      "loss": 0.3571,
      "step": 30240
    },
    {
      "gate_value": 0.3936373293399811,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 30240
    },
    {
      "grad_norm": 24.06255531311035,
      "learning_rate": 4.615764119811141e-05,
      "loss": 0.3757,
      "step": 30250
    },
    {
      "gate_value": 0.3937382102012634,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 30250
    },
    {
      "grad_norm": 30.321077346801758,
      "learning_rate": 4.606818759871782e-05,
      "loss": 0.3617,
      "step": 30260
    },
    {
      "gate_value": 0.3939821124076843,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 30260
    },
    {
      "grad_norm": 24.308691024780273,
      "learning_rate": 4.597880503573797e-05,
      "loss": 0.3588,
      "step": 30270
    },
    {
      "gate_value": 0.3941650092601776,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 30270
    },
    {
      "grad_norm": 12.186562538146973,
      "learning_rate": 4.5889493570264074e-05,
      "loss": 0.3683,
      "step": 30280
    },
    {
      "gate_value": 0.39424392580986023,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 30280
    },
    {
      "grad_norm": 18.442607879638672,
      "learning_rate": 4.580025326333956e-05,
      "loss": 0.3561,
      "step": 30290
    },
    {
      "gate_value": 0.3942907452583313,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 30290
    },
    {
      "grad_norm": 28.805355072021484,
      "learning_rate": 4.571108417595942e-05,
      "loss": 0.3622,
      "step": 30300
    },
    {
      "gate_value": 0.39430415630340576,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 30300
    },
    {
      "grad_norm": 12.717275619506836,
      "learning_rate": 4.562198636906983e-05,
      "loss": 0.3833,
      "step": 30310
    },
    {
      "gate_value": 0.3942856788635254,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 30310
    },
    {
      "grad_norm": 15.916489601135254,
      "learning_rate": 4.553295990356836e-05,
      "loss": 0.3707,
      "step": 30320
    },
    {
      "gate_value": 0.39437082409858704,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 30320
    },
    {
      "grad_norm": 889.87060546875,
      "learning_rate": 4.5444004840303757e-05,
      "loss": 0.3611,
      "step": 30330
    },
    {
      "gate_value": 0.3944534361362457,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 30330
    },
    {
      "grad_norm": 38.44389343261719,
      "learning_rate": 4.5355121240075944e-05,
      "loss": 0.3608,
      "step": 30340
    },
    {
      "gate_value": 0.3944818675518036,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 30340
    },
    {
      "grad_norm": 26.342105865478516,
      "learning_rate": 4.526630916363597e-05,
      "loss": 0.3827,
      "step": 30350
    },
    {
      "gate_value": 0.3946109116077423,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 30350
    },
    {
      "grad_norm": 99.67547607421875,
      "learning_rate": 4.517756867168612e-05,
      "loss": 0.3616,
      "step": 30360
    },
    {
      "gate_value": 0.3946753144264221,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 30360
    },
    {
      "grad_norm": 134.35885620117188,
      "learning_rate": 4.508889982487965e-05,
      "loss": 0.3555,
      "step": 30370
    },
    {
      "gate_value": 0.3948347568511963,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 30370
    },
    {
      "grad_norm": 25.946739196777344,
      "learning_rate": 4.500030268382096e-05,
      "loss": 0.3771,
      "step": 30380
    },
    {
      "gate_value": 0.3949762284755707,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 30380
    },
    {
      "grad_norm": 23.79796028137207,
      "learning_rate": 4.4911777309065236e-05,
      "loss": 0.3443,
      "step": 30390
    },
    {
      "gate_value": 0.3950321674346924,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 30390
    },
    {
      "grad_norm": 26.93970489501953,
      "learning_rate": 4.4823323761118807e-05,
      "loss": 0.3567,
      "step": 30400
    },
    {
      "gate_value": 0.39504531025886536,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 30400
    },
    {
      "grad_norm": 33.7324104309082,
      "learning_rate": 4.4734942100438835e-05,
      "loss": 0.347,
      "step": 30410
    },
    {
      "gate_value": 0.39515039324760437,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 30410
    },
    {
      "grad_norm": 35.20787811279297,
      "learning_rate": 4.464663238743333e-05,
      "loss": 0.3602,
      "step": 30420
    },
    {
      "gate_value": 0.3952362537384033,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 30420
    },
    {
      "grad_norm": 18.598276138305664,
      "learning_rate": 4.4558394682461236e-05,
      "loss": 0.3505,
      "step": 30430
    },
    {
      "gate_value": 0.39534491300582886,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 30430
    },
    {
      "grad_norm": 17.984878540039062,
      "learning_rate": 4.44702290458321e-05,
      "loss": 0.3656,
      "step": 30440
    },
    {
      "gate_value": 0.3954671025276184,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 30440
    },
    {
      "grad_norm": 18.157207489013672,
      "learning_rate": 4.438213553780628e-05,
      "loss": 0.3556,
      "step": 30450
    },
    {
      "gate_value": 0.3956170380115509,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 30450
    },
    {
      "grad_norm": 2112.7578125,
      "learning_rate": 4.429411421859492e-05,
      "loss": 0.384,
      "step": 30460
    },
    {
      "gate_value": 0.39561453461647034,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 30460
    },
    {
      "grad_norm": 30.692434310913086,
      "learning_rate": 4.420616514835973e-05,
      "loss": 0.3607,
      "step": 30470
    },
    {
      "gate_value": 0.39567017555236816,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 30470
    },
    {
      "grad_norm": 115.94886016845703,
      "learning_rate": 4.411828838721313e-05,
      "loss": 0.3506,
      "step": 30480
    },
    {
      "gate_value": 0.3958011567592621,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 30480
    },
    {
      "grad_norm": 60.980934143066406,
      "learning_rate": 4.403048399521798e-05,
      "loss": 0.3457,
      "step": 30490
    },
    {
      "gate_value": 0.3959410488605499,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 30490
    },
    {
      "grad_norm": 22.16244125366211,
      "learning_rate": 4.394275203238778e-05,
      "loss": 0.3448,
      "step": 30500
    },
    {
      "gate_value": 0.39600443840026855,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 30500
    },
    {
      "grad_norm": 32.711326599121094,
      "learning_rate": 4.38550925586865e-05,
      "loss": 0.3605,
      "step": 30510
    },
    {
      "gate_value": 0.3960520625114441,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 30510
    },
    {
      "grad_norm": 28.291837692260742,
      "learning_rate": 4.3767505634028614e-05,
      "loss": 0.3721,
      "step": 30520
    },
    {
      "gate_value": 0.39630264043807983,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 30520
    },
    {
      "grad_norm": 199.38694763183594,
      "learning_rate": 4.3679991318278875e-05,
      "loss": 0.3589,
      "step": 30530
    },
    {
      "gate_value": 0.39648500084877014,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 30530
    },
    {
      "grad_norm": 33.74460220336914,
      "learning_rate": 4.3592549671252584e-05,
      "loss": 0.3695,
      "step": 30540
    },
    {
      "gate_value": 0.3966231048107147,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 30540
    },
    {
      "grad_norm": 24.554481506347656,
      "learning_rate": 4.350518075271518e-05,
      "loss": 0.3546,
      "step": 30550
    },
    {
      "gate_value": 0.39659935235977173,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 30550
    },
    {
      "grad_norm": 100.24298095703125,
      "learning_rate": 4.3417884622382536e-05,
      "loss": 0.355,
      "step": 30560
    },
    {
      "gate_value": 0.3965737521648407,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 30560
    },
    {
      "grad_norm": 68.57526397705078,
      "learning_rate": 4.333066133992075e-05,
      "loss": 0.3643,
      "step": 30570
    },
    {
      "gate_value": 0.3965604901313782,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 30570
    },
    {
      "grad_norm": 14.827048301696777,
      "learning_rate": 4.32435109649461e-05,
      "loss": 0.3357,
      "step": 30580
    },
    {
      "gate_value": 0.39664265513420105,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 30580
    },
    {
      "grad_norm": 42.24051284790039,
      "learning_rate": 4.315643355702511e-05,
      "loss": 0.3607,
      "step": 30590
    },
    {
      "gate_value": 0.39675700664520264,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 30590
    },
    {
      "grad_norm": 38.62068176269531,
      "learning_rate": 4.306942917567426e-05,
      "loss": 0.3586,
      "step": 30600
    },
    {
      "gate_value": 0.39687106013298035,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 30600
    },
    {
      "grad_norm": 38.699825286865234,
      "learning_rate": 4.298249788036026e-05,
      "loss": 0.3761,
      "step": 30610
    },
    {
      "gate_value": 0.3969728648662567,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 30610
    },
    {
      "grad_norm": 17.548847198486328,
      "learning_rate": 4.2895639730499906e-05,
      "loss": 0.3703,
      "step": 30620
    },
    {
      "gate_value": 0.3970944881439209,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 30620
    },
    {
      "grad_norm": 17.83329200744629,
      "learning_rate": 4.2808854785459815e-05,
      "loss": 0.363,
      "step": 30630
    },
    {
      "gate_value": 0.3971506357192993,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 30630
    },
    {
      "grad_norm": 18.071189880371094,
      "learning_rate": 4.272214310455677e-05,
      "loss": 0.3632,
      "step": 30640
    },
    {
      "gate_value": 0.39719945192337036,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 30640
    },
    {
      "grad_norm": 34.34341812133789,
      "learning_rate": 4.2635504747057296e-05,
      "loss": 0.3485,
      "step": 30650
    },
    {
      "gate_value": 0.3972983956336975,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 30650
    },
    {
      "grad_norm": 25.537384033203125,
      "learning_rate": 4.254893977217794e-05,
      "loss": 0.3661,
      "step": 30660
    },
    {
      "gate_value": 0.39736828207969666,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 30660
    },
    {
      "grad_norm": 17.409698486328125,
      "learning_rate": 4.2462448239085044e-05,
      "loss": 0.3478,
      "step": 30670
    },
    {
      "gate_value": 0.39749452471733093,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 30670
    },
    {
      "grad_norm": 37.236331939697266,
      "learning_rate": 4.237603020689477e-05,
      "loss": 0.362,
      "step": 30680
    },
    {
      "gate_value": 0.39765825867652893,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 30680
    },
    {
      "grad_norm": 286.671875,
      "learning_rate": 4.228968573467306e-05,
      "loss": 0.356,
      "step": 30690
    },
    {
      "gate_value": 0.39775484800338745,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 30690
    },
    {
      "grad_norm": 39.26069259643555,
      "learning_rate": 4.2203414881435436e-05,
      "loss": 0.3715,
      "step": 30700
    },
    {
      "gate_value": 0.3978050947189331,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 30700
    },
    {
      "grad_norm": 3089.90478515625,
      "learning_rate": 4.211721770614734e-05,
      "loss": 0.3565,
      "step": 30710
    },
    {
      "gate_value": 0.3978253901004791,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 30710
    },
    {
      "grad_norm": 5112.45068359375,
      "learning_rate": 4.203109426772363e-05,
      "loss": 0.3661,
      "step": 30720
    },
    {
      "gate_value": 0.3978503346443176,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 30720
    },
    {
      "grad_norm": 84.64717102050781,
      "learning_rate": 4.19450446250289e-05,
      "loss": 0.3648,
      "step": 30730
    },
    {
      "gate_value": 0.3978942334651947,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 30730
    },
    {
      "grad_norm": 39.20880889892578,
      "learning_rate": 4.1859068836877306e-05,
      "loss": 0.3401,
      "step": 30740
    },
    {
      "gate_value": 0.39794957637786865,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 30740
    },
    {
      "grad_norm": 184.52806091308594,
      "learning_rate": 4.177316696203241e-05,
      "loss": 0.345,
      "step": 30750
    },
    {
      "gate_value": 0.3979901671409607,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 30750
    },
    {
      "grad_norm": 58.26884841918945,
      "learning_rate": 4.168733905920739e-05,
      "loss": 0.3537,
      "step": 30760
    },
    {
      "gate_value": 0.3980367183685303,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 30760
    },
    {
      "grad_norm": 43.49728775024414,
      "learning_rate": 4.160158518706479e-05,
      "loss": 0.35,
      "step": 30770
    },
    {
      "gate_value": 0.3981196880340576,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 30770
    },
    {
      "grad_norm": 3105.703857421875,
      "learning_rate": 4.151590540421657e-05,
      "loss": 0.3687,
      "step": 30780
    },
    {
      "gate_value": 0.39822056889533997,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 30780
    },
    {
      "grad_norm": 43.1259765625,
      "learning_rate": 4.143029976922411e-05,
      "loss": 0.3579,
      "step": 30790
    },
    {
      "gate_value": 0.39836224913597107,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 30790
    },
    {
      "grad_norm": 136.4414520263672,
      "learning_rate": 4.134476834059801e-05,
      "loss": 0.3663,
      "step": 30800
    },
    {
      "gate_value": 0.39839082956314087,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 30800
    },
    {
      "grad_norm": 31.794607162475586,
      "learning_rate": 4.1259311176798155e-05,
      "loss": 0.3516,
      "step": 30810
    },
    {
      "gate_value": 0.39832615852355957,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 30810
    },
    {
      "grad_norm": 15.481108665466309,
      "learning_rate": 4.117392833623373e-05,
      "loss": 0.3661,
      "step": 30820
    },
    {
      "gate_value": 0.39843201637268066,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 30820
    },
    {
      "grad_norm": 28.874069213867188,
      "learning_rate": 4.108861987726312e-05,
      "loss": 0.3641,
      "step": 30830
    },
    {
      "gate_value": 0.39849528670310974,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 30830
    },
    {
      "grad_norm": 7868.61328125,
      "learning_rate": 4.100338585819391e-05,
      "loss": 0.3589,
      "step": 30840
    },
    {
      "gate_value": 0.3985292613506317,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 30840
    },
    {
      "grad_norm": 92.6623306274414,
      "learning_rate": 4.091822633728264e-05,
      "loss": 0.3513,
      "step": 30850
    },
    {
      "gate_value": 0.39864015579223633,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 30850
    },
    {
      "grad_norm": 68.68574523925781,
      "learning_rate": 4.0833141372735086e-05,
      "loss": 0.3591,
      "step": 30860
    },
    {
      "gate_value": 0.3987644910812378,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 30860
    },
    {
      "grad_norm": 8631.802734375,
      "learning_rate": 4.074813102270603e-05,
      "loss": 0.3598,
      "step": 30870
    },
    {
      "gate_value": 0.3988719880580902,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 30870
    },
    {
      "grad_norm": 22.09979248046875,
      "learning_rate": 4.066319534529922e-05,
      "loss": 0.348,
      "step": 30880
    },
    {
      "gate_value": 0.39896559715270996,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 30880
    },
    {
      "grad_norm": 28.57960319519043,
      "learning_rate": 4.057833439856746e-05,
      "loss": 0.3653,
      "step": 30890
    },
    {
      "gate_value": 0.3991009294986725,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 30890
    },
    {
      "grad_norm": 281.9774475097656,
      "learning_rate": 4.0493548240512355e-05,
      "loss": 0.3667,
      "step": 30900
    },
    {
      "gate_value": 0.39916902780532837,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 30900
    },
    {
      "grad_norm": 19.42442512512207,
      "learning_rate": 4.0408836929084396e-05,
      "loss": 0.3551,
      "step": 30910
    },
    {
      "gate_value": 0.39917629957199097,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 30910
    },
    {
      "grad_norm": 25.34986114501953,
      "learning_rate": 4.032420052218302e-05,
      "loss": 0.3536,
      "step": 30920
    },
    {
      "gate_value": 0.39924490451812744,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 30920
    },
    {
      "grad_norm": 66.68396759033203,
      "learning_rate": 4.02396390776564e-05,
      "loss": 0.3603,
      "step": 30930
    },
    {
      "gate_value": 0.39930960536003113,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 30930
    },
    {
      "grad_norm": 28.13355255126953,
      "learning_rate": 4.015515265330155e-05,
      "loss": 0.3505,
      "step": 30940
    },
    {
      "gate_value": 0.39939790964126587,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 30940
    },
    {
      "grad_norm": 36.40007019042969,
      "learning_rate": 4.0070741306864026e-05,
      "loss": 0.3728,
      "step": 30950
    },
    {
      "gate_value": 0.399484783411026,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 30950
    },
    {
      "grad_norm": 15.841800689697266,
      "learning_rate": 3.998640509603824e-05,
      "loss": 0.3561,
      "step": 30960
    },
    {
      "gate_value": 0.39953693747520447,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 30960
    },
    {
      "grad_norm": 78.29736328125,
      "learning_rate": 3.9902144078467234e-05,
      "loss": 0.3626,
      "step": 30970
    },
    {
      "gate_value": 0.39955389499664307,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 30970
    },
    {
      "grad_norm": 44.35389709472656,
      "learning_rate": 3.9817958311742564e-05,
      "loss": 0.3639,
      "step": 30980
    },
    {
      "gate_value": 0.39958593249320984,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 30980
    },
    {
      "grad_norm": 54.38600158691406,
      "learning_rate": 3.973384785340449e-05,
      "loss": 0.3697,
      "step": 30990
    },
    {
      "gate_value": 0.3996659517288208,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 30990
    },
    {
      "grad_norm": 23.38623046875,
      "learning_rate": 3.964981276094165e-05,
      "loss": 0.3511,
      "step": 31000
    },
    {
      "gate_value": 0.3997676968574524,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 31000
    },
    {
      "grad_norm": 22.269174575805664,
      "learning_rate": 3.956585309179121e-05,
      "loss": 0.3786,
      "step": 31010
    },
    {
      "gate_value": 0.3999699652194977,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 31010
    },
    {
      "grad_norm": 22.649744033813477,
      "learning_rate": 3.9481968903338864e-05,
      "loss": 0.3428,
      "step": 31020
    },
    {
      "gate_value": 0.40010973811149597,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 31020
    },
    {
      "grad_norm": 210.9306640625,
      "learning_rate": 3.9398160252918626e-05,
      "loss": 0.3639,
      "step": 31030
    },
    {
      "gate_value": 0.40025028586387634,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 31030
    },
    {
      "grad_norm": 59.51335525512695,
      "learning_rate": 3.9314427197812996e-05,
      "loss": 0.3542,
      "step": 31040
    },
    {
      "gate_value": 0.4003356099128723,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 31040
    },
    {
      "grad_norm": 1145.820068359375,
      "learning_rate": 3.923076979525263e-05,
      "loss": 0.372,
      "step": 31050
    },
    {
      "gate_value": 0.400397390127182,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 31050
    },
    {
      "grad_norm": 227.70965576171875,
      "learning_rate": 3.914718810241662e-05,
      "loss": 0.3483,
      "step": 31060
    },
    {
      "gate_value": 0.4004197418689728,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 31060
    },
    {
      "grad_norm": 246.35362243652344,
      "learning_rate": 3.906368217643227e-05,
      "loss": 0.3558,
      "step": 31070
    },
    {
      "gate_value": 0.4004781246185303,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 31070
    },
    {
      "grad_norm": 75.53336334228516,
      "learning_rate": 3.898025207437511e-05,
      "loss": 0.3559,
      "step": 31080
    },
    {
      "gate_value": 0.4005269706249237,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 31080
    },
    {
      "grad_norm": 45.185482025146484,
      "learning_rate": 3.8896897853268765e-05,
      "loss": 0.3696,
      "step": 31090
    },
    {
      "gate_value": 0.4005460739135742,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 31090
    },
    {
      "grad_norm": 61.928192138671875,
      "learning_rate": 3.881361957008516e-05,
      "loss": 0.3681,
      "step": 31100
    },
    {
      "gate_value": 0.40058523416519165,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 31100
    },
    {
      "grad_norm": 1029.142333984375,
      "learning_rate": 3.873041728174409e-05,
      "loss": 0.3501,
      "step": 31110
    },
    {
      "gate_value": 0.40063005685806274,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 31110
    },
    {
      "grad_norm": 40.88732147216797,
      "learning_rate": 3.864729104511361e-05,
      "loss": 0.3459,
      "step": 31120
    },
    {
      "gate_value": 0.4006367623806,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 31120
    },
    {
      "grad_norm": 395.06768798828125,
      "learning_rate": 3.8564240917009695e-05,
      "loss": 0.3655,
      "step": 31130
    },
    {
      "gate_value": 0.4006264805793762,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 31130
    },
    {
      "grad_norm": 76.37919616699219,
      "learning_rate": 3.848126695419639e-05,
      "loss": 0.3652,
      "step": 31140
    },
    {
      "gate_value": 0.40067169070243835,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 31140
    },
    {
      "grad_norm": 153.97393798828125,
      "learning_rate": 3.839836921338551e-05,
      "loss": 0.3446,
      "step": 31150
    },
    {
      "gate_value": 0.4007301330566406,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 31150
    },
    {
      "grad_norm": 96.89092254638672,
      "learning_rate": 3.831554775123694e-05,
      "loss": 0.3672,
      "step": 31160
    },
    {
      "gate_value": 0.4007788598537445,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 31160
    },
    {
      "grad_norm": 60.73162078857422,
      "learning_rate": 3.823280262435837e-05,
      "loss": 0.36,
      "step": 31170
    },
    {
      "gate_value": 0.4008052945137024,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 31170
    },
    {
      "grad_norm": 160.602294921875,
      "learning_rate": 3.8150133889305336e-05,
      "loss": 0.3401,
      "step": 31180
    },
    {
      "gate_value": 0.4008178412914276,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 31180
    },
    {
      "grad_norm": 91.08993530273438,
      "learning_rate": 3.806754160258106e-05,
      "loss": 0.3578,
      "step": 31190
    },
    {
      "gate_value": 0.40085065364837646,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 31190
    },
    {
      "grad_norm": 229.18531799316406,
      "learning_rate": 3.798502582063669e-05,
      "loss": 0.3692,
      "step": 31200
    },
    {
      "gate_value": 0.4008884131908417,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 31200
    },
    {
      "grad_norm": 37.98585510253906,
      "learning_rate": 3.7902586599870895e-05,
      "loss": 0.3507,
      "step": 31210
    },
    {
      "gate_value": 0.4009264409542084,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 31210
    },
    {
      "grad_norm": 19.111576080322266,
      "learning_rate": 3.782022399663014e-05,
      "loss": 0.3529,
      "step": 31220
    },
    {
      "gate_value": 0.40098142623901367,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 31220
    },
    {
      "grad_norm": 1276.112548828125,
      "learning_rate": 3.773793806720848e-05,
      "loss": 0.3756,
      "step": 31230
    },
    {
      "gate_value": 0.40107500553131104,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 31230
    },
    {
      "grad_norm": 62.54690933227539,
      "learning_rate": 3.765572886784764e-05,
      "loss": 0.3571,
      "step": 31240
    },
    {
      "gate_value": 0.40113914012908936,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 31240
    },
    {
      "grad_norm": 255.1707000732422,
      "learning_rate": 3.7573596454736724e-05,
      "loss": 0.3786,
      "step": 31250
    },
    {
      "gate_value": 0.4011474549770355,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 31250
    },
    {
      "grad_norm": 80.06964874267578,
      "learning_rate": 3.7491540884012516e-05,
      "loss": 0.3608,
      "step": 31260
    },
    {
      "gate_value": 0.40119272470474243,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 31260
    },
    {
      "grad_norm": 60.66999435424805,
      "learning_rate": 3.7409562211759265e-05,
      "loss": 0.3532,
      "step": 31270
    },
    {
      "gate_value": 0.40127280354499817,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 31270
    },
    {
      "grad_norm": 487.518310546875,
      "learning_rate": 3.732766049400853e-05,
      "loss": 0.3606,
      "step": 31280
    },
    {
      "gate_value": 0.4013500511646271,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 31280
    },
    {
      "grad_norm": 9397.779296875,
      "learning_rate": 3.7245835786739425e-05,
      "loss": 0.3582,
      "step": 31290
    },
    {
      "gate_value": 0.4013952314853668,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 31290
    },
    {
      "grad_norm": 50.94612121582031,
      "learning_rate": 3.716408814587837e-05,
      "loss": 0.3622,
      "step": 31300
    },
    {
      "gate_value": 0.4014360010623932,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 31300
    },
    {
      "grad_norm": 87.3506851196289,
      "learning_rate": 3.7082417627299064e-05,
      "loss": 0.3476,
      "step": 31310
    },
    {
      "gate_value": 0.40153583884239197,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 31310
    },
    {
      "grad_norm": 659.7249145507812,
      "learning_rate": 3.7000824286822566e-05,
      "loss": 0.3805,
      "step": 31320
    },
    {
      "gate_value": 0.4015822410583496,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 31320
    },
    {
      "grad_norm": 25.28474235534668,
      "learning_rate": 3.6919308180217135e-05,
      "loss": 0.3714,
      "step": 31330
    },
    {
      "gate_value": 0.4015807509422302,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 31330
    },
    {
      "grad_norm": 86.76029968261719,
      "learning_rate": 3.683786936319833e-05,
      "loss": 0.3685,
      "step": 31340
    },
    {
      "gate_value": 0.40166762471199036,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 31340
    },
    {
      "grad_norm": 92.85562133789062,
      "learning_rate": 3.6756507891428714e-05,
      "loss": 0.3576,
      "step": 31350
    },
    {
      "gate_value": 0.40173211693763733,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 31350
    },
    {
      "grad_norm": 158.87136840820312,
      "learning_rate": 3.6675223820518174e-05,
      "loss": 0.3606,
      "step": 31360
    },
    {
      "gate_value": 0.4017789661884308,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 31360
    },
    {
      "grad_norm": 39.90798568725586,
      "learning_rate": 3.6594017206023514e-05,
      "loss": 0.3681,
      "step": 31370
    },
    {
      "gate_value": 0.4018018841743469,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 31370
    },
    {
      "grad_norm": 33.672882080078125,
      "learning_rate": 3.651288810344875e-05,
      "loss": 0.3712,
      "step": 31380
    },
    {
      "gate_value": 0.40186256170272827,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 31380
    },
    {
      "grad_norm": 40.18476104736328,
      "learning_rate": 3.643183656824485e-05,
      "loss": 0.3515,
      "step": 31390
    },
    {
      "gate_value": 0.40187448263168335,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 31390
    },
    {
      "grad_norm": 150.2606658935547,
      "learning_rate": 3.635086265580979e-05,
      "loss": 0.3495,
      "step": 31400
    },
    {
      "gate_value": 0.401906818151474,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 31400
    },
    {
      "grad_norm": 40.321495056152344,
      "learning_rate": 3.626996642148844e-05,
      "loss": 0.3561,
      "step": 31410
    },
    {
      "gate_value": 0.40194520354270935,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 31410
    },
    {
      "grad_norm": 51.097068786621094,
      "learning_rate": 3.618914792057262e-05,
      "loss": 0.3419,
      "step": 31420
    },
    {
      "gate_value": 0.4019700586795807,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 31420
    },
    {
      "grad_norm": 101.07646942138672,
      "learning_rate": 3.6108407208301035e-05,
      "loss": 0.3423,
      "step": 31430
    },
    {
      "gate_value": 0.4019973874092102,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 31430
    },
    {
      "grad_norm": 1876.52587890625,
      "learning_rate": 3.602774433985922e-05,
      "loss": 0.3498,
      "step": 31440
    },
    {
      "gate_value": 0.4020547568798065,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 31440
    },
    {
      "grad_norm": 708.6364135742188,
      "learning_rate": 3.594715937037942e-05,
      "loss": 0.3698,
      "step": 31450
    },
    {
      "gate_value": 0.4021237790584564,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 31450
    },
    {
      "grad_norm": 1956.0980224609375,
      "learning_rate": 3.586665235494077e-05,
      "loss": 0.3627,
      "step": 31460
    },
    {
      "gate_value": 0.4021860957145691,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 31460
    },
    {
      "grad_norm": 31.426523208618164,
      "learning_rate": 3.578622334856898e-05,
      "loss": 0.3758,
      "step": 31470
    },
    {
      "gate_value": 0.4022514224052429,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 31470
    },
    {
      "grad_norm": 1016.490234375,
      "learning_rate": 3.570587240623658e-05,
      "loss": 0.358,
      "step": 31480
    },
    {
      "gate_value": 0.4023077189922333,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 31480
    },
    {
      "grad_norm": 851.119873046875,
      "learning_rate": 3.5625599582862647e-05,
      "loss": 0.3458,
      "step": 31490
    },
    {
      "gate_value": 0.4023520350456238,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 31490
    },
    {
      "grad_norm": 2855.093505859375,
      "learning_rate": 3.554540493331294e-05,
      "loss": 0.3623,
      "step": 31500
    },
    {
      "gate_value": 0.4023928940296173,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 31500
    },
    {
      "grad_norm": 104.28507995605469,
      "learning_rate": 3.5465288512399694e-05,
      "loss": 0.3712,
      "step": 31510
    },
    {
      "gate_value": 0.40249326825141907,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 31510
    },
    {
      "grad_norm": 888.3125,
      "learning_rate": 3.538525037488176e-05,
      "loss": 0.3659,
      "step": 31520
    },
    {
      "gate_value": 0.40257859230041504,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 31520
    },
    {
      "grad_norm": 58.13582229614258,
      "learning_rate": 3.530529057546443e-05,
      "loss": 0.3424,
      "step": 31530
    },
    {
      "gate_value": 0.4026349186897278,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 31530
    },
    {
      "grad_norm": 207.80984497070312,
      "learning_rate": 3.5225409168799526e-05,
      "loss": 0.3437,
      "step": 31540
    },
    {
      "gate_value": 0.40274718403816223,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 31540
    },
    {
      "grad_norm": 82.31903076171875,
      "learning_rate": 3.51456062094852e-05,
      "loss": 0.3565,
      "step": 31550
    },
    {
      "gate_value": 0.40279620885849,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 31550
    },
    {
      "grad_norm": 777.2052001953125,
      "learning_rate": 3.506588175206598e-05,
      "loss": 0.3518,
      "step": 31560
    },
    {
      "gate_value": 0.4028371274471283,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 31560
    },
    {
      "grad_norm": 27.5644588470459,
      "learning_rate": 3.49862358510328e-05,
      "loss": 0.3613,
      "step": 31570
    },
    {
      "gate_value": 0.40281838178634644,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 31570
    },
    {
      "grad_norm": 896.0078125,
      "learning_rate": 3.490666856082291e-05,
      "loss": 0.3719,
      "step": 31580
    },
    {
      "gate_value": 0.402811735868454,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 31580
    },
    {
      "grad_norm": 36.032833099365234,
      "learning_rate": 3.48271799358198e-05,
      "loss": 0.3562,
      "step": 31590
    },
    {
      "gate_value": 0.4028680622577667,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 31590
    },
    {
      "grad_norm": 383.2489318847656,
      "learning_rate": 3.474777003035323e-05,
      "loss": 0.3776,
      "step": 31600
    },
    {
      "gate_value": 0.4029342532157898,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 31600
    },
    {
      "grad_norm": 76.16732788085938,
      "learning_rate": 3.466843889869903e-05,
      "loss": 0.3588,
      "step": 31610
    },
    {
      "gate_value": 0.40299057960510254,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 31610
    },
    {
      "grad_norm": 37.22277069091797,
      "learning_rate": 3.458918659507935e-05,
      "loss": 0.3557,
      "step": 31620
    },
    {
      "gate_value": 0.4030371308326721,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 31620
    },
    {
      "grad_norm": 50.39080047607422,
      "learning_rate": 3.4510013173662356e-05,
      "loss": 0.3446,
      "step": 31630
    },
    {
      "gate_value": 0.4031177759170532,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 31630
    },
    {
      "grad_norm": 24.230958938598633,
      "learning_rate": 3.443091868856239e-05,
      "loss": 0.3653,
      "step": 31640
    },
    {
      "gate_value": 0.40322422981262207,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 31640
    },
    {
      "grad_norm": 81.67859649658203,
      "learning_rate": 3.435190319383977e-05,
      "loss": 0.3573,
      "step": 31650
    },
    {
      "gate_value": 0.4033409655094147,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 31650
    },
    {
      "grad_norm": 49.784027099609375,
      "learning_rate": 3.427296674350077e-05,
      "loss": 0.3486,
      "step": 31660
    },
    {
      "gate_value": 0.40337809920310974,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 31660
    },
    {
      "grad_norm": 1278.3245849609375,
      "learning_rate": 3.419410939149775e-05,
      "loss": 0.3603,
      "step": 31670
    },
    {
      "gate_value": 0.4034092426300049,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 31670
    },
    {
      "grad_norm": 78.81749725341797,
      "learning_rate": 3.411533119172898e-05,
      "loss": 0.3485,
      "step": 31680
    },
    {
      "gate_value": 0.4034452736377716,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 31680
    },
    {
      "grad_norm": 101.64257049560547,
      "learning_rate": 3.403663219803862e-05,
      "loss": 0.348,
      "step": 31690
    },
    {
      "gate_value": 0.4035550653934479,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 31690
    },
    {
      "grad_norm": 38.30369567871094,
      "learning_rate": 3.3958012464216714e-05,
      "loss": 0.368,
      "step": 31700
    },
    {
      "gate_value": 0.4036957025527954,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 31700
    },
    {
      "grad_norm": 73.04335021972656,
      "learning_rate": 3.387947204399905e-05,
      "loss": 0.3643,
      "step": 31710
    },
    {
      "gate_value": 0.4038078486919403,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 31710
    },
    {
      "grad_norm": 48.24745559692383,
      "learning_rate": 3.3801010991067286e-05,
      "loss": 0.3589,
      "step": 31720
    },
    {
      "gate_value": 0.40385109186172485,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 31720
    },
    {
      "grad_norm": 18.187374114990234,
      "learning_rate": 3.372262935904882e-05,
      "loss": 0.3561,
      "step": 31730
    },
    {
      "gate_value": 0.40391314029693604,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 31730
    },
    {
      "grad_norm": 14.42495346069336,
      "learning_rate": 3.3644327201516795e-05,
      "loss": 0.3498,
      "step": 31740
    },
    {
      "gate_value": 0.4040006101131439,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 31740
    },
    {
      "grad_norm": 117.840087890625,
      "learning_rate": 3.356610457198997e-05,
      "loss": 0.3496,
      "step": 31750
    },
    {
      "gate_value": 0.4040774703025818,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 31750
    },
    {
      "grad_norm": 32.17731475830078,
      "learning_rate": 3.348796152393271e-05,
      "loss": 0.3396,
      "step": 31760
    },
    {
      "gate_value": 0.4041006863117218,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 31760
    },
    {
      "grad_norm": 99.77642059326172,
      "learning_rate": 3.340989811075512e-05,
      "loss": 0.3726,
      "step": 31770
    },
    {
      "gate_value": 0.40423011779785156,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 31770
    },
    {
      "grad_norm": 34.55682373046875,
      "learning_rate": 3.333191438581278e-05,
      "loss": 0.3616,
      "step": 31780
    },
    {
      "gate_value": 0.40432289242744446,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 31780
    },
    {
      "grad_norm": 8175.66162109375,
      "learning_rate": 3.3254010402406845e-05,
      "loss": 0.3582,
      "step": 31790
    },
    {
      "gate_value": 0.40442436933517456,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 31790
    },
    {
      "grad_norm": 152.19418334960938,
      "learning_rate": 3.317618621378399e-05,
      "loss": 0.3625,
      "step": 31800
    },
    {
      "gate_value": 0.4044768214225769,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 31800
    },
    {
      "grad_norm": 47.31957244873047,
      "learning_rate": 3.309844187313625e-05,
      "loss": 0.3767,
      "step": 31810
    },
    {
      "gate_value": 0.40453842282295227,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 31810
    },
    {
      "grad_norm": 579.4370727539062,
      "learning_rate": 3.302077743360115e-05,
      "loss": 0.3608,
      "step": 31820
    },
    {
      "gate_value": 0.40464499592781067,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 31820
    },
    {
      "grad_norm": 202.05191040039062,
      "learning_rate": 3.294319294826169e-05,
      "loss": 0.355,
      "step": 31830
    },
    {
      "gate_value": 0.40469077229499817,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 31830
    },
    {
      "grad_norm": 48.13961410522461,
      "learning_rate": 3.286568847014602e-05,
      "loss": 0.3464,
      "step": 31840
    },
    {
      "gate_value": 0.4047562777996063,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 31840
    },
    {
      "grad_norm": 85.94761657714844,
      "learning_rate": 3.27882640522278e-05,
      "loss": 0.3631,
      "step": 31850
    },
    {
      "gate_value": 0.4047923982143402,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 31850
    },
    {
      "grad_norm": 59.6070556640625,
      "learning_rate": 3.271091974742583e-05,
      "loss": 0.3695,
      "step": 31860
    },
    {
      "gate_value": 0.40476715564727783,
      "icl_sequence_length": 94,
      "num_contexts": 3,
      "step": 31860
    },
    {
      "grad_norm": 33.71444320678711,
      "learning_rate": 3.263365560860424e-05,
      "loss": 0.358,
      "step": 31870
    },
    {
      "gate_value": 0.4048195779323578,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 31870
    },
    {
      "grad_norm": 1352.75244140625,
      "learning_rate": 3.255647168857235e-05,
      "loss": 0.3632,
      "step": 31880
    },
    {
      "gate_value": 0.4048292934894562,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 31880
    },
    {
      "grad_norm": 25.616077423095703,
      "learning_rate": 3.247936804008462e-05,
      "loss": 0.3667,
      "step": 31890
    },
    {
      "gate_value": 0.40487462282180786,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 31890
    },
    {
      "grad_norm": 12.800342559814453,
      "learning_rate": 3.240234471584073e-05,
      "loss": 0.3556,
      "step": 31900
    },
    {
      "gate_value": 0.4049961566925049,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 31900
    },
    {
      "grad_norm": 334.0349426269531,
      "learning_rate": 3.2325401768485315e-05,
      "loss": 0.3754,
      "step": 31910
    },
    {
      "gate_value": 0.4050622880458832,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 31910
    },
    {
      "grad_norm": 39.117591857910156,
      "learning_rate": 3.224853925060821e-05,
      "loss": 0.3649,
      "step": 31920
    },
    {
      "gate_value": 0.40509703755378723,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 31920
    },
    {
      "grad_norm": 101.0981674194336,
      "learning_rate": 3.217175721474416e-05,
      "loss": 0.3542,
      "step": 31930
    },
    {
      "gate_value": 0.40514880418777466,
      "icl_sequence_length": 64,
      "num_contexts": 3,
      "step": 31930
    },
    {
      "grad_norm": 21.74467658996582,
      "learning_rate": 3.2095055713373026e-05,
      "loss": 0.3583,
      "step": 31940
    },
    {
      "gate_value": 0.4052824079990387,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 31940
    },
    {
      "grad_norm": 37.686012268066406,
      "learning_rate": 3.201843479891954e-05,
      "loss": 0.3629,
      "step": 31950
    },
    {
      "gate_value": 0.40541791915893555,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 31950
    },
    {
      "grad_norm": 37.003089904785156,
      "learning_rate": 3.194189452375335e-05,
      "loss": 0.3502,
      "step": 31960
    },
    {
      "gate_value": 0.4054928123950958,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 31960
    },
    {
      "grad_norm": 21.644052505493164,
      "learning_rate": 3.1865434940189015e-05,
      "loss": 0.3574,
      "step": 31970
    },
    {
      "gate_value": 0.40563321113586426,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 31970
    },
    {
      "grad_norm": 10.192123413085938,
      "learning_rate": 3.1789056100485975e-05,
      "loss": 0.3514,
      "step": 31980
    },
    {
      "gate_value": 0.40571680665016174,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 31980
    },
    {
      "grad_norm": 44.86257553100586,
      "learning_rate": 3.1712758056848424e-05,
      "loss": 0.3505,
      "step": 31990
    },
    {
      "gate_value": 0.4058789610862732,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 31990
    },
    {
      "grad_norm": 363.2022399902344,
      "learning_rate": 3.1636540861425396e-05,
      "loss": 0.3619,
      "step": 32000
    },
    {
      "gate_value": 0.40589526295661926,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 32000
    },
    {
      "grad_norm": 121.92304229736328,
      "learning_rate": 3.156040456631059e-05,
      "loss": 0.3694,
      "step": 32010
    },
    {
      "gate_value": 0.40586337447166443,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 32010
    },
    {
      "grad_norm": 39.731449127197266,
      "learning_rate": 3.148434922354239e-05,
      "loss": 0.369,
      "step": 32020
    },
    {
      "gate_value": 0.40581846237182617,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 32020
    },
    {
      "grad_norm": 23.1926212310791,
      "learning_rate": 3.1408374885103966e-05,
      "loss": 0.3698,
      "step": 32030
    },
    {
      "gate_value": 0.40583550930023193,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 32030
    },
    {
      "grad_norm": 30.028512954711914,
      "learning_rate": 3.1332481602923066e-05,
      "loss": 0.3523,
      "step": 32040
    },
    {
      "gate_value": 0.4058579206466675,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 32040
    },
    {
      "grad_norm": 61.59327697753906,
      "learning_rate": 3.125666942887206e-05,
      "loss": 0.3493,
      "step": 32050
    },
    {
      "gate_value": 0.40589094161987305,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 32050
    },
    {
      "grad_norm": 521.9461059570312,
      "learning_rate": 3.118093841476777e-05,
      "loss": 0.3646,
      "step": 32060
    },
    {
      "gate_value": 0.4059288203716278,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 32060
    },
    {
      "grad_norm": 1710.82080078125,
      "learning_rate": 3.110528861237169e-05,
      "loss": 0.3636,
      "step": 32070
    },
    {
      "gate_value": 0.40597227215766907,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 32070
    },
    {
      "grad_norm": 39.675498962402344,
      "learning_rate": 3.102972007338972e-05,
      "loss": 0.3537,
      "step": 32080
    },
    {
      "gate_value": 0.4060346782207489,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 32080
    },
    {
      "grad_norm": 610.4242553710938,
      "learning_rate": 3.095423284947225e-05,
      "loss": 0.3566,
      "step": 32090
    },
    {
      "gate_value": 0.4061087667942047,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 32090
    },
    {
      "grad_norm": 33.1412467956543,
      "learning_rate": 3.0878826992214155e-05,
      "loss": 0.3763,
      "step": 32100
    },
    {
      "gate_value": 0.40613362193107605,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 32100
    },
    {
      "grad_norm": 44.12626266479492,
      "learning_rate": 3.0803502553154544e-05,
      "loss": 0.3531,
      "step": 32110
    },
    {
      "gate_value": 0.40618664026260376,
      "icl_sequence_length": 54,
      "num_contexts": 3,
      "step": 32110
    },
    {
      "grad_norm": 65.63241577148438,
      "learning_rate": 3.0728259583776953e-05,
      "loss": 0.3517,
      "step": 32120
    },
    {
      "gate_value": 0.4062596261501312,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 32120
    },
    {
      "grad_norm": 82.6564712524414,
      "learning_rate": 3.0653098135509274e-05,
      "loss": 0.3454,
      "step": 32130
    },
    {
      "gate_value": 0.40627121925354004,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 32130
    },
    {
      "grad_norm": 18.433292388916016,
      "learning_rate": 3.0578018259723646e-05,
      "loss": 0.3601,
      "step": 32140
    },
    {
      "gate_value": 0.4063282907009125,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 32140
    },
    {
      "grad_norm": 43.319366455078125,
      "learning_rate": 3.0503020007736488e-05,
      "loss": 0.3772,
      "step": 32150
    },
    {
      "gate_value": 0.4063490629196167,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 32150
    },
    {
      "grad_norm": 56.99197769165039,
      "learning_rate": 3.0428103430808332e-05,
      "loss": 0.3804,
      "step": 32160
    },
    {
      "gate_value": 0.4063299298286438,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 32160
    },
    {
      "grad_norm": 799.6575317382812,
      "learning_rate": 3.035326858014398e-05,
      "loss": 0.3503,
      "step": 32170
    },
    {
      "gate_value": 0.4063448905944824,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 32170
    },
    {
      "grad_norm": 25.74651527404785,
      "learning_rate": 3.0278515506892355e-05,
      "loss": 0.379,
      "step": 32180
    },
    {
      "gate_value": 0.40639516711235046,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 32180
    },
    {
      "grad_norm": 345.8633117675781,
      "learning_rate": 3.0203844262146483e-05,
      "loss": 0.3706,
      "step": 32190
    },
    {
      "gate_value": 0.40643778443336487,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 32190
    },
    {
      "grad_norm": 4103.92041015625,
      "learning_rate": 3.0129254896943473e-05,
      "loss": 0.3518,
      "step": 32200
    },
    {
      "gate_value": 0.40648239850997925,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 32200
    },
    {
      "grad_norm": 26.54749870300293,
      "learning_rate": 3.0054747462264444e-05,
      "loss": 0.3694,
      "step": 32210
    },
    {
      "gate_value": 0.4065358638763428,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 32210
    },
    {
      "grad_norm": 25.705764770507812,
      "learning_rate": 2.9980322009034445e-05,
      "loss": 0.3517,
      "step": 32220
    },
    {
      "gate_value": 0.4066010117530823,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 32220
    },
    {
      "grad_norm": 50.34804916381836,
      "learning_rate": 2.9905978588122654e-05,
      "loss": 0.3792,
      "step": 32230
    },
    {
      "gate_value": 0.4066932499408722,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 32230
    },
    {
      "grad_norm": 100.87702178955078,
      "learning_rate": 2.983171725034207e-05,
      "loss": 0.3719,
      "step": 32240
    },
    {
      "gate_value": 0.4067370593547821,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 32240
    },
    {
      "grad_norm": 24.019262313842773,
      "learning_rate": 2.9757538046449676e-05,
      "loss": 0.3638,
      "step": 32250
    },
    {
      "gate_value": 0.4067458212375641,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 32250
    },
    {
      "grad_norm": 45.86870193481445,
      "learning_rate": 2.9683441027146166e-05,
      "loss": 0.3606,
      "step": 32260
    },
    {
      "gate_value": 0.4067707657814026,
      "icl_sequence_length": 62,
      "num_contexts": 3,
      "step": 32260
    },
    {
      "grad_norm": 229.07736206054688,
      "learning_rate": 2.9609426243076178e-05,
      "loss": 0.36,
      "step": 32270
    },
    {
      "gate_value": 0.40676602721214294,
      "icl_sequence_length": 96,
      "num_contexts": 3,
      "step": 32270
    },
    {
      "grad_norm": 37.639366149902344,
      "learning_rate": 2.9535493744828166e-05,
      "loss": 0.3589,
      "step": 32280
    },
    {
      "gate_value": 0.40678077936172485,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 32280
    },
    {
      "grad_norm": 26.005895614624023,
      "learning_rate": 2.9461643582934285e-05,
      "loss": 0.3499,
      "step": 32290
    },
    {
      "gate_value": 0.4068147838115692,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 32290
    },
    {
      "grad_norm": 20.593965530395508,
      "learning_rate": 2.938787580787038e-05,
      "loss": 0.3473,
      "step": 32300
    },
    {
      "gate_value": 0.4068923890590668,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 32300
    },
    {
      "grad_norm": 69.46590423583984,
      "learning_rate": 2.9314190470056086e-05,
      "loss": 0.3698,
      "step": 32310
    },
    {
      "gate_value": 0.40692099928855896,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 32310
    },
    {
      "grad_norm": 40623.91796875,
      "learning_rate": 2.9240587619854584e-05,
      "loss": 0.3534,
      "step": 32320
    },
    {
      "gate_value": 0.4068794250488281,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 32320
    },
    {
      "grad_norm": 29.661853790283203,
      "learning_rate": 2.9167067307572727e-05,
      "loss": 0.3691,
      "step": 32330
    },
    {
      "gate_value": 0.4069285988807678,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 32330
    },
    {
      "grad_norm": 37.21219253540039,
      "learning_rate": 2.909362958346099e-05,
      "loss": 0.3653,
      "step": 32340
    },
    {
      "gate_value": 0.4070126414299011,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 32340
    },
    {
      "grad_norm": 60.33441162109375,
      "learning_rate": 2.902027449771339e-05,
      "loss": 0.3451,
      "step": 32350
    },
    {
      "gate_value": 0.4070773422718048,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 32350
    },
    {
      "grad_norm": 1850.6201171875,
      "learning_rate": 2.894700210046737e-05,
      "loss": 0.3646,
      "step": 32360
    },
    {
      "gate_value": 0.407095342874527,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 32360
    },
    {
      "grad_norm": 19.2963809967041,
      "learning_rate": 2.887381244180395e-05,
      "loss": 0.3791,
      "step": 32370
    },
    {
      "gate_value": 0.407134085893631,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 32370
    },
    {
      "grad_norm": 59.54200744628906,
      "learning_rate": 2.880070557174757e-05,
      "loss": 0.3658,
      "step": 32380
    },
    {
      "gate_value": 0.40719518065452576,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 32380
    },
    {
      "grad_norm": 17.734453201293945,
      "learning_rate": 2.8727681540266136e-05,
      "loss": 0.3709,
      "step": 32390
    },
    {
      "gate_value": 0.4072628319263458,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 32390
    },
    {
      "grad_norm": 29.109493255615234,
      "learning_rate": 2.8654740397270793e-05,
      "loss": 0.3582,
      "step": 32400
    },
    {
      "gate_value": 0.407309353351593,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 32400
    },
    {
      "grad_norm": 6415.794921875,
      "learning_rate": 2.8581882192616213e-05,
      "loss": 0.3657,
      "step": 32410
    },
    {
      "gate_value": 0.40732789039611816,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 32410
    },
    {
      "grad_norm": 81.83065032958984,
      "learning_rate": 2.850910697610021e-05,
      "loss": 0.3758,
      "step": 32420
    },
    {
      "gate_value": 0.40734460949897766,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 32420
    },
    {
      "grad_norm": 44.35910415649414,
      "learning_rate": 2.8436414797463996e-05,
      "loss": 0.3573,
      "step": 32430
    },
    {
      "gate_value": 0.4073881506919861,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 32430
    },
    {
      "grad_norm": 134.13555908203125,
      "learning_rate": 2.8363805706391995e-05,
      "loss": 0.3541,
      "step": 32440
    },
    {
      "gate_value": 0.4074646234512329,
      "icl_sequence_length": 96,
      "num_contexts": 3,
      "step": 32440
    },
    {
      "grad_norm": 33.06184005737305,
      "learning_rate": 2.8291279752511874e-05,
      "loss": 0.3617,
      "step": 32450
    },
    {
      "gate_value": 0.4075838327407837,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 32450
    },
    {
      "grad_norm": 56.15337371826172,
      "learning_rate": 2.821883698539435e-05,
      "loss": 0.3473,
      "step": 32460
    },
    {
      "gate_value": 0.4077083170413971,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 32460
    },
    {
      "grad_norm": 37.95241165161133,
      "learning_rate": 2.814647745455343e-05,
      "loss": 0.3542,
      "step": 32470
    },
    {
      "gate_value": 0.4077601730823517,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 32470
    },
    {
      "grad_norm": 96.07141876220703,
      "learning_rate": 2.8074201209446185e-05,
      "loss": 0.3692,
      "step": 32480
    },
    {
      "gate_value": 0.4078661799430847,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 32480
    },
    {
      "grad_norm": 51.653724670410156,
      "learning_rate": 2.80020082994727e-05,
      "loss": 0.3529,
      "step": 32490
    },
    {
      "gate_value": 0.4079096019268036,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 32490
    },
    {
      "grad_norm": 42464.8828125,
      "learning_rate": 2.7929898773976155e-05,
      "loss": 0.3816,
      "step": 32500
    },
    {
      "gate_value": 0.4079228341579437,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 32500
    },
    {
      "grad_norm": 28.320405960083008,
      "learning_rate": 2.7857872682242792e-05,
      "loss": 0.3596,
      "step": 32510
    },
    {
      "gate_value": 0.407973974943161,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 32510
    },
    {
      "grad_norm": 2898.142578125,
      "learning_rate": 2.7785930073501684e-05,
      "loss": 0.3773,
      "step": 32520
    },
    {
      "gate_value": 0.40802299976348877,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 32520
    },
    {
      "grad_norm": 50.13523483276367,
      "learning_rate": 2.771407099692496e-05,
      "loss": 0.3706,
      "step": 32530
    },
    {
      "gate_value": 0.4080987870693207,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 32530
    },
    {
      "grad_norm": 17.791433334350586,
      "learning_rate": 2.7642295501627613e-05,
      "loss": 0.3855,
      "step": 32540
    },
    {
      "gate_value": 0.4081122577190399,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 32540
    },
    {
      "grad_norm": 25.099008560180664,
      "learning_rate": 2.7570603636667576e-05,
      "loss": 0.3452,
      "step": 32550
    },
    {
      "gate_value": 0.4081073999404907,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 32550
    },
    {
      "grad_norm": 56.72127914428711,
      "learning_rate": 2.7498995451045463e-05,
      "loss": 0.379,
      "step": 32560
    },
    {
      "gate_value": 0.40811750292778015,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 32560
    },
    {
      "grad_norm": 65.63322448730469,
      "learning_rate": 2.742747099370489e-05,
      "loss": 0.3653,
      "step": 32570
    },
    {
      "gate_value": 0.40815386176109314,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 32570
    },
    {
      "grad_norm": 123.76846313476562,
      "learning_rate": 2.7356030313532074e-05,
      "loss": 0.3637,
      "step": 32580
    },
    {
      "gate_value": 0.40820392966270447,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 32580
    },
    {
      "grad_norm": 55.345069885253906,
      "learning_rate": 2.7284673459356067e-05,
      "loss": 0.3666,
      "step": 32590
    },
    {
      "gate_value": 0.40824827551841736,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 32590
    },
    {
      "grad_norm": 147.07064819335938,
      "learning_rate": 2.7213400479948607e-05,
      "loss": 0.3503,
      "step": 32600
    },
    {
      "gate_value": 0.4082948863506317,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 32600
    },
    {
      "grad_norm": 56.571807861328125,
      "learning_rate": 2.7142211424024157e-05,
      "loss": 0.359,
      "step": 32610
    },
    {
      "gate_value": 0.4083560109138489,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 32610
    },
    {
      "grad_norm": 246.4196014404297,
      "learning_rate": 2.707110634023967e-05,
      "loss": 0.3888,
      "step": 32620
    },
    {
      "gate_value": 0.4084080159664154,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 32620
    },
    {
      "grad_norm": 27.42348289489746,
      "learning_rate": 2.700008527719486e-05,
      "loss": 0.3504,
      "step": 32630
    },
    {
      "gate_value": 0.40847039222717285,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 32630
    },
    {
      "grad_norm": 32.832275390625,
      "learning_rate": 2.692914828343194e-05,
      "loss": 0.3558,
      "step": 32640
    },
    {
      "gate_value": 0.4085494875907898,
      "icl_sequence_length": 60,
      "num_contexts": 3,
      "step": 32640
    },
    {
      "grad_norm": 39.22434997558594,
      "learning_rate": 2.685829540743572e-05,
      "loss": 0.3718,
      "step": 32650
    },
    {
      "gate_value": 0.40860962867736816,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 32650
    },
    {
      "grad_norm": 40.54300308227539,
      "learning_rate": 2.6787526697633383e-05,
      "loss": 0.3666,
      "step": 32660
    },
    {
      "gate_value": 0.40863949060440063,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 32660
    },
    {
      "grad_norm": 283.6440734863281,
      "learning_rate": 2.671684220239477e-05,
      "loss": 0.3655,
      "step": 32670
    },
    {
      "gate_value": 0.40866610407829285,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 32670
    },
    {
      "grad_norm": 1028.0677490234375,
      "learning_rate": 2.6646241970031995e-05,
      "loss": 0.3598,
      "step": 32680
    },
    {
      "gate_value": 0.40876686573028564,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 32680
    },
    {
      "grad_norm": 30.990869522094727,
      "learning_rate": 2.6575726048799667e-05,
      "loss": 0.3731,
      "step": 32690
    },
    {
      "gate_value": 0.4088298976421356,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 32690
    },
    {
      "grad_norm": 50.804988861083984,
      "learning_rate": 2.6505294486894764e-05,
      "loss": 0.3608,
      "step": 32700
    },
    {
      "gate_value": 0.40888458490371704,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 32700
    },
    {
      "grad_norm": 19.51559829711914,
      "learning_rate": 2.6434947332456625e-05,
      "loss": 0.3637,
      "step": 32710
    },
    {
      "gate_value": 0.408957302570343,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 32710
    },
    {
      "grad_norm": 225.3372344970703,
      "learning_rate": 2.6364684633566792e-05,
      "loss": 0.3581,
      "step": 32720
    },
    {
      "gate_value": 0.4090772271156311,
      "icl_sequence_length": 74,
      "num_contexts": 3,
      "step": 32720
    },
    {
      "grad_norm": 4149.73046875,
      "learning_rate": 2.6294506438249213e-05,
      "loss": 0.3603,
      "step": 32730
    },
    {
      "gate_value": 0.40914449095726013,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 32730
    },
    {
      "grad_norm": 13865.302734375,
      "learning_rate": 2.6224412794470008e-05,
      "loss": 0.3636,
      "step": 32740
    },
    {
      "gate_value": 0.40925562381744385,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 32740
    },
    {
      "grad_norm": 102.5394287109375,
      "learning_rate": 2.6154403750137565e-05,
      "loss": 0.3523,
      "step": 32750
    },
    {
      "gate_value": 0.4092884957790375,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 32750
    },
    {
      "grad_norm": 607.7672729492188,
      "learning_rate": 2.608447935310236e-05,
      "loss": 0.3604,
      "step": 32760
    },
    {
      "gate_value": 0.40925315022468567,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 32760
    },
    {
      "grad_norm": 148.76336669921875,
      "learning_rate": 2.6014639651157032e-05,
      "loss": 0.3512,
      "step": 32770
    },
    {
      "gate_value": 0.4092670679092407,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 32770
    },
    {
      "grad_norm": 2247.474365234375,
      "learning_rate": 2.5944884692036393e-05,
      "loss": 0.3456,
      "step": 32780
    },
    {
      "gate_value": 0.40928658843040466,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 32780
    },
    {
      "grad_norm": 160.3250274658203,
      "learning_rate": 2.5875214523417275e-05,
      "loss": 0.3594,
      "step": 32790
    },
    {
      "gate_value": 0.4093087911605835,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 32790
    },
    {
      "grad_norm": 45.89481735229492,
      "learning_rate": 2.580562919291862e-05,
      "loss": 0.3675,
      "step": 32800
    },
    {
      "gate_value": 0.4093517065048218,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 32800
    },
    {
      "grad_norm": 57.58846664428711,
      "learning_rate": 2.5736128748101365e-05,
      "loss": 0.3757,
      "step": 32810
    },
    {
      "gate_value": 0.4093685448169708,
      "icl_sequence_length": 82,
      "num_contexts": 3,
      "step": 32810
    },
    {
      "grad_norm": 71.6854019165039,
      "learning_rate": 2.5666713236468344e-05,
      "loss": 0.3645,
      "step": 32820
    },
    {
      "gate_value": 0.409373015165329,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 32820
    },
    {
      "grad_norm": 26.344486236572266,
      "learning_rate": 2.5597382705464425e-05,
      "loss": 0.3556,
      "step": 32830
    },
    {
      "gate_value": 0.40939274430274963,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 32830
    },
    {
      "grad_norm": 3726.2080078125,
      "learning_rate": 2.5528137202476384e-05,
      "loss": 0.3737,
      "step": 32840
    },
    {
      "gate_value": 0.4094216525554657,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 32840
    },
    {
      "grad_norm": 14.94805908203125,
      "learning_rate": 2.5458976774832895e-05,
      "loss": 0.3538,
      "step": 32850
    },
    {
      "gate_value": 0.40946733951568604,
      "icl_sequence_length": 90,
      "num_contexts": 3,
      "step": 32850
    },
    {
      "grad_norm": 43.41163635253906,
      "learning_rate": 2.538990146980443e-05,
      "loss": 0.3801,
      "step": 32860
    },
    {
      "gate_value": 0.4094873070716858,
      "icl_sequence_length": 72,
      "num_contexts": 3,
      "step": 32860
    },
    {
      "grad_norm": 67.83159637451172,
      "learning_rate": 2.5320911334603273e-05,
      "loss": 0.3557,
      "step": 32870
    },
    {
      "gate_value": 0.40953534841537476,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 32870
    },
    {
      "grad_norm": 44.28196334838867,
      "learning_rate": 2.525200641638357e-05,
      "loss": 0.3759,
      "step": 32880
    },
    {
      "gate_value": 0.4095556437969208,
      "icl_sequence_length": 68,
      "num_contexts": 3,
      "step": 32880
    },
    {
      "grad_norm": 51.88390350341797,
      "learning_rate": 2.5183186762241163e-05,
      "loss": 0.3531,
      "step": 32890
    },
    {
      "gate_value": 0.40955135226249695,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 32890
    },
    {
      "grad_norm": 48.15521240234375,
      "learning_rate": 2.5114452419213665e-05,
      "loss": 0.3643,
      "step": 32900
    },
    {
      "gate_value": 0.40955406427383423,
      "icl_sequence_length": 88,
      "num_contexts": 3,
      "step": 32900
    },
    {
      "grad_norm": 81.54961395263672,
      "learning_rate": 2.5045803434280377e-05,
      "loss": 0.3921,
      "step": 32910
    },
    {
      "gate_value": 0.4095805287361145,
      "icl_sequence_length": 66,
      "num_contexts": 3,
      "step": 32910
    },
    {
      "grad_norm": 29.33376121520996,
      "learning_rate": 2.4977239854362146e-05,
      "loss": 0.3564,
      "step": 32920
    },
    {
      "gate_value": 0.40961137413978577,
      "icl_sequence_length": 76,
      "num_contexts": 3,
      "step": 32920
    },
    {
      "grad_norm": 19.240026473999023,
      "learning_rate": 2.4908761726321592e-05,
      "loss": 0.3555,
      "step": 32930
    },
    {
      "gate_value": 0.40965530276298523,
      "icl_sequence_length": 70,
      "num_contexts": 3,
      "step": 32930
    },
    {
      "grad_norm": 124.72468566894531,
      "learning_rate": 2.4840369096962852e-05,
      "loss": 0.3456,
      "step": 32940
    },
    {
      "gate_value": 0.40971893072128296,
      "icl_sequence_length": 80,
      "num_contexts": 3,
      "step": 32940
    },
    {
      "grad_norm": 29.96973419189453,
      "learning_rate": 2.4772062013031675e-05,
      "loss": 0.3706,
      "step": 32950
    },
    {
      "gate_value": 0.4097919464111328,
      "icl_sequence_length": 86,
      "num_contexts": 3,
      "step": 32950
    },
    {
      "grad_norm": 97.25133514404297,
      "learning_rate": 2.4703840521215258e-05,
      "loss": 0.3733,
      "step": 32960
    },
    {
      "gate_value": 0.4098777770996094,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 32960
    },
    {
      "grad_norm": 62.41849136352539,
      "learning_rate": 2.463570466814242e-05,
      "loss": 0.3477,
      "step": 32970
    },
    {
      "gate_value": 0.4099633991718292,
      "icl_sequence_length": 84,
      "num_contexts": 3,
      "step": 32970
    },
    {
      "grad_norm": 41.57551193237305,
      "learning_rate": 2.4567654500383276e-05,
      "loss": 0.3871,
      "step": 32980
    },
    {
      "gate_value": 0.40999799966812134,
      "icl_sequence_length": 92,
      "num_contexts": 3,
      "step": 32980
    },
    {
      "grad_norm": 29.298206329345703,
      "learning_rate": 2.4499690064449522e-05,
      "loss": 0.3746,
      "step": 32990
    },
    {
      "gate_value": 0.41000160574913025,
      "icl_sequence_length": 78,
      "num_contexts": 3,
      "step": 32990
    },
    {
      "grad_norm": 193.3474884033203,
      "learning_rate": 2.4431811406794216e-05,
      "loss": 0.3659,
      "step": 33000
    }
  ],
  "logging_steps": 10,
  "max_steps": 40000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 9223372036854775807,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 3,
  "trial_name": null,
  "trial_params": null
}