{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.425, "eval_steps": 500, "global_step": 33000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "gate_value": 0.0, "icl_sequence_length": 86, "num_contexts": 3, "step": 0 }, { "grad_norm": 0.0005426404532045126, "learning_rate": 1.3499999999999998e-06, "loss": 0.5017, "step": 10 }, { "gate_value": 2.407148258498637e-06, "icl_sequence_length": 86, "num_contexts": 3, "step": 10 }, { "grad_norm": 0.0014531903434544802, "learning_rate": 2.85e-06, "loss": 0.4952, "step": 20 }, { "gate_value": 2.730801497818902e-06, "icl_sequence_length": 86, "num_contexts": 3, "step": 20 }, { "grad_norm": 0.0007584649138152599, "learning_rate": 4.35e-06, "loss": 0.5197, "step": 30 }, { "gate_value": -4.140019882470369e-09, "icl_sequence_length": 74, "num_contexts": 3, "step": 30 }, { "grad_norm": 0.002548313234001398, "learning_rate": 5.85e-06, "loss": 0.5194, "step": 40 }, { "gate_value": -9.026175575854722e-07, "icl_sequence_length": 78, "num_contexts": 3, "step": 40 }, { "grad_norm": 0.002914809389039874, "learning_rate": 7.35e-06, "loss": 0.5269, "step": 50 }, { "gate_value": -1.088102408175473e-06, "icl_sequence_length": 86, "num_contexts": 3, "step": 50 }, { "grad_norm": 0.0004767653881572187, "learning_rate": 8.849999999999998e-06, "loss": 0.51, "step": 60 }, { "gate_value": 5.859649718331639e-06, "icl_sequence_length": 80, "num_contexts": 3, "step": 60 }, { "grad_norm": 0.000854952319059521, "learning_rate": 1.035e-05, "loss": 0.5154, "step": 70 }, { "gate_value": 3.494698103168048e-05, "icl_sequence_length": 68, "num_contexts": 3, "step": 70 }, { "grad_norm": 0.006400048267096281, "learning_rate": 1.1849999999999998e-05, "loss": 0.5026, "step": 80 }, { "gate_value": 9.951820538844913e-05, "icl_sequence_length": 78, "num_contexts": 3, "step": 80 }, { "grad_norm": 0.0003364745352882892, "learning_rate": 1.3349999999999998e-05, "loss": 0.5083, "step": 90 }, { "gate_value": 0.00029271937091834843, "icl_sequence_length": 74, "num_contexts": 3, "step": 90 }, { "grad_norm": 0.09524580836296082, "learning_rate": 1.485e-05, "loss": 0.5008, "step": 100 }, { "gate_value": 0.000530488439835608, "icl_sequence_length": 72, "num_contexts": 3, "step": 100 }, { "grad_norm": 0.05546204373240471, "learning_rate": 1.6349999999999998e-05, "loss": 0.4951, "step": 110 }, { "gate_value": 0.0008035043138079345, "icl_sequence_length": 70, "num_contexts": 3, "step": 110 }, { "grad_norm": 0.16272345185279846, "learning_rate": 1.7849999999999997e-05, "loss": 0.5003, "step": 120 }, { "gate_value": 0.0010982438689097762, "icl_sequence_length": 86, "num_contexts": 3, "step": 120 }, { "grad_norm": 0.23696964979171753, "learning_rate": 1.935e-05, "loss": 0.4987, "step": 130 }, { "gate_value": 0.001406953320838511, "icl_sequence_length": 94, "num_contexts": 3, "step": 130 }, { "grad_norm": 0.13120773434638977, "learning_rate": 2.085e-05, "loss": 0.5179, "step": 140 }, { "gate_value": 0.001694629667326808, "icl_sequence_length": 60, "num_contexts": 3, "step": 140 }, { "grad_norm": 0.1184849962592125, "learning_rate": 2.2349999999999998e-05, "loss": 0.4964, "step": 150 }, { "gate_value": 0.002073641400784254, "icl_sequence_length": 70, "num_contexts": 3, "step": 150 }, { "grad_norm": 0.04556508734822273, "learning_rate": 2.3849999999999997e-05, "loss": 0.5121, "step": 160 }, { "gate_value": 0.002421831712126732, "icl_sequence_length": 72, "num_contexts": 3, "step": 160 }, { "grad_norm": 1.6241488456726074, "learning_rate": 2.535e-05, "loss": 0.5208, "step": 170 }, { "gate_value": 0.0027016534004360437, "icl_sequence_length": 62, "num_contexts": 3, "step": 170 }, { "grad_norm": 0.16671186685562134, "learning_rate": 2.6849999999999995e-05, "loss": 0.5104, "step": 180 }, { "gate_value": 0.0029992451891303062, "icl_sequence_length": 78, "num_contexts": 3, "step": 180 }, { "grad_norm": 1.049537181854248, "learning_rate": 2.8349999999999998e-05, "loss": 0.5182, "step": 190 }, { "gate_value": 0.003241482889279723, "icl_sequence_length": 70, "num_contexts": 3, "step": 190 }, { "grad_norm": 0.29614609479904175, "learning_rate": 2.985e-05, "loss": 0.5094, "step": 200 }, { "gate_value": 0.0032604828011244535, "icl_sequence_length": 74, "num_contexts": 3, "step": 200 }, { "grad_norm": 0.44253769516944885, "learning_rate": 3.1349999999999996e-05, "loss": 0.508, "step": 210 }, { "gate_value": 0.0032824440859258175, "icl_sequence_length": 80, "num_contexts": 3, "step": 210 }, { "grad_norm": 0.010273229330778122, "learning_rate": 3.285e-05, "loss": 0.502, "step": 220 }, { "gate_value": 0.0034144630189985037, "icl_sequence_length": 76, "num_contexts": 3, "step": 220 }, { "grad_norm": 0.9588198661804199, "learning_rate": 3.435e-05, "loss": 0.5176, "step": 230 }, { "gate_value": 0.0035216009709984064, "icl_sequence_length": 78, "num_contexts": 3, "step": 230 }, { "grad_norm": 1.1617745161056519, "learning_rate": 3.585e-05, "loss": 0.5123, "step": 240 }, { "gate_value": 0.0036562951281666756, "icl_sequence_length": 76, "num_contexts": 3, "step": 240 }, { "grad_norm": 1.0517369508743286, "learning_rate": 3.735e-05, "loss": 0.515, "step": 250 }, { "gate_value": 0.0038379342295229435, "icl_sequence_length": 82, "num_contexts": 3, "step": 250 }, { "grad_norm": 0.42224568128585815, "learning_rate": 3.8849999999999996e-05, "loss": 0.5109, "step": 260 }, { "gate_value": 0.004033830948174, "icl_sequence_length": 82, "num_contexts": 3, "step": 260 }, { "grad_norm": 1.4201754331588745, "learning_rate": 4.035e-05, "loss": 0.5135, "step": 270 }, { "gate_value": 0.004163349512964487, "icl_sequence_length": 80, "num_contexts": 3, "step": 270 }, { "grad_norm": 0.2649329900741577, "learning_rate": 4.185e-05, "loss": 0.5231, "step": 280 }, { "gate_value": 0.0040314337238669395, "icl_sequence_length": 84, "num_contexts": 3, "step": 280 }, { "grad_norm": 0.06176350265741348, "learning_rate": 4.334999999999999e-05, "loss": 0.5, "step": 290 }, { "gate_value": 0.004018639679998159, "icl_sequence_length": 78, "num_contexts": 3, "step": 290 }, { "grad_norm": 2.028596878051758, "learning_rate": 4.484999999999999e-05, "loss": 0.5075, "step": 300 }, { "gate_value": 0.00426053861156106, "icl_sequence_length": 74, "num_contexts": 3, "step": 300 }, { "grad_norm": 1.6869006156921387, "learning_rate": 4.6349999999999995e-05, "loss": 0.5225, "step": 310 }, { "gate_value": 0.004472165368497372, "icl_sequence_length": 90, "num_contexts": 3, "step": 310 }, { "grad_norm": 0.09726118296384811, "learning_rate": 4.785e-05, "loss": 0.4986, "step": 320 }, { "gate_value": 0.004443428013473749, "icl_sequence_length": 76, "num_contexts": 3, "step": 320 }, { "grad_norm": 1.6347602605819702, "learning_rate": 4.935e-05, "loss": 0.5104, "step": 330 }, { "gate_value": 0.004441737663000822, "icl_sequence_length": 74, "num_contexts": 3, "step": 330 }, { "grad_norm": 0.5094766616821289, "learning_rate": 5.0849999999999996e-05, "loss": 0.5075, "step": 340 }, { "gate_value": 0.0046890913508832455, "icl_sequence_length": 88, "num_contexts": 3, "step": 340 }, { "grad_norm": 0.13526400923728943, "learning_rate": 5.234999999999999e-05, "loss": 0.4973, "step": 350 }, { "gate_value": 0.0050169299356639385, "icl_sequence_length": 78, "num_contexts": 3, "step": 350 }, { "grad_norm": 0.2546197175979614, "learning_rate": 5.3849999999999994e-05, "loss": 0.4989, "step": 360 }, { "gate_value": 0.005079896654933691, "icl_sequence_length": 84, "num_contexts": 3, "step": 360 }, { "grad_norm": 0.4461250901222229, "learning_rate": 5.535e-05, "loss": 0.4993, "step": 370 }, { "gate_value": 0.005180824548006058, "icl_sequence_length": 78, "num_contexts": 3, "step": 370 }, { "grad_norm": 0.11173674464225769, "learning_rate": 5.684999999999999e-05, "loss": 0.4963, "step": 380 }, { "gate_value": 0.005200548097491264, "icl_sequence_length": 86, "num_contexts": 3, "step": 380 }, { "grad_norm": 0.050604645162820816, "learning_rate": 5.8349999999999995e-05, "loss": 0.5173, "step": 390 }, { "gate_value": 0.005443707574158907, "icl_sequence_length": 76, "num_contexts": 3, "step": 390 }, { "grad_norm": 0.23654121160507202, "learning_rate": 5.985e-05, "loss": 0.5035, "step": 400 }, { "gate_value": 0.005672121420502663, "icl_sequence_length": 78, "num_contexts": 3, "step": 400 }, { "grad_norm": 0.854729413986206, "learning_rate": 6.134999999999999e-05, "loss": 0.5289, "step": 410 }, { "gate_value": 0.005770199932157993, "icl_sequence_length": 74, "num_contexts": 3, "step": 410 }, { "grad_norm": 0.31689170002937317, "learning_rate": 6.285e-05, "loss": 0.4897, "step": 420 }, { "gate_value": 0.005838444456458092, "icl_sequence_length": 70, "num_contexts": 3, "step": 420 }, { "grad_norm": 0.13151150941848755, "learning_rate": 6.434999999999999e-05, "loss": 0.5071, "step": 430 }, { "gate_value": 0.006072549149394035, "icl_sequence_length": 82, "num_contexts": 3, "step": 430 }, { "grad_norm": 0.3894720673561096, "learning_rate": 6.584999999999999e-05, "loss": 0.5082, "step": 440 }, { "gate_value": 0.006371940486133099, "icl_sequence_length": 84, "num_contexts": 3, "step": 440 }, { "grad_norm": 1.0877070426940918, "learning_rate": 6.735e-05, "loss": 0.5012, "step": 450 }, { "gate_value": 0.006615795660763979, "icl_sequence_length": 82, "num_contexts": 3, "step": 450 }, { "grad_norm": 1.3406215906143188, "learning_rate": 6.884999999999999e-05, "loss": 0.5193, "step": 460 }, { "gate_value": 0.006942338310182095, "icl_sequence_length": 84, "num_contexts": 3, "step": 460 }, { "grad_norm": 1.0344425439834595, "learning_rate": 7.034999999999999e-05, "loss": 0.4969, "step": 470 }, { "gate_value": 0.007014387287199497, "icl_sequence_length": 86, "num_contexts": 3, "step": 470 }, { "grad_norm": 0.24051937460899353, "learning_rate": 7.184999999999998e-05, "loss": 0.5019, "step": 480 }, { "gate_value": 0.007041481789201498, "icl_sequence_length": 84, "num_contexts": 3, "step": 480 }, { "grad_norm": 0.24314218759536743, "learning_rate": 7.335e-05, "loss": 0.5165, "step": 490 }, { "gate_value": 0.007269471418112516, "icl_sequence_length": 76, "num_contexts": 3, "step": 490 }, { "grad_norm": 0.8075150847434998, "learning_rate": 7.484999999999999e-05, "loss": 0.5024, "step": 500 }, { "gate_value": 0.00759664922952652, "icl_sequence_length": 82, "num_contexts": 3, "step": 500 }, { "grad_norm": 1.4133530855178833, "learning_rate": 7.635e-05, "loss": 0.5059, "step": 510 }, { "gate_value": 0.007728520315140486, "icl_sequence_length": 66, "num_contexts": 3, "step": 510 }, { "grad_norm": 0.4607851505279541, "learning_rate": 7.785e-05, "loss": 0.5165, "step": 520 }, { "gate_value": 0.008264156058430672, "icl_sequence_length": 70, "num_contexts": 3, "step": 520 }, { "grad_norm": 0.3550325930118561, "learning_rate": 7.934999999999999e-05, "loss": 0.5042, "step": 530 }, { "gate_value": 0.008512669242918491, "icl_sequence_length": 74, "num_contexts": 3, "step": 530 }, { "grad_norm": 0.1921839714050293, "learning_rate": 8.085e-05, "loss": 0.4936, "step": 540 }, { "gate_value": 0.008503591641783714, "icl_sequence_length": 74, "num_contexts": 3, "step": 540 }, { "grad_norm": 0.5971819162368774, "learning_rate": 8.235e-05, "loss": 0.5087, "step": 550 }, { "gate_value": 0.008713321760296822, "icl_sequence_length": 84, "num_contexts": 3, "step": 550 }, { "grad_norm": 0.0914035215973854, "learning_rate": 8.385e-05, "loss": 0.4861, "step": 560 }, { "gate_value": 0.009013736620545387, "icl_sequence_length": 76, "num_contexts": 3, "step": 560 }, { "grad_norm": 0.11908543854951859, "learning_rate": 8.534999999999999e-05, "loss": 0.4941, "step": 570 }, { "gate_value": 0.009317001327872276, "icl_sequence_length": 88, "num_contexts": 3, "step": 570 }, { "grad_norm": 0.2162892073392868, "learning_rate": 8.684999999999998e-05, "loss": 0.5106, "step": 580 }, { "gate_value": 0.009594669565558434, "icl_sequence_length": 92, "num_contexts": 3, "step": 580 }, { "grad_norm": 0.7542089223861694, "learning_rate": 8.834999999999999e-05, "loss": 0.5217, "step": 590 }, { "gate_value": 0.009778150357306004, "icl_sequence_length": 84, "num_contexts": 3, "step": 590 }, { "grad_norm": 0.09826211631298065, "learning_rate": 8.984999999999999e-05, "loss": 0.51, "step": 600 }, { "gate_value": 0.010008268058300018, "icl_sequence_length": 82, "num_contexts": 3, "step": 600 }, { "grad_norm": 0.8503474593162537, "learning_rate": 9.134999999999998e-05, "loss": 0.4968, "step": 610 }, { "gate_value": 0.010128550231456757, "icl_sequence_length": 92, "num_contexts": 3, "step": 610 }, { "grad_norm": 1.022375226020813, "learning_rate": 9.285e-05, "loss": 0.5022, "step": 620 }, { "gate_value": 0.010452290996909142, "icl_sequence_length": 78, "num_contexts": 3, "step": 620 }, { "grad_norm": 1.4550477266311646, "learning_rate": 9.434999999999999e-05, "loss": 0.5124, "step": 630 }, { "gate_value": 0.01016128808259964, "icl_sequence_length": 84, "num_contexts": 3, "step": 630 }, { "grad_norm": 0.3279017508029938, "learning_rate": 9.585e-05, "loss": 0.5014, "step": 640 }, { "gate_value": 0.010357659310102463, "icl_sequence_length": 82, "num_contexts": 3, "step": 640 }, { "grad_norm": 0.12235884368419647, "learning_rate": 9.735e-05, "loss": 0.5094, "step": 650 }, { "gate_value": 0.010804083198308945, "icl_sequence_length": 80, "num_contexts": 3, "step": 650 }, { "grad_norm": 0.11897007375955582, "learning_rate": 9.884999999999999e-05, "loss": 0.4895, "step": 660 }, { "gate_value": 0.011202951893210411, "icl_sequence_length": 80, "num_contexts": 3, "step": 660 }, { "grad_norm": 0.2711750268936157, "learning_rate": 0.00010035, "loss": 0.4964, "step": 670 }, { "gate_value": 0.011760725639760494, "icl_sequence_length": 74, "num_contexts": 3, "step": 670 }, { "grad_norm": 0.3249741494655609, "learning_rate": 0.00010185, "loss": 0.4867, "step": 680 }, { "gate_value": 0.012173856608569622, "icl_sequence_length": 76, "num_contexts": 3, "step": 680 }, { "grad_norm": 0.30625203251838684, "learning_rate": 0.00010334999999999998, "loss": 0.5046, "step": 690 }, { "gate_value": 0.01261813659220934, "icl_sequence_length": 86, "num_contexts": 3, "step": 690 }, { "grad_norm": 0.014923992566764355, "learning_rate": 0.00010484999999999999, "loss": 0.5079, "step": 700 }, { "gate_value": 0.012826438061892986, "icl_sequence_length": 74, "num_contexts": 3, "step": 700 }, { "grad_norm": 0.14983655512332916, "learning_rate": 0.00010634999999999998, "loss": 0.5051, "step": 710 }, { "gate_value": 0.012988142669200897, "icl_sequence_length": 86, "num_contexts": 3, "step": 710 }, { "grad_norm": 0.5405359864234924, "learning_rate": 0.00010784999999999999, "loss": 0.4939, "step": 720 }, { "gate_value": 0.013321064412593842, "icl_sequence_length": 64, "num_contexts": 3, "step": 720 }, { "grad_norm": 0.16480578482151031, "learning_rate": 0.00010934999999999999, "loss": 0.487, "step": 730 }, { "gate_value": 0.013345051556825638, "icl_sequence_length": 78, "num_contexts": 3, "step": 730 }, { "grad_norm": 0.0591035857796669, "learning_rate": 0.00011084999999999998, "loss": 0.4962, "step": 740 }, { "gate_value": 0.013593264855444431, "icl_sequence_length": 90, "num_contexts": 3, "step": 740 }, { "grad_norm": 0.20157171785831451, "learning_rate": 0.00011235, "loss": 0.5034, "step": 750 }, { "gate_value": 0.014044249430298805, "icl_sequence_length": 68, "num_contexts": 3, "step": 750 }, { "grad_norm": 0.8499253392219543, "learning_rate": 0.00011384999999999999, "loss": 0.4846, "step": 760 }, { "gate_value": 0.014514867216348648, "icl_sequence_length": 68, "num_contexts": 3, "step": 760 }, { "grad_norm": 0.3363804817199707, "learning_rate": 0.00011535, "loss": 0.492, "step": 770 }, { "gate_value": 0.014975903555750847, "icl_sequence_length": 68, "num_contexts": 3, "step": 770 }, { "grad_norm": 0.4271162152290344, "learning_rate": 0.00011685, "loss": 0.5139, "step": 780 }, { "gate_value": 0.015347362495958805, "icl_sequence_length": 58, "num_contexts": 3, "step": 780 }, { "grad_norm": 0.07361973077058792, "learning_rate": 0.00011834999999999999, "loss": 0.5002, "step": 790 }, { "gate_value": 0.0156552717089653, "icl_sequence_length": 92, "num_contexts": 3, "step": 790 }, { "grad_norm": 0.042372770607471466, "learning_rate": 0.00011985, "loss": 0.4942, "step": 800 }, { "gate_value": 0.01595964841544628, "icl_sequence_length": 72, "num_contexts": 3, "step": 800 }, { "grad_norm": 0.3930019438266754, "learning_rate": 0.00012135, "loss": 0.5074, "step": 810 }, { "gate_value": 0.016275372356176376, "icl_sequence_length": 78, "num_contexts": 3, "step": 810 }, { "grad_norm": 0.2732306718826294, "learning_rate": 0.00012284999999999998, "loss": 0.5001, "step": 820 }, { "gate_value": 0.01652705669403076, "icl_sequence_length": 88, "num_contexts": 3, "step": 820 }, { "grad_norm": 0.06455976516008377, "learning_rate": 0.00012435, "loss": 0.5188, "step": 830 }, { "gate_value": 0.016465678811073303, "icl_sequence_length": 78, "num_contexts": 3, "step": 830 }, { "grad_norm": 0.19495975971221924, "learning_rate": 0.00012585, "loss": 0.4919, "step": 840 }, { "gate_value": 0.016236618161201477, "icl_sequence_length": 92, "num_contexts": 3, "step": 840 }, { "grad_norm": 0.2919803261756897, "learning_rate": 0.00012734999999999998, "loss": 0.5025, "step": 850 }, { "gate_value": 0.016103582456707954, "icl_sequence_length": 72, "num_contexts": 3, "step": 850 }, { "grad_norm": 0.06609172374010086, "learning_rate": 0.00012885, "loss": 0.4796, "step": 860 }, { "gate_value": 0.016391385346651077, "icl_sequence_length": 72, "num_contexts": 3, "step": 860 }, { "grad_norm": 0.08503394573926926, "learning_rate": 0.00013035, "loss": 0.486, "step": 870 }, { "gate_value": 0.016892239451408386, "icl_sequence_length": 76, "num_contexts": 3, "step": 870 }, { "grad_norm": 0.0417107455432415, "learning_rate": 0.00013184999999999998, "loss": 0.5064, "step": 880 }, { "gate_value": 0.01728055067360401, "icl_sequence_length": 66, "num_contexts": 3, "step": 880 }, { "grad_norm": 0.15999619662761688, "learning_rate": 0.00013335, "loss": 0.4913, "step": 890 }, { "gate_value": 0.017700038850307465, "icl_sequence_length": 90, "num_contexts": 3, "step": 890 }, { "grad_norm": 0.3381291627883911, "learning_rate": 0.00013485, "loss": 0.478, "step": 900 }, { "gate_value": 0.017996180802583694, "icl_sequence_length": 96, "num_contexts": 3, "step": 900 }, { "grad_norm": 0.294606477022171, "learning_rate": 0.00013634999999999998, "loss": 0.5028, "step": 910 }, { "gate_value": 0.018202368170022964, "icl_sequence_length": 90, "num_contexts": 3, "step": 910 }, { "grad_norm": 0.34997233748435974, "learning_rate": 0.00013785, "loss": 0.4892, "step": 920 }, { "gate_value": 0.01846320927143097, "icl_sequence_length": 76, "num_contexts": 3, "step": 920 }, { "grad_norm": 0.045470427721738815, "learning_rate": 0.00013935, "loss": 0.4955, "step": 930 }, { "gate_value": 0.019063180312514305, "icl_sequence_length": 78, "num_contexts": 3, "step": 930 }, { "grad_norm": 0.5815165042877197, "learning_rate": 0.00014084999999999998, "loss": 0.5002, "step": 940 }, { "gate_value": 0.019558217376470566, "icl_sequence_length": 82, "num_contexts": 3, "step": 940 }, { "grad_norm": 0.2606048583984375, "learning_rate": 0.00014235, "loss": 0.4999, "step": 950 }, { "gate_value": 0.019863948225975037, "icl_sequence_length": 72, "num_contexts": 3, "step": 950 }, { "grad_norm": 0.17119236290454865, "learning_rate": 0.00014384999999999997, "loss": 0.4776, "step": 960 }, { "gate_value": 0.019959047436714172, "icl_sequence_length": 78, "num_contexts": 3, "step": 960 }, { "grad_norm": 0.34382596611976624, "learning_rate": 0.00014534999999999998, "loss": 0.4833, "step": 970 }, { "gate_value": 0.0202496275305748, "icl_sequence_length": 84, "num_contexts": 3, "step": 970 }, { "grad_norm": 0.35615915060043335, "learning_rate": 0.00014685, "loss": 0.4879, "step": 980 }, { "gate_value": 0.020738650113344193, "icl_sequence_length": 80, "num_contexts": 3, "step": 980 }, { "grad_norm": 1.0342373847961426, "learning_rate": 0.00014834999999999997, "loss": 0.4922, "step": 990 }, { "gate_value": 0.02130601368844509, "icl_sequence_length": 72, "num_contexts": 3, "step": 990 }, { "grad_norm": 0.5985687375068665, "learning_rate": 0.00014984999999999998, "loss": 0.4829, "step": 1000 }, { "gate_value": 0.021172812208533287, "icl_sequence_length": 84, "num_contexts": 3, "step": 1000 }, { "grad_norm": 0.6002101302146912, "learning_rate": 0.00015134999999999997, "loss": 0.4958, "step": 1010 }, { "gate_value": 0.021218104287981987, "icl_sequence_length": 76, "num_contexts": 3, "step": 1010 }, { "grad_norm": 0.041248299181461334, "learning_rate": 0.00015284999999999997, "loss": 0.4881, "step": 1020 }, { "gate_value": 0.021419478580355644, "icl_sequence_length": 80, "num_contexts": 3, "step": 1020 }, { "grad_norm": 0.024791941046714783, "learning_rate": 0.00015434999999999998, "loss": 0.4768, "step": 1030 }, { "gate_value": 0.021730070933699608, "icl_sequence_length": 82, "num_contexts": 3, "step": 1030 }, { "grad_norm": 0.3756122887134552, "learning_rate": 0.00015584999999999997, "loss": 0.4765, "step": 1040 }, { "gate_value": 0.02219421975314617, "icl_sequence_length": 94, "num_contexts": 3, "step": 1040 }, { "grad_norm": 0.10375858843326569, "learning_rate": 0.00015734999999999998, "loss": 0.4935, "step": 1050 }, { "gate_value": 0.02247510477900505, "icl_sequence_length": 66, "num_contexts": 3, "step": 1050 }, { "grad_norm": 0.1254405975341797, "learning_rate": 0.00015884999999999999, "loss": 0.4967, "step": 1060 }, { "gate_value": 0.02249990962445736, "icl_sequence_length": 78, "num_contexts": 3, "step": 1060 }, { "grad_norm": 0.41738972067832947, "learning_rate": 0.00016034999999999997, "loss": 0.4802, "step": 1070 }, { "gate_value": 0.02269868366420269, "icl_sequence_length": 72, "num_contexts": 3, "step": 1070 }, { "grad_norm": 0.28267791867256165, "learning_rate": 0.00016184999999999998, "loss": 0.4799, "step": 1080 }, { "gate_value": 0.023243827745318413, "icl_sequence_length": 74, "num_contexts": 3, "step": 1080 }, { "grad_norm": 0.07439376413822174, "learning_rate": 0.00016334999999999999, "loss": 0.4946, "step": 1090 }, { "gate_value": 0.023553457111120224, "icl_sequence_length": 92, "num_contexts": 3, "step": 1090 }, { "grad_norm": 0.08923624455928802, "learning_rate": 0.00016485, "loss": 0.5008, "step": 1100 }, { "gate_value": 0.02370520494878292, "icl_sequence_length": 90, "num_contexts": 3, "step": 1100 }, { "grad_norm": 0.04910886287689209, "learning_rate": 0.00016634999999999998, "loss": 0.5038, "step": 1110 }, { "gate_value": 0.02379109151661396, "icl_sequence_length": 92, "num_contexts": 3, "step": 1110 }, { "grad_norm": 0.2725479304790497, "learning_rate": 0.00016785, "loss": 0.48, "step": 1120 }, { "gate_value": 0.024141671136021614, "icl_sequence_length": 82, "num_contexts": 3, "step": 1120 }, { "grad_norm": 0.025722775608301163, "learning_rate": 0.00016935, "loss": 0.4886, "step": 1130 }, { "gate_value": 0.024660132825374603, "icl_sequence_length": 66, "num_contexts": 3, "step": 1130 }, { "grad_norm": 0.2205311357975006, "learning_rate": 0.00017084999999999998, "loss": 0.4879, "step": 1140 }, { "gate_value": 0.0249018631875515, "icl_sequence_length": 82, "num_contexts": 3, "step": 1140 }, { "grad_norm": 0.24804756045341492, "learning_rate": 0.00017235, "loss": 0.4851, "step": 1150 }, { "gate_value": 0.024769756942987442, "icl_sequence_length": 72, "num_contexts": 3, "step": 1150 }, { "grad_norm": 0.029389042407274246, "learning_rate": 0.00017385, "loss": 0.4914, "step": 1160 }, { "gate_value": 0.0248092133551836, "icl_sequence_length": 70, "num_contexts": 3, "step": 1160 }, { "grad_norm": 0.4203813076019287, "learning_rate": 0.00017534999999999998, "loss": 0.4618, "step": 1170 }, { "gate_value": 0.025401754304766655, "icl_sequence_length": 82, "num_contexts": 3, "step": 1170 }, { "grad_norm": 0.30038025975227356, "learning_rate": 0.00017685, "loss": 0.4842, "step": 1180 }, { "gate_value": 0.02615329623222351, "icl_sequence_length": 76, "num_contexts": 3, "step": 1180 }, { "grad_norm": 0.05151379108428955, "learning_rate": 0.00017835, "loss": 0.4929, "step": 1190 }, { "gate_value": 0.02670441009104252, "icl_sequence_length": 74, "num_contexts": 3, "step": 1190 }, { "grad_norm": 0.03685954958200455, "learning_rate": 0.00017984999999999998, "loss": 0.4851, "step": 1200 }, { "gate_value": 0.026601877063512802, "icl_sequence_length": 72, "num_contexts": 3, "step": 1200 }, { "grad_norm": 0.15710294246673584, "learning_rate": 0.00018135, "loss": 0.4759, "step": 1210 }, { "gate_value": 0.02667396143078804, "icl_sequence_length": 72, "num_contexts": 3, "step": 1210 }, { "grad_norm": 0.3143344819545746, "learning_rate": 0.00018285, "loss": 0.5068, "step": 1220 }, { "gate_value": 0.02670447900891304, "icl_sequence_length": 84, "num_contexts": 3, "step": 1220 }, { "grad_norm": 0.1960684210062027, "learning_rate": 0.00018435, "loss": 0.4893, "step": 1230 }, { "gate_value": 0.02659144438803196, "icl_sequence_length": 86, "num_contexts": 3, "step": 1230 }, { "grad_norm": 0.05600379407405853, "learning_rate": 0.00018585, "loss": 0.4886, "step": 1240 }, { "gate_value": 0.026692554354667664, "icl_sequence_length": 92, "num_contexts": 3, "step": 1240 }, { "grad_norm": 0.4063480794429779, "learning_rate": 0.00018735, "loss": 0.4997, "step": 1250 }, { "gate_value": 0.027202336117625237, "icl_sequence_length": 86, "num_contexts": 3, "step": 1250 }, { "grad_norm": 0.12925776839256287, "learning_rate": 0.00018884999999999996, "loss": 0.486, "step": 1260 }, { "gate_value": 0.027684736996889114, "icl_sequence_length": 76, "num_contexts": 3, "step": 1260 }, { "grad_norm": 0.1763681322336197, "learning_rate": 0.00019034999999999996, "loss": 0.4925, "step": 1270 }, { "gate_value": 0.028168512508273125, "icl_sequence_length": 90, "num_contexts": 3, "step": 1270 }, { "grad_norm": 0.2327224165201187, "learning_rate": 0.00019184999999999997, "loss": 0.4672, "step": 1280 }, { "gate_value": 0.028568653389811516, "icl_sequence_length": 92, "num_contexts": 3, "step": 1280 }, { "grad_norm": 0.0987858697772026, "learning_rate": 0.00019334999999999998, "loss": 0.4948, "step": 1290 }, { "gate_value": 0.028939686715602875, "icl_sequence_length": 80, "num_contexts": 3, "step": 1290 }, { "grad_norm": 0.4070354998111725, "learning_rate": 0.00019484999999999997, "loss": 0.4959, "step": 1300 }, { "gate_value": 0.028898224234580994, "icl_sequence_length": 68, "num_contexts": 3, "step": 1300 }, { "grad_norm": 0.14461100101470947, "learning_rate": 0.00019634999999999998, "loss": 0.4818, "step": 1310 }, { "gate_value": 0.029195407405495644, "icl_sequence_length": 84, "num_contexts": 3, "step": 1310 }, { "grad_norm": 0.0757102370262146, "learning_rate": 0.00019784999999999998, "loss": 0.4879, "step": 1320 }, { "gate_value": 0.029616594314575195, "icl_sequence_length": 80, "num_contexts": 3, "step": 1320 }, { "grad_norm": 0.07737571746110916, "learning_rate": 0.00019934999999999997, "loss": 0.4633, "step": 1330 }, { "gate_value": 0.03013782575726509, "icl_sequence_length": 68, "num_contexts": 3, "step": 1330 }, { "grad_norm": 0.08557803928852081, "learning_rate": 0.00020084999999999998, "loss": 0.4849, "step": 1340 }, { "gate_value": 0.030645808205008507, "icl_sequence_length": 80, "num_contexts": 3, "step": 1340 }, { "grad_norm": 0.062334273010492325, "learning_rate": 0.00020234999999999999, "loss": 0.4856, "step": 1350 }, { "gate_value": 0.030954955145716667, "icl_sequence_length": 70, "num_contexts": 3, "step": 1350 }, { "grad_norm": 0.10366081446409225, "learning_rate": 0.00020384999999999997, "loss": 0.4857, "step": 1360 }, { "gate_value": 0.030923420563340187, "icl_sequence_length": 90, "num_contexts": 3, "step": 1360 }, { "grad_norm": 0.045627713203430176, "learning_rate": 0.00020534999999999998, "loss": 0.4701, "step": 1370 }, { "gate_value": 0.030991991981863976, "icl_sequence_length": 82, "num_contexts": 3, "step": 1370 }, { "grad_norm": 0.2241339385509491, "learning_rate": 0.00020684999999999999, "loss": 0.4736, "step": 1380 }, { "gate_value": 0.03133467212319374, "icl_sequence_length": 84, "num_contexts": 3, "step": 1380 }, { "grad_norm": 0.29695796966552734, "learning_rate": 0.00020835, "loss": 0.4808, "step": 1390 }, { "gate_value": 0.03150676190853119, "icl_sequence_length": 72, "num_contexts": 3, "step": 1390 }, { "grad_norm": 0.28195545077323914, "learning_rate": 0.00020984999999999998, "loss": 0.496, "step": 1400 }, { "gate_value": 0.03169822692871094, "icl_sequence_length": 66, "num_contexts": 3, "step": 1400 }, { "grad_norm": 0.2775692045688629, "learning_rate": 0.00021135, "loss": 0.4751, "step": 1410 }, { "gate_value": 0.031678296625614166, "icl_sequence_length": 92, "num_contexts": 3, "step": 1410 }, { "grad_norm": 0.2424466758966446, "learning_rate": 0.00021285, "loss": 0.467, "step": 1420 }, { "gate_value": 0.032020535320043564, "icl_sequence_length": 76, "num_contexts": 3, "step": 1420 }, { "grad_norm": 0.22923263907432556, "learning_rate": 0.00021434999999999998, "loss": 0.4701, "step": 1430 }, { "gate_value": 0.03269083425402641, "icl_sequence_length": 74, "num_contexts": 3, "step": 1430 }, { "grad_norm": 0.34625235199928284, "learning_rate": 0.00021585, "loss": 0.471, "step": 1440 }, { "gate_value": 0.033144090324640274, "icl_sequence_length": 78, "num_contexts": 3, "step": 1440 }, { "grad_norm": 0.11075719445943832, "learning_rate": 0.00021735, "loss": 0.4731, "step": 1450 }, { "gate_value": 0.033079419285058975, "icl_sequence_length": 80, "num_contexts": 3, "step": 1450 }, { "grad_norm": 0.12235695123672485, "learning_rate": 0.00021884999999999998, "loss": 0.4775, "step": 1460 }, { "gate_value": 0.032961998134851456, "icl_sequence_length": 76, "num_contexts": 3, "step": 1460 }, { "grad_norm": 0.023144006729125977, "learning_rate": 0.00022035, "loss": 0.4742, "step": 1470 }, { "gate_value": 0.0333966389298439, "icl_sequence_length": 84, "num_contexts": 3, "step": 1470 }, { "grad_norm": 0.09035952389240265, "learning_rate": 0.00022185, "loss": 0.4869, "step": 1480 }, { "gate_value": 0.033850379288196564, "icl_sequence_length": 60, "num_contexts": 3, "step": 1480 }, { "grad_norm": 0.028102407231926918, "learning_rate": 0.00022335, "loss": 0.459, "step": 1490 }, { "gate_value": 0.03416411206126213, "icl_sequence_length": 78, "num_contexts": 3, "step": 1490 }, { "grad_norm": 0.02450348250567913, "learning_rate": 0.00022485, "loss": 0.4591, "step": 1500 }, { "gate_value": 0.034449104219675064, "icl_sequence_length": 72, "num_contexts": 3, "step": 1500 }, { "grad_norm": 0.05895009636878967, "learning_rate": 0.00022634999999999997, "loss": 0.4898, "step": 1510 }, { "gate_value": 0.034758225083351135, "icl_sequence_length": 76, "num_contexts": 3, "step": 1510 }, { "grad_norm": 0.0686354786157608, "learning_rate": 0.00022784999999999995, "loss": 0.4675, "step": 1520 }, { "gate_value": 0.03516167402267456, "icl_sequence_length": 90, "num_contexts": 3, "step": 1520 }, { "grad_norm": 0.028159357607364655, "learning_rate": 0.00022934999999999996, "loss": 0.4851, "step": 1530 }, { "gate_value": 0.03552531823515892, "icl_sequence_length": 72, "num_contexts": 3, "step": 1530 }, { "grad_norm": 0.13609439134597778, "learning_rate": 0.00023084999999999997, "loss": 0.4766, "step": 1540 }, { "gate_value": 0.03593320772051811, "icl_sequence_length": 70, "num_contexts": 3, "step": 1540 }, { "grad_norm": 0.15504056215286255, "learning_rate": 0.00023234999999999998, "loss": 0.4619, "step": 1550 }, { "gate_value": 0.03641683608293533, "icl_sequence_length": 84, "num_contexts": 3, "step": 1550 }, { "grad_norm": 0.040739111602306366, "learning_rate": 0.00023384999999999997, "loss": 0.4636, "step": 1560 }, { "gate_value": 0.03673742339015007, "icl_sequence_length": 68, "num_contexts": 3, "step": 1560 }, { "grad_norm": 0.14744669198989868, "learning_rate": 0.00023534999999999997, "loss": 0.4955, "step": 1570 }, { "gate_value": 0.036518871784210205, "icl_sequence_length": 90, "num_contexts": 3, "step": 1570 }, { "grad_norm": 0.20779630541801453, "learning_rate": 0.00023684999999999998, "loss": 0.4931, "step": 1580 }, { "gate_value": 0.03666940703988075, "icl_sequence_length": 86, "num_contexts": 3, "step": 1580 }, { "grad_norm": 0.02847031131386757, "learning_rate": 0.00023834999999999997, "loss": 0.4923, "step": 1590 }, { "gate_value": 0.03700125217437744, "icl_sequence_length": 70, "num_contexts": 3, "step": 1590 }, { "grad_norm": 0.16238997876644135, "learning_rate": 0.00023984999999999998, "loss": 0.4749, "step": 1600 }, { "gate_value": 0.03693028539419174, "icl_sequence_length": 72, "num_contexts": 3, "step": 1600 }, { "grad_norm": 0.04399807006120682, "learning_rate": 0.00024134999999999998, "loss": 0.4827, "step": 1610 }, { "gate_value": 0.03712141513824463, "icl_sequence_length": 72, "num_contexts": 3, "step": 1610 }, { "grad_norm": 0.07293123006820679, "learning_rate": 0.00024284999999999997, "loss": 0.4892, "step": 1620 }, { "gate_value": 0.0372476652264595, "icl_sequence_length": 70, "num_contexts": 3, "step": 1620 }, { "grad_norm": 0.14840328693389893, "learning_rate": 0.00024435, "loss": 0.4627, "step": 1630 }, { "gate_value": 0.037470173090696335, "icl_sequence_length": 70, "num_contexts": 3, "step": 1630 }, { "grad_norm": 0.13052290678024292, "learning_rate": 0.00024585, "loss": 0.4734, "step": 1640 }, { "gate_value": 0.03787413239479065, "icl_sequence_length": 74, "num_contexts": 3, "step": 1640 }, { "grad_norm": 0.04076918587088585, "learning_rate": 0.00024734999999999997, "loss": 0.485, "step": 1650 }, { "gate_value": 0.03815087303519249, "icl_sequence_length": 78, "num_contexts": 3, "step": 1650 }, { "grad_norm": 0.2749229669570923, "learning_rate": 0.00024885, "loss": 0.457, "step": 1660 }, { "gate_value": 0.03850769251585007, "icl_sequence_length": 82, "num_contexts": 3, "step": 1660 }, { "grad_norm": 0.2708996534347534, "learning_rate": 0.00025035, "loss": 0.4775, "step": 1670 }, { "gate_value": 0.03835766762495041, "icl_sequence_length": 78, "num_contexts": 3, "step": 1670 }, { "grad_norm": 0.08414936065673828, "learning_rate": 0.00025184999999999997, "loss": 0.4748, "step": 1680 }, { "gate_value": 0.03850135579705238, "icl_sequence_length": 80, "num_contexts": 3, "step": 1680 }, { "grad_norm": 0.04802856966853142, "learning_rate": 0.00025335, "loss": 0.4668, "step": 1690 }, { "gate_value": 0.03900991007685661, "icl_sequence_length": 88, "num_contexts": 3, "step": 1690 }, { "grad_norm": 0.24531228840351105, "learning_rate": 0.00025485, "loss": 0.4846, "step": 1700 }, { "gate_value": 0.03973180428147316, "icl_sequence_length": 74, "num_contexts": 3, "step": 1700 }, { "grad_norm": 0.08588869124650955, "learning_rate": 0.00025634999999999997, "loss": 0.4847, "step": 1710 }, { "gate_value": 0.0403984971344471, "icl_sequence_length": 64, "num_contexts": 3, "step": 1710 }, { "grad_norm": 0.2342216819524765, "learning_rate": 0.00025785, "loss": 0.4727, "step": 1720 }, { "gate_value": 0.04085838794708252, "icl_sequence_length": 80, "num_contexts": 3, "step": 1720 }, { "grad_norm": 0.06164858862757683, "learning_rate": 0.00025935, "loss": 0.483, "step": 1730 }, { "gate_value": 0.04160254821181297, "icl_sequence_length": 66, "num_contexts": 3, "step": 1730 }, { "grad_norm": 0.10281935334205627, "learning_rate": 0.00026084999999999997, "loss": 0.4664, "step": 1740 }, { "gate_value": 0.04127310961484909, "icl_sequence_length": 82, "num_contexts": 3, "step": 1740 }, { "grad_norm": 0.18388307094573975, "learning_rate": 0.00026235, "loss": 0.4757, "step": 1750 }, { "gate_value": 0.041262272745370865, "icl_sequence_length": 78, "num_contexts": 3, "step": 1750 }, { "grad_norm": 0.08633152395486832, "learning_rate": 0.00026384999999999994, "loss": 0.4685, "step": 1760 }, { "gate_value": 0.041280150413513184, "icl_sequence_length": 88, "num_contexts": 3, "step": 1760 }, { "grad_norm": 0.05839018523693085, "learning_rate": 0.00026534999999999997, "loss": 0.5055, "step": 1770 }, { "gate_value": 0.0408598892390728, "icl_sequence_length": 90, "num_contexts": 3, "step": 1770 }, { "grad_norm": 0.1357850283384323, "learning_rate": 0.00026684999999999995, "loss": 0.4611, "step": 1780 }, { "gate_value": 0.04074737802147865, "icl_sequence_length": 58, "num_contexts": 3, "step": 1780 }, { "grad_norm": 0.26462557911872864, "learning_rate": 0.00026835, "loss": 0.4787, "step": 1790 }, { "gate_value": 0.04107224568724632, "icl_sequence_length": 74, "num_contexts": 3, "step": 1790 }, { "grad_norm": 0.11214753240346909, "learning_rate": 0.00026984999999999997, "loss": 0.4654, "step": 1800 }, { "gate_value": 0.04110245779156685, "icl_sequence_length": 84, "num_contexts": 3, "step": 1800 }, { "grad_norm": 0.095204658806324, "learning_rate": 0.00027134999999999995, "loss": 0.4927, "step": 1810 }, { "gate_value": 0.04148384928703308, "icl_sequence_length": 84, "num_contexts": 3, "step": 1810 }, { "grad_norm": 0.22705064713954926, "learning_rate": 0.00027285, "loss": 0.4727, "step": 1820 }, { "gate_value": 0.04165264591574669, "icl_sequence_length": 90, "num_contexts": 3, "step": 1820 }, { "grad_norm": 0.051133785396814346, "learning_rate": 0.00027435, "loss": 0.4743, "step": 1830 }, { "gate_value": 0.041887927800416946, "icl_sequence_length": 76, "num_contexts": 3, "step": 1830 }, { "grad_norm": 0.024134185165166855, "learning_rate": 0.00027584999999999996, "loss": 0.4624, "step": 1840 }, { "gate_value": 0.04222255200147629, "icl_sequence_length": 86, "num_contexts": 3, "step": 1840 }, { "grad_norm": 0.06055133417248726, "learning_rate": 0.00027735, "loss": 0.4866, "step": 1850 }, { "gate_value": 0.04211531952023506, "icl_sequence_length": 74, "num_contexts": 3, "step": 1850 }, { "grad_norm": 0.10934657603502274, "learning_rate": 0.00027885, "loss": 0.4649, "step": 1860 }, { "gate_value": 0.042022984474897385, "icl_sequence_length": 60, "num_contexts": 3, "step": 1860 }, { "grad_norm": 0.1160777285695076, "learning_rate": 0.00028034999999999996, "loss": 0.4423, "step": 1870 }, { "gate_value": 0.04227515682578087, "icl_sequence_length": 70, "num_contexts": 3, "step": 1870 }, { "grad_norm": 0.12855137884616852, "learning_rate": 0.00028185, "loss": 0.4899, "step": 1880 }, { "gate_value": 0.04226217046380043, "icl_sequence_length": 80, "num_contexts": 3, "step": 1880 }, { "grad_norm": 0.05965856835246086, "learning_rate": 0.00028335, "loss": 0.4737, "step": 1890 }, { "gate_value": 0.041885219514369965, "icl_sequence_length": 72, "num_contexts": 3, "step": 1890 }, { "grad_norm": 0.19230695068836212, "learning_rate": 0.00028484999999999996, "loss": 0.4779, "step": 1900 }, { "gate_value": 0.04192354902625084, "icl_sequence_length": 88, "num_contexts": 3, "step": 1900 }, { "grad_norm": 0.05444691330194473, "learning_rate": 0.00028635, "loss": 0.4896, "step": 1910 }, { "gate_value": 0.04169577360153198, "icl_sequence_length": 90, "num_contexts": 3, "step": 1910 }, { "grad_norm": 0.030353045091032982, "learning_rate": 0.00028785, "loss": 0.4935, "step": 1920 }, { "gate_value": 0.04207930713891983, "icl_sequence_length": 80, "num_contexts": 3, "step": 1920 }, { "grad_norm": 0.15619803965091705, "learning_rate": 0.00028934999999999996, "loss": 0.4779, "step": 1930 }, { "gate_value": 0.04228321462869644, "icl_sequence_length": 94, "num_contexts": 3, "step": 1930 }, { "grad_norm": 0.11080749332904816, "learning_rate": 0.00029085, "loss": 0.4513, "step": 1940 }, { "gate_value": 0.0428403876721859, "icl_sequence_length": 80, "num_contexts": 3, "step": 1940 }, { "grad_norm": 0.1745726615190506, "learning_rate": 0.00029235, "loss": 0.4838, "step": 1950 }, { "gate_value": 0.04332621768116951, "icl_sequence_length": 64, "num_contexts": 3, "step": 1950 }, { "grad_norm": 0.11912817507982254, "learning_rate": 0.00029384999999999996, "loss": 0.4669, "step": 1960 }, { "gate_value": 0.0436142161488533, "icl_sequence_length": 78, "num_contexts": 3, "step": 1960 }, { "grad_norm": 0.05294900760054588, "learning_rate": 0.00029535, "loss": 0.4674, "step": 1970 }, { "gate_value": 0.0439755953848362, "icl_sequence_length": 76, "num_contexts": 3, "step": 1970 }, { "grad_norm": 0.03383156657218933, "learning_rate": 0.00029685, "loss": 0.4676, "step": 1980 }, { "gate_value": 0.0445464663207531, "icl_sequence_length": 78, "num_contexts": 3, "step": 1980 }, { "grad_norm": 0.14340122044086456, "learning_rate": 0.00029835, "loss": 0.484, "step": 1990 }, { "gate_value": 0.044701505452394485, "icl_sequence_length": 78, "num_contexts": 3, "step": 1990 }, { "grad_norm": 0.1341182142496109, "learning_rate": 0.00029985, "loss": 0.4739, "step": 2000 }, { "gate_value": 0.044620223343372345, "icl_sequence_length": 92, "num_contexts": 3, "step": 2000 }, { "grad_norm": 0.15212862193584442, "learning_rate": 0.00029999995847794736, "loss": 0.4748, "step": 2010 }, { "gate_value": 0.0448896661400795, "icl_sequence_length": 82, "num_contexts": 3, "step": 2010 }, { "grad_norm": 0.1537584662437439, "learning_rate": 0.0002999998149449555, "loss": 0.4752, "step": 2020 }, { "gate_value": 0.045422472059726715, "icl_sequence_length": 80, "num_contexts": 3, "step": 2020 }, { "grad_norm": 0.07360043376684189, "learning_rate": 0.0002999995688885045, "loss": 0.4605, "step": 2030 }, { "gate_value": 0.04550457373261452, "icl_sequence_length": 64, "num_contexts": 3, "step": 2030 }, { "grad_norm": 0.030054304748773575, "learning_rate": 0.0002999992203087627, "loss": 0.4835, "step": 2040 }, { "gate_value": 0.04544057324528694, "icl_sequence_length": 56, "num_contexts": 3, "step": 2040 }, { "grad_norm": 0.03757965564727783, "learning_rate": 0.00029999876920596807, "loss": 0.4776, "step": 2050 }, { "gate_value": 0.04557052254676819, "icl_sequence_length": 90, "num_contexts": 3, "step": 2050 }, { "grad_norm": 0.2675529420375824, "learning_rate": 0.0002999982155804292, "loss": 0.4698, "step": 2060 }, { "gate_value": 0.046622686088085175, "icl_sequence_length": 92, "num_contexts": 3, "step": 2060 }, { "grad_norm": 0.26907795667648315, "learning_rate": 0.0002999975594325243, "loss": 0.4853, "step": 2070 }, { "gate_value": 0.04718781262636185, "icl_sequence_length": 72, "num_contexts": 3, "step": 2070 }, { "grad_norm": 0.07856488227844238, "learning_rate": 0.00029999680076270204, "loss": 0.4859, "step": 2080 }, { "gate_value": 0.04703153297305107, "icl_sequence_length": 86, "num_contexts": 3, "step": 2080 }, { "grad_norm": 0.12983615696430206, "learning_rate": 0.00029999593957148073, "loss": 0.4663, "step": 2090 }, { "gate_value": 0.046463314443826675, "icl_sequence_length": 80, "num_contexts": 3, "step": 2090 }, { "grad_norm": 0.08406257629394531, "learning_rate": 0.00029999497585944917, "loss": 0.4668, "step": 2100 }, { "gate_value": 0.046239107847213745, "icl_sequence_length": 82, "num_contexts": 3, "step": 2100 }, { "grad_norm": 0.0722755640745163, "learning_rate": 0.0002999939096272659, "loss": 0.4527, "step": 2110 }, { "gate_value": 0.04589474946260452, "icl_sequence_length": 80, "num_contexts": 3, "step": 2110 }, { "grad_norm": 0.18733340501785278, "learning_rate": 0.0002999927408756598, "loss": 0.4601, "step": 2120 }, { "gate_value": 0.04604862630367279, "icl_sequence_length": 90, "num_contexts": 3, "step": 2120 }, { "grad_norm": 0.032498251646757126, "learning_rate": 0.0002999914696054297, "loss": 0.4585, "step": 2130 }, { "gate_value": 0.047017332166433334, "icl_sequence_length": 88, "num_contexts": 3, "step": 2130 }, { "grad_norm": 0.04576157405972481, "learning_rate": 0.0002999900958174444, "loss": 0.476, "step": 2140 }, { "gate_value": 0.047887593507766724, "icl_sequence_length": 84, "num_contexts": 3, "step": 2140 }, { "grad_norm": 0.034247320145368576, "learning_rate": 0.00029998861951264296, "loss": 0.4756, "step": 2150 }, { "gate_value": 0.04841304570436478, "icl_sequence_length": 82, "num_contexts": 3, "step": 2150 }, { "grad_norm": 0.022174621000885963, "learning_rate": 0.00029998704069203436, "loss": 0.4666, "step": 2160 }, { "gate_value": 0.048587359488010406, "icl_sequence_length": 78, "num_contexts": 3, "step": 2160 }, { "grad_norm": 0.0923171117901802, "learning_rate": 0.0002999853593566978, "loss": 0.4671, "step": 2170 }, { "gate_value": 0.04884525388479233, "icl_sequence_length": 86, "num_contexts": 3, "step": 2170 }, { "grad_norm": 0.026745932176709175, "learning_rate": 0.00029998357550778236, "loss": 0.4598, "step": 2180 }, { "gate_value": 0.04901081696152687, "icl_sequence_length": 74, "num_contexts": 3, "step": 2180 }, { "grad_norm": 0.029031245037913322, "learning_rate": 0.00029998168914650733, "loss": 0.4664, "step": 2190 }, { "gate_value": 0.049416057765483856, "icl_sequence_length": 76, "num_contexts": 3, "step": 2190 }, { "grad_norm": 0.11745218932628632, "learning_rate": 0.000299979700274162, "loss": 0.4619, "step": 2200 }, { "gate_value": 0.04974166676402092, "icl_sequence_length": 82, "num_contexts": 3, "step": 2200 }, { "grad_norm": 0.18687647581100464, "learning_rate": 0.0002999776088921058, "loss": 0.4771, "step": 2210 }, { "gate_value": 0.04972897097468376, "icl_sequence_length": 84, "num_contexts": 3, "step": 2210 }, { "grad_norm": 0.2829289436340332, "learning_rate": 0.00029997541500176804, "loss": 0.4785, "step": 2220 }, { "gate_value": 0.04986254498362541, "icl_sequence_length": 90, "num_contexts": 3, "step": 2220 }, { "grad_norm": 0.12327824532985687, "learning_rate": 0.0002999731186046484, "loss": 0.4658, "step": 2230 }, { "gate_value": 0.05015156418085098, "icl_sequence_length": 68, "num_contexts": 3, "step": 2230 }, { "grad_norm": 0.07140158116817474, "learning_rate": 0.00029997071970231623, "loss": 0.4648, "step": 2240 }, { "gate_value": 0.04988813400268555, "icl_sequence_length": 80, "num_contexts": 3, "step": 2240 }, { "grad_norm": 0.10163920372724533, "learning_rate": 0.0002999682182964114, "loss": 0.4697, "step": 2250 }, { "gate_value": 0.04962354525923729, "icl_sequence_length": 90, "num_contexts": 3, "step": 2250 }, { "grad_norm": 0.2764798402786255, "learning_rate": 0.00029996561438864344, "loss": 0.4698, "step": 2260 }, { "gate_value": 0.04965106397867203, "icl_sequence_length": 74, "num_contexts": 3, "step": 2260 }, { "grad_norm": 0.024615973234176636, "learning_rate": 0.00029996290798079214, "loss": 0.4581, "step": 2270 }, { "gate_value": 0.049827586859464645, "icl_sequence_length": 88, "num_contexts": 3, "step": 2270 }, { "grad_norm": 0.1376759111881256, "learning_rate": 0.0002999600990747073, "loss": 0.4773, "step": 2280 }, { "gate_value": 0.04991050437092781, "icl_sequence_length": 72, "num_contexts": 3, "step": 2280 }, { "grad_norm": 0.04156076908111572, "learning_rate": 0.0002999571876723088, "loss": 0.4807, "step": 2290 }, { "gate_value": 0.049750544130802155, "icl_sequence_length": 74, "num_contexts": 3, "step": 2290 }, { "grad_norm": 0.10670457035303116, "learning_rate": 0.00029995417377558654, "loss": 0.469, "step": 2300 }, { "gate_value": 0.0495314821600914, "icl_sequence_length": 72, "num_contexts": 3, "step": 2300 }, { "grad_norm": 0.1581500917673111, "learning_rate": 0.0002999510573866005, "loss": 0.4652, "step": 2310 }, { "gate_value": 0.0497625507414341, "icl_sequence_length": 70, "num_contexts": 3, "step": 2310 }, { "grad_norm": 0.0921480730175972, "learning_rate": 0.00029994783850748063, "loss": 0.4724, "step": 2320 }, { "gate_value": 0.0499161072075367, "icl_sequence_length": 74, "num_contexts": 3, "step": 2320 }, { "grad_norm": 0.07795640826225281, "learning_rate": 0.00029994451714042707, "loss": 0.4849, "step": 2330 }, { "gate_value": 0.05015277490019798, "icl_sequence_length": 78, "num_contexts": 3, "step": 2330 }, { "grad_norm": 0.03227244317531586, "learning_rate": 0.00029994109328770993, "loss": 0.4723, "step": 2340 }, { "gate_value": 0.050364185124635696, "icl_sequence_length": 72, "num_contexts": 3, "step": 2340 }, { "grad_norm": 0.43648087978363037, "learning_rate": 0.00029993756695166943, "loss": 0.4874, "step": 2350 }, { "gate_value": 0.05060106888413429, "icl_sequence_length": 92, "num_contexts": 3, "step": 2350 }, { "grad_norm": 0.10248932242393494, "learning_rate": 0.00029993393813471575, "loss": 0.4489, "step": 2360 }, { "gate_value": 0.05064888298511505, "icl_sequence_length": 90, "num_contexts": 3, "step": 2360 }, { "grad_norm": 0.20501983165740967, "learning_rate": 0.0002999302068393291, "loss": 0.4513, "step": 2370 }, { "gate_value": 0.05121821165084839, "icl_sequence_length": 78, "num_contexts": 3, "step": 2370 }, { "grad_norm": 0.22547470033168793, "learning_rate": 0.0002999263730680599, "loss": 0.4572, "step": 2380 }, { "gate_value": 0.052095528692007065, "icl_sequence_length": 76, "num_contexts": 3, "step": 2380 }, { "grad_norm": 0.12664879858493805, "learning_rate": 0.0002999224368235284, "loss": 0.4546, "step": 2390 }, { "gate_value": 0.051816221326589584, "icl_sequence_length": 90, "num_contexts": 3, "step": 2390 }, { "grad_norm": 0.02900230884552002, "learning_rate": 0.000299918398108425, "loss": 0.4522, "step": 2400 }, { "gate_value": 0.05174678564071655, "icl_sequence_length": 84, "num_contexts": 3, "step": 2400 }, { "grad_norm": 0.06299610435962677, "learning_rate": 0.00029991425692551014, "loss": 0.4706, "step": 2410 }, { "gate_value": 0.05185849592089653, "icl_sequence_length": 74, "num_contexts": 3, "step": 2410 }, { "grad_norm": 0.20941582322120667, "learning_rate": 0.00029991001327761427, "loss": 0.4686, "step": 2420 }, { "gate_value": 0.05203567072749138, "icl_sequence_length": 88, "num_contexts": 3, "step": 2420 }, { "grad_norm": 0.08857905119657516, "learning_rate": 0.00029990566716763797, "loss": 0.4835, "step": 2430 }, { "gate_value": 0.05252448096871376, "icl_sequence_length": 88, "num_contexts": 3, "step": 2430 }, { "grad_norm": 0.023652268573641777, "learning_rate": 0.0002999012185985516, "loss": 0.48, "step": 2440 }, { "gate_value": 0.05304626375436783, "icl_sequence_length": 80, "num_contexts": 3, "step": 2440 }, { "grad_norm": 0.06863145530223846, "learning_rate": 0.0002998966675733958, "loss": 0.4819, "step": 2450 }, { "gate_value": 0.05308978632092476, "icl_sequence_length": 64, "num_contexts": 3, "step": 2450 }, { "grad_norm": 0.09551398456096649, "learning_rate": 0.0002998920140952812, "loss": 0.4741, "step": 2460 }, { "gate_value": 0.05287903547286987, "icl_sequence_length": 86, "num_contexts": 3, "step": 2460 }, { "grad_norm": 0.17554223537445068, "learning_rate": 0.00029988725816738833, "loss": 0.4555, "step": 2470 }, { "gate_value": 0.053737007081508636, "icl_sequence_length": 94, "num_contexts": 3, "step": 2470 }, { "grad_norm": 0.13025638461112976, "learning_rate": 0.00029988239979296784, "loss": 0.4517, "step": 2480 }, { "gate_value": 0.05436123162508011, "icl_sequence_length": 80, "num_contexts": 3, "step": 2480 }, { "grad_norm": 0.07448191940784454, "learning_rate": 0.00029987743897534044, "loss": 0.4611, "step": 2490 }, { "gate_value": 0.054754164069890976, "icl_sequence_length": 84, "num_contexts": 3, "step": 2490 }, { "grad_norm": 0.353579044342041, "learning_rate": 0.00029987237571789675, "loss": 0.472, "step": 2500 }, { "gate_value": 0.05532870441675186, "icl_sequence_length": 86, "num_contexts": 3, "step": 2500 }, { "grad_norm": 0.10554645210504532, "learning_rate": 0.0002998672100240975, "loss": 0.4759, "step": 2510 }, { "gate_value": 0.055256184190511703, "icl_sequence_length": 70, "num_contexts": 3, "step": 2510 }, { "grad_norm": 0.08449801802635193, "learning_rate": 0.00029986194189747333, "loss": 0.4543, "step": 2520 }, { "gate_value": 0.054673366248607635, "icl_sequence_length": 80, "num_contexts": 3, "step": 2520 }, { "grad_norm": 0.06385741382837296, "learning_rate": 0.000299856571341625, "loss": 0.4588, "step": 2530 }, { "gate_value": 0.05435393378138542, "icl_sequence_length": 80, "num_contexts": 3, "step": 2530 }, { "grad_norm": 0.12640225887298584, "learning_rate": 0.00029985109836022314, "loss": 0.4553, "step": 2540 }, { "gate_value": 0.05447719991207123, "icl_sequence_length": 70, "num_contexts": 3, "step": 2540 }, { "grad_norm": 0.04395339637994766, "learning_rate": 0.00029984552295700867, "loss": 0.4685, "step": 2550 }, { "gate_value": 0.05475514754652977, "icl_sequence_length": 64, "num_contexts": 3, "step": 2550 }, { "grad_norm": 0.06887280941009521, "learning_rate": 0.0002998398451357921, "loss": 0.4424, "step": 2560 }, { "gate_value": 0.055225860327482224, "icl_sequence_length": 84, "num_contexts": 3, "step": 2560 }, { "grad_norm": 0.2743040919303894, "learning_rate": 0.00029983406490045444, "loss": 0.458, "step": 2570 }, { "gate_value": 0.05573255196213722, "icl_sequence_length": 82, "num_contexts": 3, "step": 2570 }, { "grad_norm": 0.1339769959449768, "learning_rate": 0.0002998281822549462, "loss": 0.4706, "step": 2580 }, { "gate_value": 0.056551918387413025, "icl_sequence_length": 94, "num_contexts": 3, "step": 2580 }, { "grad_norm": 0.09543494880199432, "learning_rate": 0.00029982219720328814, "loss": 0.456, "step": 2590 }, { "gate_value": 0.05700315535068512, "icl_sequence_length": 64, "num_contexts": 3, "step": 2590 }, { "grad_norm": 0.17091640830039978, "learning_rate": 0.0002998161097495711, "loss": 0.4662, "step": 2600 }, { "gate_value": 0.05705432966351509, "icl_sequence_length": 90, "num_contexts": 3, "step": 2600 }, { "grad_norm": 0.25809577107429504, "learning_rate": 0.00029980991989795566, "loss": 0.4694, "step": 2610 }, { "gate_value": 0.057157181203365326, "icl_sequence_length": 80, "num_contexts": 3, "step": 2610 }, { "grad_norm": 0.06581462919712067, "learning_rate": 0.00029980362765267264, "loss": 0.4877, "step": 2620 }, { "gate_value": 0.05712047219276428, "icl_sequence_length": 86, "num_contexts": 3, "step": 2620 }, { "grad_norm": 0.13200043141841888, "learning_rate": 0.00029979723301802266, "loss": 0.4728, "step": 2630 }, { "gate_value": 0.057480890303850174, "icl_sequence_length": 70, "num_contexts": 3, "step": 2630 }, { "grad_norm": 0.1044127494096756, "learning_rate": 0.0002997907359983764, "loss": 0.4721, "step": 2640 }, { "gate_value": 0.057946741580963135, "icl_sequence_length": 76, "num_contexts": 3, "step": 2640 }, { "grad_norm": 0.036862898617982864, "learning_rate": 0.00029978413659817455, "loss": 0.4665, "step": 2650 }, { "gate_value": 0.05749443545937538, "icl_sequence_length": 70, "num_contexts": 3, "step": 2650 }, { "grad_norm": 0.16739346086978912, "learning_rate": 0.00029977743482192774, "loss": 0.465, "step": 2660 }, { "gate_value": 0.0574820339679718, "icl_sequence_length": 90, "num_contexts": 3, "step": 2660 }, { "grad_norm": 0.06671813130378723, "learning_rate": 0.0002997706306742165, "loss": 0.471, "step": 2670 }, { "gate_value": 0.0580253079533577, "icl_sequence_length": 68, "num_contexts": 3, "step": 2670 }, { "grad_norm": 0.06606713682413101, "learning_rate": 0.0002997637241596915, "loss": 0.4842, "step": 2680 }, { "gate_value": 0.058306340128183365, "icl_sequence_length": 74, "num_contexts": 3, "step": 2680 }, { "grad_norm": 0.1716826856136322, "learning_rate": 0.0002997567152830732, "loss": 0.4661, "step": 2690 }, { "gate_value": 0.058256857097148895, "icl_sequence_length": 84, "num_contexts": 3, "step": 2690 }, { "grad_norm": 0.028431877493858337, "learning_rate": 0.0002997496040491521, "loss": 0.4693, "step": 2700 }, { "gate_value": 0.05827565863728523, "icl_sequence_length": 94, "num_contexts": 3, "step": 2700 }, { "grad_norm": 0.09539343416690826, "learning_rate": 0.0002997423904627887, "loss": 0.456, "step": 2710 }, { "gate_value": 0.05831799656152725, "icl_sequence_length": 90, "num_contexts": 3, "step": 2710 }, { "grad_norm": 0.03917115554213524, "learning_rate": 0.0002997350745289134, "loss": 0.471, "step": 2720 }, { "gate_value": 0.058655787259340286, "icl_sequence_length": 86, "num_contexts": 3, "step": 2720 }, { "grad_norm": 0.17695604264736176, "learning_rate": 0.0002997276562525266, "loss": 0.4678, "step": 2730 }, { "gate_value": 0.059178613126277924, "icl_sequence_length": 78, "num_contexts": 3, "step": 2730 }, { "grad_norm": 0.07515741884708405, "learning_rate": 0.00029972013563869863, "loss": 0.475, "step": 2740 }, { "gate_value": 0.059215329587459564, "icl_sequence_length": 76, "num_contexts": 3, "step": 2740 }, { "grad_norm": 0.07590952515602112, "learning_rate": 0.00029971251269256965, "loss": 0.4602, "step": 2750 }, { "gate_value": 0.05963375046849251, "icl_sequence_length": 82, "num_contexts": 3, "step": 2750 }, { "grad_norm": 0.030797870829701424, "learning_rate": 0.00029970478741934997, "loss": 0.448, "step": 2760 }, { "gate_value": 0.060271523892879486, "icl_sequence_length": 90, "num_contexts": 3, "step": 2760 }, { "grad_norm": 0.03239508345723152, "learning_rate": 0.00029969695982431975, "loss": 0.4738, "step": 2770 }, { "gate_value": 0.06078481674194336, "icl_sequence_length": 66, "num_contexts": 3, "step": 2770 }, { "grad_norm": 0.05549526959657669, "learning_rate": 0.000299689029912829, "loss": 0.4734, "step": 2780 }, { "gate_value": 0.06084667146205902, "icl_sequence_length": 58, "num_contexts": 3, "step": 2780 }, { "grad_norm": 0.03209852799773216, "learning_rate": 0.00029968099769029787, "loss": 0.4521, "step": 2790 }, { "gate_value": 0.06139722093939781, "icl_sequence_length": 66, "num_contexts": 3, "step": 2790 }, { "grad_norm": 0.050206031650304794, "learning_rate": 0.00029967286316221614, "loss": 0.4769, "step": 2800 }, { "gate_value": 0.06171596422791481, "icl_sequence_length": 82, "num_contexts": 3, "step": 2800 }, { "grad_norm": 0.4463041126728058, "learning_rate": 0.00029966462633414383, "loss": 0.463, "step": 2810 }, { "gate_value": 0.061615847051143646, "icl_sequence_length": 88, "num_contexts": 3, "step": 2810 }, { "grad_norm": 0.04375645890831947, "learning_rate": 0.0002996562872117106, "loss": 0.4443, "step": 2820 }, { "gate_value": 0.061395056545734406, "icl_sequence_length": 76, "num_contexts": 3, "step": 2820 }, { "grad_norm": 0.08162671327590942, "learning_rate": 0.00029964784580061634, "loss": 0.4716, "step": 2830 }, { "gate_value": 0.06112854182720184, "icl_sequence_length": 78, "num_contexts": 3, "step": 2830 }, { "grad_norm": 0.029063262045383453, "learning_rate": 0.0002996393021066305, "loss": 0.4406, "step": 2840 }, { "gate_value": 0.061373159289360046, "icl_sequence_length": 80, "num_contexts": 3, "step": 2840 }, { "grad_norm": 0.2159145176410675, "learning_rate": 0.0002996306561355927, "loss": 0.4558, "step": 2850 }, { "gate_value": 0.061609722673892975, "icl_sequence_length": 78, "num_contexts": 3, "step": 2850 }, { "grad_norm": 0.1503428816795349, "learning_rate": 0.00029962190789341233, "loss": 0.4751, "step": 2860 }, { "gate_value": 0.06192683055996895, "icl_sequence_length": 82, "num_contexts": 3, "step": 2860 }, { "grad_norm": 0.03672568127512932, "learning_rate": 0.00029961305738606883, "loss": 0.4424, "step": 2870 }, { "gate_value": 0.06256543844938278, "icl_sequence_length": 82, "num_contexts": 3, "step": 2870 }, { "grad_norm": 0.1288597583770752, "learning_rate": 0.00029960410461961134, "loss": 0.4569, "step": 2880 }, { "gate_value": 0.06346543878316879, "icl_sequence_length": 70, "num_contexts": 3, "step": 2880 }, { "grad_norm": 0.14279378950595856, "learning_rate": 0.00029959504960015904, "loss": 0.4591, "step": 2890 }, { "gate_value": 0.06372940540313721, "icl_sequence_length": 86, "num_contexts": 3, "step": 2890 }, { "grad_norm": 0.09644819051027298, "learning_rate": 0.0002995858923339009, "loss": 0.4624, "step": 2900 }, { "gate_value": 0.0635669156908989, "icl_sequence_length": 84, "num_contexts": 3, "step": 2900 }, { "grad_norm": 0.0700480043888092, "learning_rate": 0.00029957663282709587, "loss": 0.44, "step": 2910 }, { "gate_value": 0.06419818103313446, "icl_sequence_length": 76, "num_contexts": 3, "step": 2910 }, { "grad_norm": 0.14946900308132172, "learning_rate": 0.00029956727108607274, "loss": 0.4672, "step": 2920 }, { "gate_value": 0.06462504714727402, "icl_sequence_length": 80, "num_contexts": 3, "step": 2920 }, { "grad_norm": 0.0538860447704792, "learning_rate": 0.0002995578071172302, "loss": 0.4541, "step": 2930 }, { "gate_value": 0.06455416232347488, "icl_sequence_length": 72, "num_contexts": 3, "step": 2930 }, { "grad_norm": 0.03887515142560005, "learning_rate": 0.0002995482409270367, "loss": 0.4544, "step": 2940 }, { "gate_value": 0.06476987153291702, "icl_sequence_length": 80, "num_contexts": 3, "step": 2940 }, { "grad_norm": 0.031727902591228485, "learning_rate": 0.00029953857252203067, "loss": 0.4748, "step": 2950 }, { "gate_value": 0.06503763049840927, "icl_sequence_length": 72, "num_contexts": 3, "step": 2950 }, { "grad_norm": 0.04176465794444084, "learning_rate": 0.00029952880190882035, "loss": 0.463, "step": 2960 }, { "gate_value": 0.0653427243232727, "icl_sequence_length": 76, "num_contexts": 3, "step": 2960 }, { "grad_norm": 0.026758193969726562, "learning_rate": 0.0002995189290940839, "loss": 0.4666, "step": 2970 }, { "gate_value": 0.06526095420122147, "icl_sequence_length": 92, "num_contexts": 3, "step": 2970 }, { "grad_norm": 0.15899816155433655, "learning_rate": 0.0002995089540845694, "loss": 0.4504, "step": 2980 }, { "gate_value": 0.06506571173667908, "icl_sequence_length": 84, "num_contexts": 3, "step": 2980 }, { "grad_norm": 0.2439725548028946, "learning_rate": 0.0002994988768870945, "loss": 0.4547, "step": 2990 }, { "gate_value": 0.06512710452079773, "icl_sequence_length": 74, "num_contexts": 3, "step": 2990 }, { "grad_norm": 0.18706493079662323, "learning_rate": 0.00029948869750854695, "loss": 0.4626, "step": 3000 }, { "gate_value": 0.06544939428567886, "icl_sequence_length": 94, "num_contexts": 3, "step": 3000 }, { "grad_norm": 0.16449908912181854, "learning_rate": 0.0002994784159558842, "loss": 0.4509, "step": 3010 }, { "gate_value": 0.06566469371318817, "icl_sequence_length": 80, "num_contexts": 3, "step": 3010 }, { "grad_norm": 0.07561691105365753, "learning_rate": 0.00029946803223613374, "loss": 0.4484, "step": 3020 }, { "gate_value": 0.06594062596559525, "icl_sequence_length": 88, "num_contexts": 3, "step": 3020 }, { "grad_norm": 0.10772091150283813, "learning_rate": 0.0002994575463563925, "loss": 0.4524, "step": 3030 }, { "gate_value": 0.06642261147499084, "icl_sequence_length": 94, "num_contexts": 3, "step": 3030 }, { "grad_norm": 0.08066357672214508, "learning_rate": 0.00029944695832382777, "loss": 0.4494, "step": 3040 }, { "gate_value": 0.06684261560440063, "icl_sequence_length": 94, "num_contexts": 3, "step": 3040 }, { "grad_norm": 0.0352531373500824, "learning_rate": 0.00029943626814567617, "loss": 0.4564, "step": 3050 }, { "gate_value": 0.06724800169467926, "icl_sequence_length": 80, "num_contexts": 3, "step": 3050 }, { "grad_norm": 0.05193459987640381, "learning_rate": 0.0002994254758292444, "loss": 0.4642, "step": 3060 }, { "gate_value": 0.06739187985658646, "icl_sequence_length": 90, "num_contexts": 3, "step": 3060 }, { "grad_norm": 0.05352374166250229, "learning_rate": 0.0002994145813819089, "loss": 0.4611, "step": 3070 }, { "gate_value": 0.06732220947742462, "icl_sequence_length": 88, "num_contexts": 3, "step": 3070 }, { "grad_norm": 0.03663728013634682, "learning_rate": 0.0002994035848111159, "loss": 0.459, "step": 3080 }, { "gate_value": 0.06713177263736725, "icl_sequence_length": 70, "num_contexts": 3, "step": 3080 }, { "grad_norm": 0.08161330968141556, "learning_rate": 0.00029939248612438147, "loss": 0.4586, "step": 3090 }, { "gate_value": 0.06736379861831665, "icl_sequence_length": 66, "num_contexts": 3, "step": 3090 }, { "grad_norm": 0.1583651602268219, "learning_rate": 0.0002993812853292915, "loss": 0.4577, "step": 3100 }, { "gate_value": 0.06740887463092804, "icl_sequence_length": 70, "num_contexts": 3, "step": 3100 }, { "grad_norm": 0.07588821649551392, "learning_rate": 0.00029936998243350153, "loss": 0.4599, "step": 3110 }, { "gate_value": 0.06748247146606445, "icl_sequence_length": 88, "num_contexts": 3, "step": 3110 }, { "grad_norm": 0.04477818310260773, "learning_rate": 0.00029935857744473705, "loss": 0.438, "step": 3120 }, { "gate_value": 0.06831636279821396, "icl_sequence_length": 66, "num_contexts": 3, "step": 3120 }, { "grad_norm": 0.11088748276233673, "learning_rate": 0.0002993470703707933, "loss": 0.4511, "step": 3130 }, { "gate_value": 0.06878136098384857, "icl_sequence_length": 72, "num_contexts": 3, "step": 3130 }, { "grad_norm": 0.07485643774271011, "learning_rate": 0.0002993354612195352, "loss": 0.4567, "step": 3140 }, { "gate_value": 0.06921064108610153, "icl_sequence_length": 88, "num_contexts": 3, "step": 3140 }, { "grad_norm": 0.04625440016388893, "learning_rate": 0.0002993237499988975, "loss": 0.45, "step": 3150 }, { "gate_value": 0.06880256533622742, "icl_sequence_length": 68, "num_contexts": 3, "step": 3150 }, { "grad_norm": 0.037098273634910583, "learning_rate": 0.0002993119367168847, "loss": 0.4516, "step": 3160 }, { "gate_value": 0.06890203058719635, "icl_sequence_length": 88, "num_contexts": 3, "step": 3160 }, { "grad_norm": 0.1960594356060028, "learning_rate": 0.0002993000213815711, "loss": 0.4566, "step": 3170 }, { "gate_value": 0.06906116008758545, "icl_sequence_length": 74, "num_contexts": 3, "step": 3170 }, { "grad_norm": 0.0907864198088646, "learning_rate": 0.0002992880040011007, "loss": 0.4524, "step": 3180 }, { "gate_value": 0.06909805536270142, "icl_sequence_length": 72, "num_contexts": 3, "step": 3180 }, { "grad_norm": 0.037198636680841446, "learning_rate": 0.0002992758845836873, "loss": 0.4641, "step": 3190 }, { "gate_value": 0.06939682364463806, "icl_sequence_length": 74, "num_contexts": 3, "step": 3190 }, { "grad_norm": 0.24401098489761353, "learning_rate": 0.00029926366313761424, "loss": 0.4631, "step": 3200 }, { "gate_value": 0.06982459127902985, "icl_sequence_length": 70, "num_contexts": 3, "step": 3200 }, { "grad_norm": 0.0669318288564682, "learning_rate": 0.000299251339671235, "loss": 0.4619, "step": 3210 }, { "gate_value": 0.07002398371696472, "icl_sequence_length": 74, "num_contexts": 3, "step": 3210 }, { "grad_norm": 0.13897764682769775, "learning_rate": 0.0002992389141929724, "loss": 0.4531, "step": 3220 }, { "gate_value": 0.07005146890878677, "icl_sequence_length": 82, "num_contexts": 3, "step": 3220 }, { "grad_norm": 0.04050607606768608, "learning_rate": 0.00029922638671131926, "loss": 0.4563, "step": 3230 }, { "gate_value": 0.0697348564863205, "icl_sequence_length": 86, "num_contexts": 3, "step": 3230 }, { "grad_norm": 0.11345645785331726, "learning_rate": 0.0002992137572348379, "loss": 0.4592, "step": 3240 }, { "gate_value": 0.06972472369670868, "icl_sequence_length": 72, "num_contexts": 3, "step": 3240 }, { "grad_norm": 0.11163768172264099, "learning_rate": 0.00029920102577216047, "loss": 0.4337, "step": 3250 }, { "gate_value": 0.0699225589632988, "icl_sequence_length": 82, "num_contexts": 3, "step": 3250 }, { "grad_norm": 0.09180065244436264, "learning_rate": 0.0002991881923319888, "loss": 0.4614, "step": 3260 }, { "gate_value": 0.06991465389728546, "icl_sequence_length": 88, "num_contexts": 3, "step": 3260 }, { "grad_norm": 0.16114754974842072, "learning_rate": 0.00029917525692309445, "loss": 0.4514, "step": 3270 }, { "gate_value": 0.06968920677900314, "icl_sequence_length": 80, "num_contexts": 3, "step": 3270 }, { "grad_norm": 0.08679977059364319, "learning_rate": 0.0002991622195543186, "loss": 0.4711, "step": 3280 }, { "gate_value": 0.069790318608284, "icl_sequence_length": 80, "num_contexts": 3, "step": 3280 }, { "grad_norm": 0.04381372407078743, "learning_rate": 0.0002991490802345722, "loss": 0.4695, "step": 3290 }, { "gate_value": 0.07006606459617615, "icl_sequence_length": 82, "num_contexts": 3, "step": 3290 }, { "grad_norm": 0.09389933198690414, "learning_rate": 0.0002991358389728359, "loss": 0.4612, "step": 3300 }, { "gate_value": 0.07035694271326065, "icl_sequence_length": 74, "num_contexts": 3, "step": 3300 }, { "grad_norm": 0.045784782618284225, "learning_rate": 0.00029912249577815987, "loss": 0.4487, "step": 3310 }, { "gate_value": 0.07085049897432327, "icl_sequence_length": 72, "num_contexts": 3, "step": 3310 }, { "grad_norm": 0.06604770570993423, "learning_rate": 0.0002991090506596641, "loss": 0.4448, "step": 3320 }, { "gate_value": 0.07110884040594101, "icl_sequence_length": 74, "num_contexts": 3, "step": 3320 }, { "grad_norm": 0.15612971782684326, "learning_rate": 0.0002990955036265383, "loss": 0.449, "step": 3330 }, { "gate_value": 0.07128405570983887, "icl_sequence_length": 68, "num_contexts": 3, "step": 3330 }, { "grad_norm": 0.13052192330360413, "learning_rate": 0.0002990818546880416, "loss": 0.4533, "step": 3340 }, { "gate_value": 0.07144956290721893, "icl_sequence_length": 58, "num_contexts": 3, "step": 3340 }, { "grad_norm": 0.10136935114860535, "learning_rate": 0.000299068103853503, "loss": 0.4516, "step": 3350 }, { "gate_value": 0.07161328196525574, "icl_sequence_length": 82, "num_contexts": 3, "step": 3350 }, { "grad_norm": 0.042142871767282486, "learning_rate": 0.00029905425113232103, "loss": 0.4645, "step": 3360 }, { "gate_value": 0.07148417830467224, "icl_sequence_length": 90, "num_contexts": 3, "step": 3360 }, { "grad_norm": 0.1631345897912979, "learning_rate": 0.0002990402965339639, "loss": 0.4448, "step": 3370 }, { "gate_value": 0.0716477707028389, "icl_sequence_length": 74, "num_contexts": 3, "step": 3370 }, { "grad_norm": 0.11523299664258957, "learning_rate": 0.0002990262400679695, "loss": 0.4547, "step": 3380 }, { "gate_value": 0.07228045165538788, "icl_sequence_length": 62, "num_contexts": 3, "step": 3380 }, { "grad_norm": 0.08556123077869415, "learning_rate": 0.0002990120817439452, "loss": 0.4426, "step": 3390 }, { "gate_value": 0.0726013332605362, "icl_sequence_length": 86, "num_contexts": 3, "step": 3390 }, { "grad_norm": 0.04559708759188652, "learning_rate": 0.00029899782157156817, "loss": 0.4481, "step": 3400 }, { "gate_value": 0.07277483493089676, "icl_sequence_length": 86, "num_contexts": 3, "step": 3400 }, { "grad_norm": 0.11405778676271439, "learning_rate": 0.000298983459560585, "loss": 0.4429, "step": 3410 }, { "gate_value": 0.07276701927185059, "icl_sequence_length": 80, "num_contexts": 3, "step": 3410 }, { "grad_norm": 0.05555117875337601, "learning_rate": 0.00029896899572081216, "loss": 0.4561, "step": 3420 }, { "gate_value": 0.0728112980723381, "icl_sequence_length": 84, "num_contexts": 3, "step": 3420 }, { "grad_norm": 0.03393098711967468, "learning_rate": 0.00029895443006213536, "loss": 0.4507, "step": 3430 }, { "gate_value": 0.07333412766456604, "icl_sequence_length": 80, "num_contexts": 3, "step": 3430 }, { "grad_norm": 0.08307201415300369, "learning_rate": 0.0002989397625945102, "loss": 0.448, "step": 3440 }, { "gate_value": 0.07309713959693909, "icl_sequence_length": 76, "num_contexts": 3, "step": 3440 }, { "grad_norm": 0.04834305867552757, "learning_rate": 0.00029892499332796166, "loss": 0.4551, "step": 3450 }, { "gate_value": 0.0728369876742363, "icl_sequence_length": 78, "num_contexts": 3, "step": 3450 }, { "grad_norm": 0.03444100171327591, "learning_rate": 0.00029891012227258447, "loss": 0.462, "step": 3460 }, { "gate_value": 0.07332317531108856, "icl_sequence_length": 78, "num_contexts": 3, "step": 3460 }, { "grad_norm": 0.03249925747513771, "learning_rate": 0.00029889514943854284, "loss": 0.4605, "step": 3470 }, { "gate_value": 0.07408445328474045, "icl_sequence_length": 76, "num_contexts": 3, "step": 3470 }, { "grad_norm": 0.1658170372247696, "learning_rate": 0.0002988800748360706, "loss": 0.4629, "step": 3480 }, { "gate_value": 0.0744827389717102, "icl_sequence_length": 90, "num_contexts": 3, "step": 3480 }, { "grad_norm": 0.10207600891590118, "learning_rate": 0.00029886489847547114, "loss": 0.4501, "step": 3490 }, { "gate_value": 0.07433953136205673, "icl_sequence_length": 62, "num_contexts": 3, "step": 3490 }, { "grad_norm": 0.11472195386886597, "learning_rate": 0.00029884962036711717, "loss": 0.444, "step": 3500 }, { "gate_value": 0.07421142607927322, "icl_sequence_length": 90, "num_contexts": 3, "step": 3500 }, { "grad_norm": 0.06376820802688599, "learning_rate": 0.00029883424052145127, "loss": 0.4551, "step": 3510 }, { "gate_value": 0.07423210889101028, "icl_sequence_length": 62, "num_contexts": 3, "step": 3510 }, { "grad_norm": 0.09487133473157883, "learning_rate": 0.00029881875894898543, "loss": 0.4733, "step": 3520 }, { "gate_value": 0.07424761354923248, "icl_sequence_length": 74, "num_contexts": 3, "step": 3520 }, { "grad_norm": 0.04656476154923439, "learning_rate": 0.00029880317566030113, "loss": 0.4543, "step": 3530 }, { "gate_value": 0.07421663403511047, "icl_sequence_length": 94, "num_contexts": 3, "step": 3530 }, { "grad_norm": 0.09370381385087967, "learning_rate": 0.00029878749066604936, "loss": 0.447, "step": 3540 }, { "gate_value": 0.07410692423582077, "icl_sequence_length": 80, "num_contexts": 3, "step": 3540 }, { "grad_norm": 0.16475284099578857, "learning_rate": 0.0002987717039769507, "loss": 0.4584, "step": 3550 }, { "gate_value": 0.0745421051979065, "icl_sequence_length": 88, "num_contexts": 3, "step": 3550 }, { "grad_norm": 0.16828690469264984, "learning_rate": 0.00029875581560379527, "loss": 0.4671, "step": 3560 }, { "gate_value": 0.07483482360839844, "icl_sequence_length": 84, "num_contexts": 3, "step": 3560 }, { "grad_norm": 0.14615994691848755, "learning_rate": 0.0002987398255574425, "loss": 0.4543, "step": 3570 }, { "gate_value": 0.0748705267906189, "icl_sequence_length": 78, "num_contexts": 3, "step": 3570 }, { "grad_norm": 0.0632835254073143, "learning_rate": 0.00029872373384882153, "loss": 0.4583, "step": 3580 }, { "gate_value": 0.07477834075689316, "icl_sequence_length": 76, "num_contexts": 3, "step": 3580 }, { "grad_norm": 0.08285976946353912, "learning_rate": 0.0002987075404889308, "loss": 0.4417, "step": 3590 }, { "gate_value": 0.07480471581220627, "icl_sequence_length": 80, "num_contexts": 3, "step": 3590 }, { "grad_norm": 0.033416613936424255, "learning_rate": 0.00029869124548883837, "loss": 0.4526, "step": 3600 }, { "gate_value": 0.07461623102426529, "icl_sequence_length": 86, "num_contexts": 3, "step": 3600 }, { "grad_norm": 0.032648663967847824, "learning_rate": 0.0002986748488596818, "loss": 0.4543, "step": 3610 }, { "gate_value": 0.075102798640728, "icl_sequence_length": 70, "num_contexts": 3, "step": 3610 }, { "grad_norm": 0.036768630146980286, "learning_rate": 0.0002986583506126679, "loss": 0.4596, "step": 3620 }, { "gate_value": 0.07561081647872925, "icl_sequence_length": 76, "num_contexts": 3, "step": 3620 }, { "grad_norm": 0.08868744969367981, "learning_rate": 0.0002986417507590731, "loss": 0.4374, "step": 3630 }, { "gate_value": 0.07600181549787521, "icl_sequence_length": 68, "num_contexts": 3, "step": 3630 }, { "grad_norm": 0.042598139494657516, "learning_rate": 0.0002986250493102433, "loss": 0.4493, "step": 3640 }, { "gate_value": 0.0765034481883049, "icl_sequence_length": 90, "num_contexts": 3, "step": 3640 }, { "grad_norm": 0.07039535045623779, "learning_rate": 0.00029860824627759376, "loss": 0.4465, "step": 3650 }, { "gate_value": 0.07682037353515625, "icl_sequence_length": 80, "num_contexts": 3, "step": 3650 }, { "grad_norm": 0.06170298904180527, "learning_rate": 0.00029859134167260917, "loss": 0.4435, "step": 3660 }, { "gate_value": 0.07736477255821228, "icl_sequence_length": 86, "num_contexts": 3, "step": 3660 }, { "grad_norm": 0.07327844947576523, "learning_rate": 0.0002985743355068437, "loss": 0.4603, "step": 3670 }, { "gate_value": 0.0773945227265358, "icl_sequence_length": 70, "num_contexts": 3, "step": 3670 }, { "grad_norm": 0.0467827171087265, "learning_rate": 0.0002985572277919208, "loss": 0.4543, "step": 3680 }, { "gate_value": 0.07693289965391159, "icl_sequence_length": 82, "num_contexts": 3, "step": 3680 }, { "grad_norm": 0.10239314287900925, "learning_rate": 0.0002985400185395336, "loss": 0.452, "step": 3690 }, { "gate_value": 0.07675682753324509, "icl_sequence_length": 90, "num_contexts": 3, "step": 3690 }, { "grad_norm": 0.032947756350040436, "learning_rate": 0.00029852270776144435, "loss": 0.4479, "step": 3700 }, { "gate_value": 0.0769607424736023, "icl_sequence_length": 82, "num_contexts": 3, "step": 3700 }, { "grad_norm": 0.05921289697289467, "learning_rate": 0.00029850529546948483, "loss": 0.4503, "step": 3710 }, { "gate_value": 0.07716976851224899, "icl_sequence_length": 76, "num_contexts": 3, "step": 3710 }, { "grad_norm": 0.1941101998090744, "learning_rate": 0.0002984877816755562, "loss": 0.438, "step": 3720 }, { "gate_value": 0.07710455358028412, "icl_sequence_length": 74, "num_contexts": 3, "step": 3720 }, { "grad_norm": 0.13056175410747528, "learning_rate": 0.0002984701663916289, "loss": 0.4458, "step": 3730 }, { "gate_value": 0.077248215675354, "icl_sequence_length": 76, "num_contexts": 3, "step": 3730 }, { "grad_norm": 0.22387555241584778, "learning_rate": 0.0002984524496297429, "loss": 0.4501, "step": 3740 }, { "gate_value": 0.07783285528421402, "icl_sequence_length": 86, "num_contexts": 3, "step": 3740 }, { "grad_norm": 0.11868324875831604, "learning_rate": 0.0002984346314020074, "loss": 0.4433, "step": 3750 }, { "gate_value": 0.07809463143348694, "icl_sequence_length": 94, "num_contexts": 3, "step": 3750 }, { "grad_norm": 0.1095101460814476, "learning_rate": 0.000298416711720601, "loss": 0.4435, "step": 3760 }, { "gate_value": 0.07806791365146637, "icl_sequence_length": 88, "num_contexts": 3, "step": 3760 }, { "grad_norm": 0.0775548443198204, "learning_rate": 0.0002983986905977716, "loss": 0.4514, "step": 3770 }, { "gate_value": 0.07827196270227432, "icl_sequence_length": 60, "num_contexts": 3, "step": 3770 }, { "grad_norm": 0.04364806041121483, "learning_rate": 0.00029838056804583644, "loss": 0.4645, "step": 3780 }, { "gate_value": 0.07875936478376389, "icl_sequence_length": 78, "num_contexts": 3, "step": 3780 }, { "grad_norm": 0.04026861488819122, "learning_rate": 0.00029836234407718226, "loss": 0.4511, "step": 3790 }, { "gate_value": 0.07864929735660553, "icl_sequence_length": 90, "num_contexts": 3, "step": 3790 }, { "grad_norm": 0.10821370035409927, "learning_rate": 0.00029834401870426484, "loss": 0.4273, "step": 3800 }, { "gate_value": 0.07822828739881516, "icl_sequence_length": 92, "num_contexts": 3, "step": 3800 }, { "grad_norm": 0.056853052228689194, "learning_rate": 0.00029832559193960947, "loss": 0.4466, "step": 3810 }, { "gate_value": 0.07868239283561707, "icl_sequence_length": 70, "num_contexts": 3, "step": 3810 }, { "grad_norm": 0.12730729579925537, "learning_rate": 0.0002983070637958106, "loss": 0.4359, "step": 3820 }, { "gate_value": 0.07887009531259537, "icl_sequence_length": 88, "num_contexts": 3, "step": 3820 }, { "grad_norm": 0.061061352491378784, "learning_rate": 0.00029828843428553203, "loss": 0.4637, "step": 3830 }, { "gate_value": 0.07961258292198181, "icl_sequence_length": 84, "num_contexts": 3, "step": 3830 }, { "grad_norm": 0.03473556786775589, "learning_rate": 0.000298269703421507, "loss": 0.4454, "step": 3840 }, { "gate_value": 0.07969710230827332, "icl_sequence_length": 72, "num_contexts": 3, "step": 3840 }, { "grad_norm": 0.11743411421775818, "learning_rate": 0.0002982508712165377, "loss": 0.4597, "step": 3850 }, { "gate_value": 0.07997772842645645, "icl_sequence_length": 78, "num_contexts": 3, "step": 3850 }, { "grad_norm": 0.12020798772573471, "learning_rate": 0.0002982319376834959, "loss": 0.4585, "step": 3860 }, { "gate_value": 0.08042223006486893, "icl_sequence_length": 72, "num_contexts": 3, "step": 3860 }, { "grad_norm": 0.14048640429973602, "learning_rate": 0.0002982129028353224, "loss": 0.4577, "step": 3870 }, { "gate_value": 0.08117994666099548, "icl_sequence_length": 78, "num_contexts": 3, "step": 3870 }, { "grad_norm": 0.08375728875398636, "learning_rate": 0.0002981937666850274, "loss": 0.4393, "step": 3880 }, { "gate_value": 0.08134108036756516, "icl_sequence_length": 86, "num_contexts": 3, "step": 3880 }, { "grad_norm": 0.15470552444458008, "learning_rate": 0.00029817452924569025, "loss": 0.4463, "step": 3890 }, { "gate_value": 0.08139047771692276, "icl_sequence_length": 88, "num_contexts": 3, "step": 3890 }, { "grad_norm": 0.05757463350892067, "learning_rate": 0.00029815519053045955, "loss": 0.4409, "step": 3900 }, { "gate_value": 0.08170637488365173, "icl_sequence_length": 78, "num_contexts": 3, "step": 3900 }, { "grad_norm": 0.10937239229679108, "learning_rate": 0.0002981357505525532, "loss": 0.4497, "step": 3910 }, { "gate_value": 0.08137400448322296, "icl_sequence_length": 82, "num_contexts": 3, "step": 3910 }, { "grad_norm": 0.07456956058740616, "learning_rate": 0.0002981162093252581, "loss": 0.4574, "step": 3920 }, { "gate_value": 0.0812961757183075, "icl_sequence_length": 80, "num_contexts": 3, "step": 3920 }, { "grad_norm": 0.14056850969791412, "learning_rate": 0.00029809656686193063, "loss": 0.4477, "step": 3930 }, { "gate_value": 0.08145935088396072, "icl_sequence_length": 80, "num_contexts": 3, "step": 3930 }, { "grad_norm": 0.13515639305114746, "learning_rate": 0.0002980768231759961, "loss": 0.4441, "step": 3940 }, { "gate_value": 0.08180077373981476, "icl_sequence_length": 88, "num_contexts": 3, "step": 3940 }, { "grad_norm": 0.07286737859249115, "learning_rate": 0.00029805697828094935, "loss": 0.4426, "step": 3950 }, { "gate_value": 0.08208008110523224, "icl_sequence_length": 84, "num_contexts": 3, "step": 3950 }, { "grad_norm": 0.02850443497300148, "learning_rate": 0.00029803703219035397, "loss": 0.4327, "step": 3960 }, { "gate_value": 0.0816754624247551, "icl_sequence_length": 86, "num_contexts": 3, "step": 3960 }, { "grad_norm": 0.02943161129951477, "learning_rate": 0.00029801698491784294, "loss": 0.439, "step": 3970 }, { "gate_value": 0.08196882903575897, "icl_sequence_length": 94, "num_contexts": 3, "step": 3970 }, { "grad_norm": 0.09917061030864716, "learning_rate": 0.0002979968364771185, "loss": 0.4544, "step": 3980 }, { "gate_value": 0.08232785761356354, "icl_sequence_length": 68, "num_contexts": 3, "step": 3980 }, { "grad_norm": 0.09027360379695892, "learning_rate": 0.0002979765868819518, "loss": 0.4497, "step": 3990 }, { "gate_value": 0.08220331370830536, "icl_sequence_length": 74, "num_contexts": 3, "step": 3990 }, { "grad_norm": 0.03193968906998634, "learning_rate": 0.0002979562361461834, "loss": 0.449, "step": 4000 }, { "gate_value": 0.08218622207641602, "icl_sequence_length": 90, "num_contexts": 3, "step": 4000 }, { "grad_norm": 0.08486006408929825, "learning_rate": 0.00029793578428372264, "loss": 0.4432, "step": 4010 }, { "gate_value": 0.08235253393650055, "icl_sequence_length": 72, "num_contexts": 3, "step": 4010 }, { "grad_norm": 0.1432875394821167, "learning_rate": 0.00029791523130854827, "loss": 0.4585, "step": 4020 }, { "gate_value": 0.0827346071600914, "icl_sequence_length": 88, "num_contexts": 3, "step": 4020 }, { "grad_norm": 0.03996672108769417, "learning_rate": 0.00029789457723470816, "loss": 0.4513, "step": 4030 }, { "gate_value": 0.0829230546951294, "icl_sequence_length": 80, "num_contexts": 3, "step": 4030 }, { "grad_norm": 0.025110652670264244, "learning_rate": 0.000297873822076319, "loss": 0.4506, "step": 4040 }, { "gate_value": 0.08226954191923141, "icl_sequence_length": 82, "num_contexts": 3, "step": 4040 }, { "grad_norm": 0.10905573517084122, "learning_rate": 0.00029785296584756684, "loss": 0.4371, "step": 4050 }, { "gate_value": 0.08197779953479767, "icl_sequence_length": 80, "num_contexts": 3, "step": 4050 }, { "grad_norm": 0.11699944734573364, "learning_rate": 0.00029783200856270663, "loss": 0.4536, "step": 4060 }, { "gate_value": 0.0821978896856308, "icl_sequence_length": 94, "num_contexts": 3, "step": 4060 }, { "grad_norm": 0.02820429392158985, "learning_rate": 0.0002978109502360626, "loss": 0.4617, "step": 4070 }, { "gate_value": 0.0826956108212471, "icl_sequence_length": 82, "num_contexts": 3, "step": 4070 }, { "grad_norm": 0.07080373913049698, "learning_rate": 0.00029778979088202785, "loss": 0.4366, "step": 4080 }, { "gate_value": 0.08400825411081314, "icl_sequence_length": 88, "num_contexts": 3, "step": 4080 }, { "grad_norm": 0.046099573373794556, "learning_rate": 0.0002977685305150646, "loss": 0.4533, "step": 4090 }, { "gate_value": 0.0844913125038147, "icl_sequence_length": 88, "num_contexts": 3, "step": 4090 }, { "grad_norm": 0.03396543487906456, "learning_rate": 0.0002977471691497041, "loss": 0.4376, "step": 4100 }, { "gate_value": 0.08502555638551712, "icl_sequence_length": 68, "num_contexts": 3, "step": 4100 }, { "grad_norm": 0.14182616770267487, "learning_rate": 0.0002977257068005467, "loss": 0.4663, "step": 4110 }, { "gate_value": 0.08497641980648041, "icl_sequence_length": 94, "num_contexts": 3, "step": 4110 }, { "grad_norm": 0.03911354020237923, "learning_rate": 0.00029770414348226164, "loss": 0.4472, "step": 4120 }, { "gate_value": 0.08530503511428833, "icl_sequence_length": 84, "num_contexts": 3, "step": 4120 }, { "grad_norm": 0.12039171159267426, "learning_rate": 0.0002976824792095873, "loss": 0.444, "step": 4130 }, { "gate_value": 0.08518508821725845, "icl_sequence_length": 76, "num_contexts": 3, "step": 4130 }, { "grad_norm": 0.03984901309013367, "learning_rate": 0.000297660713997331, "loss": 0.4426, "step": 4140 }, { "gate_value": 0.0854906365275383, "icl_sequence_length": 72, "num_contexts": 3, "step": 4140 }, { "grad_norm": 0.04110978543758392, "learning_rate": 0.000297638847860369, "loss": 0.44, "step": 4150 }, { "gate_value": 0.08582597225904465, "icl_sequence_length": 82, "num_contexts": 3, "step": 4150 }, { "grad_norm": 0.03553116321563721, "learning_rate": 0.00029761688081364663, "loss": 0.4409, "step": 4160 }, { "gate_value": 0.08649842441082001, "icl_sequence_length": 78, "num_contexts": 3, "step": 4160 }, { "grad_norm": 0.0862579196691513, "learning_rate": 0.0002975948128721782, "loss": 0.4589, "step": 4170 }, { "gate_value": 0.08685018122196198, "icl_sequence_length": 80, "num_contexts": 3, "step": 4170 }, { "grad_norm": 0.048685476183891296, "learning_rate": 0.0002975726440510469, "loss": 0.4296, "step": 4180 }, { "gate_value": 0.08682183921337128, "icl_sequence_length": 88, "num_contexts": 3, "step": 4180 }, { "grad_norm": 0.07612019032239914, "learning_rate": 0.0002975503743654049, "loss": 0.4376, "step": 4190 }, { "gate_value": 0.0863993689417839, "icl_sequence_length": 96, "num_contexts": 3, "step": 4190 }, { "grad_norm": 0.07147730886936188, "learning_rate": 0.00029752800383047335, "loss": 0.4444, "step": 4200 }, { "gate_value": 0.08639863133430481, "icl_sequence_length": 92, "num_contexts": 3, "step": 4200 }, { "grad_norm": 0.12619084119796753, "learning_rate": 0.0002975055324615423, "loss": 0.432, "step": 4210 }, { "gate_value": 0.08714673668146133, "icl_sequence_length": 86, "num_contexts": 3, "step": 4210 }, { "grad_norm": 0.028788521885871887, "learning_rate": 0.00029748296027397065, "loss": 0.4538, "step": 4220 }, { "gate_value": 0.08739547431468964, "icl_sequence_length": 72, "num_contexts": 3, "step": 4220 }, { "grad_norm": 0.11368781328201294, "learning_rate": 0.0002974602872831864, "loss": 0.4531, "step": 4230 }, { "gate_value": 0.08731156587600708, "icl_sequence_length": 84, "num_contexts": 3, "step": 4230 }, { "grad_norm": 0.11662064492702484, "learning_rate": 0.00029743751350468626, "loss": 0.4487, "step": 4240 }, { "gate_value": 0.08763561397790909, "icl_sequence_length": 88, "num_contexts": 3, "step": 4240 }, { "grad_norm": 0.14818547666072845, "learning_rate": 0.0002974146389540358, "loss": 0.4344, "step": 4250 }, { "gate_value": 0.08787613362073898, "icl_sequence_length": 88, "num_contexts": 3, "step": 4250 }, { "grad_norm": 0.035743940621614456, "learning_rate": 0.0002973916636468698, "loss": 0.4496, "step": 4260 }, { "gate_value": 0.08736476302146912, "icl_sequence_length": 88, "num_contexts": 3, "step": 4260 }, { "grad_norm": 0.04561284929513931, "learning_rate": 0.00029736858759889137, "loss": 0.4432, "step": 4270 }, { "gate_value": 0.08761058002710342, "icl_sequence_length": 80, "num_contexts": 3, "step": 4270 }, { "grad_norm": 0.06140894815325737, "learning_rate": 0.000297345410825873, "loss": 0.4497, "step": 4280 }, { "gate_value": 0.08794830739498138, "icl_sequence_length": 76, "num_contexts": 3, "step": 4280 }, { "grad_norm": 0.12474972754716873, "learning_rate": 0.0002973221333436557, "loss": 0.4489, "step": 4290 }, { "gate_value": 0.08841531723737717, "icl_sequence_length": 72, "num_contexts": 3, "step": 4290 }, { "grad_norm": 0.0846974104642868, "learning_rate": 0.00029729875516814946, "loss": 0.4488, "step": 4300 }, { "gate_value": 0.08881258219480515, "icl_sequence_length": 58, "num_contexts": 3, "step": 4300 }, { "grad_norm": 0.07251457870006561, "learning_rate": 0.000297275276315333, "loss": 0.424, "step": 4310 }, { "gate_value": 0.088979572057724, "icl_sequence_length": 82, "num_contexts": 3, "step": 4310 }, { "grad_norm": 0.10169383883476257, "learning_rate": 0.00029725169680125385, "loss": 0.4563, "step": 4320 }, { "gate_value": 0.0891449972987175, "icl_sequence_length": 92, "num_contexts": 3, "step": 4320 }, { "grad_norm": 0.048810429871082306, "learning_rate": 0.00029722801664202843, "loss": 0.433, "step": 4330 }, { "gate_value": 0.09007365256547928, "icl_sequence_length": 82, "num_contexts": 3, "step": 4330 }, { "grad_norm": 0.09916849434375763, "learning_rate": 0.00029720423585384196, "loss": 0.4407, "step": 4340 }, { "gate_value": 0.0902184247970581, "icl_sequence_length": 80, "num_contexts": 3, "step": 4340 }, { "grad_norm": 0.07443194836378098, "learning_rate": 0.00029718035445294835, "loss": 0.4396, "step": 4350 }, { "gate_value": 0.09020433574914932, "icl_sequence_length": 74, "num_contexts": 3, "step": 4350 }, { "grad_norm": 0.06431616097688675, "learning_rate": 0.0002971563724556703, "loss": 0.4333, "step": 4360 }, { "gate_value": 0.0905480906367302, "icl_sequence_length": 74, "num_contexts": 3, "step": 4360 }, { "grad_norm": 0.07431776076555252, "learning_rate": 0.0002971322898783992, "loss": 0.4274, "step": 4370 }, { "gate_value": 0.09058386087417603, "icl_sequence_length": 84, "num_contexts": 3, "step": 4370 }, { "grad_norm": 0.07152583450078964, "learning_rate": 0.0002971081067375954, "loss": 0.4202, "step": 4380 }, { "gate_value": 0.09066424518823624, "icl_sequence_length": 72, "num_contexts": 3, "step": 4380 }, { "grad_norm": 0.10134628415107727, "learning_rate": 0.0002970838230497878, "loss": 0.436, "step": 4390 }, { "gate_value": 0.09120754152536392, "icl_sequence_length": 68, "num_contexts": 3, "step": 4390 }, { "grad_norm": 0.03166228160262108, "learning_rate": 0.000297059438831574, "loss": 0.454, "step": 4400 }, { "gate_value": 0.09206748008728027, "icl_sequence_length": 88, "num_contexts": 3, "step": 4400 }, { "grad_norm": 0.1006300076842308, "learning_rate": 0.0002970349540996205, "loss": 0.4661, "step": 4410 }, { "gate_value": 0.09231452643871307, "icl_sequence_length": 74, "num_contexts": 3, "step": 4410 }, { "grad_norm": 0.039432503283023834, "learning_rate": 0.0002970103688706623, "loss": 0.4409, "step": 4420 }, { "gate_value": 0.0917162299156189, "icl_sequence_length": 76, "num_contexts": 3, "step": 4420 }, { "grad_norm": 0.13075602054595947, "learning_rate": 0.00029698568316150327, "loss": 0.4256, "step": 4430 }, { "gate_value": 0.09100466966629028, "icl_sequence_length": 80, "num_contexts": 3, "step": 4430 }, { "grad_norm": 0.0706099420785904, "learning_rate": 0.00029696089698901575, "loss": 0.4452, "step": 4440 }, { "gate_value": 0.0907866507768631, "icl_sequence_length": 82, "num_contexts": 3, "step": 4440 }, { "grad_norm": 0.04131682217121124, "learning_rate": 0.0002969360103701409, "loss": 0.4473, "step": 4450 }, { "gate_value": 0.09110541641712189, "icl_sequence_length": 94, "num_contexts": 3, "step": 4450 }, { "grad_norm": 0.08504730463027954, "learning_rate": 0.0002969110233218885, "loss": 0.4435, "step": 4460 }, { "gate_value": 0.09132962673902512, "icl_sequence_length": 68, "num_contexts": 3, "step": 4460 }, { "grad_norm": 0.1154944896697998, "learning_rate": 0.00029688593586133687, "loss": 0.4311, "step": 4470 }, { "gate_value": 0.09130672365427017, "icl_sequence_length": 74, "num_contexts": 3, "step": 4470 }, { "grad_norm": 0.04087964445352554, "learning_rate": 0.0002968607480056332, "loss": 0.4496, "step": 4480 }, { "gate_value": 0.09184221923351288, "icl_sequence_length": 76, "num_contexts": 3, "step": 4480 }, { "grad_norm": 0.06298123300075531, "learning_rate": 0.00029683545977199306, "loss": 0.4331, "step": 4490 }, { "gate_value": 0.09272352606058121, "icl_sequence_length": 92, "num_contexts": 3, "step": 4490 }, { "grad_norm": 0.12888146936893463, "learning_rate": 0.0002968100711777008, "loss": 0.4632, "step": 4500 }, { "gate_value": 0.0925886407494545, "icl_sequence_length": 88, "num_contexts": 3, "step": 4500 }, { "grad_norm": 0.04741204157471657, "learning_rate": 0.0002967845822401091, "loss": 0.4255, "step": 4510 }, { "gate_value": 0.0922568216919899, "icl_sequence_length": 94, "num_contexts": 3, "step": 4510 }, { "grad_norm": 0.0282837375998497, "learning_rate": 0.00029675899297663965, "loss": 0.4332, "step": 4520 }, { "gate_value": 0.09246893227100372, "icl_sequence_length": 88, "num_contexts": 3, "step": 4520 }, { "grad_norm": 0.044491998851299286, "learning_rate": 0.00029673330340478234, "loss": 0.44, "step": 4530 }, { "gate_value": 0.09226234257221222, "icl_sequence_length": 82, "num_contexts": 3, "step": 4530 }, { "grad_norm": 0.0732334777712822, "learning_rate": 0.0002967075135420957, "loss": 0.4077, "step": 4540 }, { "gate_value": 0.09291594475507736, "icl_sequence_length": 88, "num_contexts": 3, "step": 4540 }, { "grad_norm": 0.16014868021011353, "learning_rate": 0.00029668162340620695, "loss": 0.4489, "step": 4550 }, { "gate_value": 0.09284153580665588, "icl_sequence_length": 76, "num_contexts": 3, "step": 4550 }, { "grad_norm": 0.19823066890239716, "learning_rate": 0.00029665563301481174, "loss": 0.4326, "step": 4560 }, { "gate_value": 0.09242615848779678, "icl_sequence_length": 86, "num_contexts": 3, "step": 4560 }, { "grad_norm": 0.1002328023314476, "learning_rate": 0.00029662954238567427, "loss": 0.4508, "step": 4570 }, { "gate_value": 0.09263605624437332, "icl_sequence_length": 88, "num_contexts": 3, "step": 4570 }, { "grad_norm": 0.03767078369855881, "learning_rate": 0.00029660335153662717, "loss": 0.4481, "step": 4580 }, { "gate_value": 0.09308427572250366, "icl_sequence_length": 88, "num_contexts": 3, "step": 4580 }, { "grad_norm": 0.055940892547369, "learning_rate": 0.0002965770604855717, "loss": 0.4429, "step": 4590 }, { "gate_value": 0.09287799894809723, "icl_sequence_length": 86, "num_contexts": 3, "step": 4590 }, { "grad_norm": 0.06824788451194763, "learning_rate": 0.00029655066925047754, "loss": 0.4463, "step": 4600 }, { "gate_value": 0.09253347665071487, "icl_sequence_length": 70, "num_contexts": 3, "step": 4600 }, { "grad_norm": 0.11449204385280609, "learning_rate": 0.0002965241778493828, "loss": 0.4275, "step": 4610 }, { "gate_value": 0.09268172085285187, "icl_sequence_length": 66, "num_contexts": 3, "step": 4610 }, { "grad_norm": 0.13200627267360687, "learning_rate": 0.0002964975863003942, "loss": 0.4334, "step": 4620 }, { "gate_value": 0.09307952225208282, "icl_sequence_length": 80, "num_contexts": 3, "step": 4620 }, { "grad_norm": 0.219753697514534, "learning_rate": 0.0002964708946216867, "loss": 0.457, "step": 4630 }, { "gate_value": 0.09279076755046844, "icl_sequence_length": 64, "num_contexts": 3, "step": 4630 }, { "grad_norm": 0.07256842404603958, "learning_rate": 0.00029644410283150393, "loss": 0.4342, "step": 4640 }, { "gate_value": 0.09255042672157288, "icl_sequence_length": 68, "num_contexts": 3, "step": 4640 }, { "grad_norm": 0.03685823827981949, "learning_rate": 0.00029641721094815764, "loss": 0.4533, "step": 4650 }, { "gate_value": 0.09305860102176666, "icl_sequence_length": 92, "num_contexts": 3, "step": 4650 }, { "grad_norm": 0.04822506383061409, "learning_rate": 0.0002963902189900284, "loss": 0.4521, "step": 4660 }, { "gate_value": 0.093348428606987, "icl_sequence_length": 74, "num_contexts": 3, "step": 4660 }, { "grad_norm": 0.044017307460308075, "learning_rate": 0.00029636312697556484, "loss": 0.4566, "step": 4670 }, { "gate_value": 0.09334082901477814, "icl_sequence_length": 68, "num_contexts": 3, "step": 4670 }, { "grad_norm": 0.07033849507570267, "learning_rate": 0.0002963359349232841, "loss": 0.4526, "step": 4680 }, { "gate_value": 0.09388935565948486, "icl_sequence_length": 74, "num_contexts": 3, "step": 4680 }, { "grad_norm": 0.037495627999305725, "learning_rate": 0.00029630864285177166, "loss": 0.43, "step": 4690 }, { "gate_value": 0.09439925849437714, "icl_sequence_length": 84, "num_contexts": 3, "step": 4690 }, { "grad_norm": 0.040242332965135574, "learning_rate": 0.0002962812507796815, "loss": 0.4209, "step": 4700 }, { "gate_value": 0.09456100314855576, "icl_sequence_length": 80, "num_contexts": 3, "step": 4700 }, { "grad_norm": 0.12894514203071594, "learning_rate": 0.0002962537587257358, "loss": 0.4321, "step": 4710 }, { "gate_value": 0.09533104300498962, "icl_sequence_length": 74, "num_contexts": 3, "step": 4710 }, { "grad_norm": 0.08391169458627701, "learning_rate": 0.0002962261667087251, "loss": 0.4371, "step": 4720 }, { "gate_value": 0.09611310064792633, "icl_sequence_length": 90, "num_contexts": 3, "step": 4720 }, { "grad_norm": 0.08664949983358383, "learning_rate": 0.00029619847474750825, "loss": 0.4519, "step": 4730 }, { "gate_value": 0.09609542787075043, "icl_sequence_length": 84, "num_contexts": 3, "step": 4730 }, { "grad_norm": 0.08422094583511353, "learning_rate": 0.0002961706828610125, "loss": 0.441, "step": 4740 }, { "gate_value": 0.09666085988283157, "icl_sequence_length": 90, "num_contexts": 3, "step": 4740 }, { "grad_norm": 0.10419321805238724, "learning_rate": 0.00029614279106823327, "loss": 0.4484, "step": 4750 }, { "gate_value": 0.09734027087688446, "icl_sequence_length": 84, "num_contexts": 3, "step": 4750 }, { "grad_norm": 0.07651486992835999, "learning_rate": 0.0002961147993882344, "loss": 0.4208, "step": 4760 }, { "gate_value": 0.09778045862913132, "icl_sequence_length": 70, "num_contexts": 3, "step": 4760 }, { "grad_norm": 0.030980784446001053, "learning_rate": 0.0002960867078401479, "loss": 0.4397, "step": 4770 }, { "gate_value": 0.09714986383914948, "icl_sequence_length": 84, "num_contexts": 3, "step": 4770 }, { "grad_norm": 0.14434650540351868, "learning_rate": 0.0002960585164431742, "loss": 0.4332, "step": 4780 }, { "gate_value": 0.09717854857444763, "icl_sequence_length": 92, "num_contexts": 3, "step": 4780 }, { "grad_norm": 0.042256616055965424, "learning_rate": 0.00029603022521658174, "loss": 0.4431, "step": 4790 }, { "gate_value": 0.09656146913766861, "icl_sequence_length": 80, "num_contexts": 3, "step": 4790 }, { "grad_norm": 0.06562954187393188, "learning_rate": 0.0002960018341797073, "loss": 0.4455, "step": 4800 }, { "gate_value": 0.09680372476577759, "icl_sequence_length": 72, "num_contexts": 3, "step": 4800 }, { "grad_norm": 0.0575077123939991, "learning_rate": 0.0002959733433519559, "loss": 0.4349, "step": 4810 }, { "gate_value": 0.09677128493785858, "icl_sequence_length": 88, "num_contexts": 3, "step": 4810 }, { "grad_norm": 0.04724402353167534, "learning_rate": 0.0002959447527528008, "loss": 0.4317, "step": 4820 }, { "gate_value": 0.0971720814704895, "icl_sequence_length": 86, "num_contexts": 3, "step": 4820 }, { "grad_norm": 0.08383426070213318, "learning_rate": 0.00029591606240178336, "loss": 0.4491, "step": 4830 }, { "gate_value": 0.09781701862812042, "icl_sequence_length": 80, "num_contexts": 3, "step": 4830 }, { "grad_norm": 0.045405976474285126, "learning_rate": 0.00029588727231851317, "loss": 0.4264, "step": 4840 }, { "gate_value": 0.09881711006164551, "icl_sequence_length": 86, "num_contexts": 3, "step": 4840 }, { "grad_norm": 0.04647388681769371, "learning_rate": 0.00029585838252266797, "loss": 0.4422, "step": 4850 }, { "gate_value": 0.09951039403676987, "icl_sequence_length": 62, "num_contexts": 3, "step": 4850 }, { "grad_norm": 0.049380771815776825, "learning_rate": 0.0002958293930339937, "loss": 0.4484, "step": 4860 }, { "gate_value": 0.09975112974643707, "icl_sequence_length": 82, "num_contexts": 3, "step": 4860 }, { "grad_norm": 0.11173932999372482, "learning_rate": 0.00029580030387230436, "loss": 0.429, "step": 4870 }, { "gate_value": 0.10008491575717926, "icl_sequence_length": 90, "num_contexts": 3, "step": 4870 }, { "grad_norm": 0.038873374462127686, "learning_rate": 0.00029577111505748216, "loss": 0.4521, "step": 4880 }, { "gate_value": 0.10003317147493362, "icl_sequence_length": 58, "num_contexts": 3, "step": 4880 }, { "grad_norm": 0.036797404289245605, "learning_rate": 0.00029574182660947735, "loss": 0.4587, "step": 4890 }, { "gate_value": 0.10025697946548462, "icl_sequence_length": 72, "num_contexts": 3, "step": 4890 }, { "grad_norm": 0.04378237947821617, "learning_rate": 0.00029571243854830835, "loss": 0.4413, "step": 4900 }, { "gate_value": 0.10004623234272003, "icl_sequence_length": 90, "num_contexts": 3, "step": 4900 }, { "grad_norm": 0.059293054044246674, "learning_rate": 0.00029568295089406154, "loss": 0.4358, "step": 4910 }, { "gate_value": 0.10010574758052826, "icl_sequence_length": 78, "num_contexts": 3, "step": 4910 }, { "grad_norm": 0.05134032666683197, "learning_rate": 0.00029565336366689146, "loss": 0.4418, "step": 4920 }, { "gate_value": 0.10006356984376907, "icl_sequence_length": 60, "num_contexts": 3, "step": 4920 }, { "grad_norm": 0.06267619878053665, "learning_rate": 0.00029562367688702084, "loss": 0.4357, "step": 4930 }, { "gate_value": 0.09998317062854767, "icl_sequence_length": 84, "num_contexts": 3, "step": 4930 }, { "grad_norm": 0.07684914022684097, "learning_rate": 0.0002955938905747402, "loss": 0.4332, "step": 4940 }, { "gate_value": 0.10052120685577393, "icl_sequence_length": 90, "num_contexts": 3, "step": 4940 }, { "grad_norm": 0.10818302631378174, "learning_rate": 0.00029556400475040813, "loss": 0.4445, "step": 4950 }, { "gate_value": 0.10102952271699905, "icl_sequence_length": 90, "num_contexts": 3, "step": 4950 }, { "grad_norm": 0.03710390627384186, "learning_rate": 0.0002955340194344515, "loss": 0.4484, "step": 4960 }, { "gate_value": 0.10203094780445099, "icl_sequence_length": 80, "num_contexts": 3, "step": 4960 }, { "grad_norm": 0.07414698600769043, "learning_rate": 0.00029550393464736484, "loss": 0.4415, "step": 4970 }, { "gate_value": 0.10241258144378662, "icl_sequence_length": 66, "num_contexts": 3, "step": 4970 }, { "grad_norm": 0.04050971195101738, "learning_rate": 0.0002954737504097109, "loss": 0.4406, "step": 4980 }, { "gate_value": 0.10247789323329926, "icl_sequence_length": 82, "num_contexts": 3, "step": 4980 }, { "grad_norm": 0.10987893491983414, "learning_rate": 0.00029544346674212026, "loss": 0.4274, "step": 4990 }, { "gate_value": 0.10240554064512253, "icl_sequence_length": 66, "num_contexts": 3, "step": 4990 }, { "grad_norm": 0.05541801080107689, "learning_rate": 0.0002954130836652916, "loss": 0.4386, "step": 5000 }, { "gate_value": 0.1022152230143547, "icl_sequence_length": 80, "num_contexts": 3, "step": 5000 }, { "grad_norm": 0.03097173199057579, "learning_rate": 0.00029538260119999133, "loss": 0.4319, "step": 5010 }, { "gate_value": 0.10129179060459137, "icl_sequence_length": 92, "num_contexts": 3, "step": 5010 }, { "grad_norm": 0.161233589053154, "learning_rate": 0.0002953520193670541, "loss": 0.4525, "step": 5020 }, { "gate_value": 0.10048361122608185, "icl_sequence_length": 72, "num_contexts": 3, "step": 5020 }, { "grad_norm": 0.06552668660879135, "learning_rate": 0.0002953213381873822, "loss": 0.4534, "step": 5030 }, { "gate_value": 0.10061473399400711, "icl_sequence_length": 84, "num_contexts": 3, "step": 5030 }, { "grad_norm": 0.08582403510808945, "learning_rate": 0.0002952905576819459, "loss": 0.4358, "step": 5040 }, { "gate_value": 0.10137443244457245, "icl_sequence_length": 80, "num_contexts": 3, "step": 5040 }, { "grad_norm": 0.046412449330091476, "learning_rate": 0.00029525967787178347, "loss": 0.4432, "step": 5050 }, { "gate_value": 0.10161323845386505, "icl_sequence_length": 78, "num_contexts": 3, "step": 5050 }, { "grad_norm": 0.10222003608942032, "learning_rate": 0.00029522869877800093, "loss": 0.442, "step": 5060 }, { "gate_value": 0.1019650399684906, "icl_sequence_length": 82, "num_contexts": 3, "step": 5060 }, { "grad_norm": 0.06821785122156143, "learning_rate": 0.00029519762042177225, "loss": 0.4388, "step": 5070 }, { "gate_value": 0.10277073085308075, "icl_sequence_length": 82, "num_contexts": 3, "step": 5070 }, { "grad_norm": 0.050177618861198425, "learning_rate": 0.0002951664428243391, "loss": 0.4466, "step": 5080 }, { "gate_value": 0.1031922847032547, "icl_sequence_length": 72, "num_contexts": 3, "step": 5080 }, { "grad_norm": 0.04561970382928848, "learning_rate": 0.00029513516600701106, "loss": 0.4523, "step": 5090 }, { "gate_value": 0.10391707718372345, "icl_sequence_length": 84, "num_contexts": 3, "step": 5090 }, { "grad_norm": 0.04120245203375816, "learning_rate": 0.0002951037899911657, "loss": 0.4512, "step": 5100 }, { "gate_value": 0.10400953143835068, "icl_sequence_length": 94, "num_contexts": 3, "step": 5100 }, { "grad_norm": 0.03396369889378548, "learning_rate": 0.0002950723147982481, "loss": 0.4368, "step": 5110 }, { "gate_value": 0.10385365039110184, "icl_sequence_length": 78, "num_contexts": 3, "step": 5110 }, { "grad_norm": 0.041298843920230865, "learning_rate": 0.0002950407404497712, "loss": 0.428, "step": 5120 }, { "gate_value": 0.1042807549238205, "icl_sequence_length": 86, "num_contexts": 3, "step": 5120 }, { "grad_norm": 0.05203133821487427, "learning_rate": 0.00029500906696731596, "loss": 0.4465, "step": 5130 }, { "gate_value": 0.1041208803653717, "icl_sequence_length": 82, "num_contexts": 3, "step": 5130 }, { "grad_norm": 0.08747325837612152, "learning_rate": 0.0002949772943725307, "loss": 0.4433, "step": 5140 }, { "gate_value": 0.1040065661072731, "icl_sequence_length": 82, "num_contexts": 3, "step": 5140 }, { "grad_norm": 0.13491439819335938, "learning_rate": 0.00029494542268713184, "loss": 0.4409, "step": 5150 }, { "gate_value": 0.10431725531816483, "icl_sequence_length": 64, "num_contexts": 3, "step": 5150 }, { "grad_norm": 0.11412998288869858, "learning_rate": 0.00029491345193290337, "loss": 0.4416, "step": 5160 }, { "gate_value": 0.10433990508317947, "icl_sequence_length": 70, "num_contexts": 3, "step": 5160 }, { "grad_norm": 0.05630851164460182, "learning_rate": 0.00029488138213169693, "loss": 0.4299, "step": 5170 }, { "gate_value": 0.10372508317232132, "icl_sequence_length": 72, "num_contexts": 3, "step": 5170 }, { "grad_norm": 0.13014192879199982, "learning_rate": 0.00029484921330543193, "loss": 0.4541, "step": 5180 }, { "gate_value": 0.10315015912055969, "icl_sequence_length": 66, "num_contexts": 3, "step": 5180 }, { "grad_norm": 0.10586561262607574, "learning_rate": 0.0002948169454760955, "loss": 0.4307, "step": 5190 }, { "gate_value": 0.10308712720870972, "icl_sequence_length": 86, "num_contexts": 3, "step": 5190 }, { "grad_norm": 0.06097865477204323, "learning_rate": 0.00029478457866574236, "loss": 0.4269, "step": 5200 }, { "gate_value": 0.10377305746078491, "icl_sequence_length": 78, "num_contexts": 3, "step": 5200 }, { "grad_norm": 0.07691788673400879, "learning_rate": 0.000294752112896495, "loss": 0.4395, "step": 5210 }, { "gate_value": 0.10425736010074615, "icl_sequence_length": 96, "num_contexts": 3, "step": 5210 }, { "grad_norm": 0.10720237344503403, "learning_rate": 0.00029471954819054334, "loss": 0.4359, "step": 5220 }, { "gate_value": 0.10501399636268616, "icl_sequence_length": 74, "num_contexts": 3, "step": 5220 }, { "grad_norm": 0.034056954085826874, "learning_rate": 0.0002946868845701451, "loss": 0.4315, "step": 5230 }, { "gate_value": 0.10612098127603531, "icl_sequence_length": 84, "num_contexts": 3, "step": 5230 }, { "grad_norm": 0.07588791847229004, "learning_rate": 0.00029465412205762566, "loss": 0.4372, "step": 5240 }, { "gate_value": 0.10589048266410828, "icl_sequence_length": 84, "num_contexts": 3, "step": 5240 }, { "grad_norm": 0.06615625321865082, "learning_rate": 0.0002946212606753777, "loss": 0.4227, "step": 5250 }, { "gate_value": 0.10498031228780746, "icl_sequence_length": 78, "num_contexts": 3, "step": 5250 }, { "grad_norm": 0.08545836061239243, "learning_rate": 0.00029458830044586185, "loss": 0.4582, "step": 5260 }, { "gate_value": 0.10491736233234406, "icl_sequence_length": 76, "num_contexts": 3, "step": 5260 }, { "grad_norm": 0.03687456622719765, "learning_rate": 0.000294555241391606, "loss": 0.451, "step": 5270 }, { "gate_value": 0.10536529868841171, "icl_sequence_length": 74, "num_contexts": 3, "step": 5270 }, { "grad_norm": 0.051188874989748, "learning_rate": 0.00029452208353520574, "loss": 0.4175, "step": 5280 }, { "gate_value": 0.10574186593294144, "icl_sequence_length": 94, "num_contexts": 3, "step": 5280 }, { "grad_norm": 0.13268040120601654, "learning_rate": 0.0002944888268993241, "loss": 0.428, "step": 5290 }, { "gate_value": 0.10658949613571167, "icl_sequence_length": 84, "num_contexts": 3, "step": 5290 }, { "grad_norm": 0.04050949588418007, "learning_rate": 0.00029445547150669176, "loss": 0.4307, "step": 5300 }, { "gate_value": 0.10709504038095474, "icl_sequence_length": 82, "num_contexts": 3, "step": 5300 }, { "grad_norm": 0.0831289142370224, "learning_rate": 0.0002944220173801068, "loss": 0.4574, "step": 5310 }, { "gate_value": 0.10686801373958588, "icl_sequence_length": 80, "num_contexts": 3, "step": 5310 }, { "grad_norm": 0.10428035259246826, "learning_rate": 0.00029438846454243477, "loss": 0.4388, "step": 5320 }, { "gate_value": 0.10688181966543198, "icl_sequence_length": 70, "num_contexts": 3, "step": 5320 }, { "grad_norm": 0.08142989128828049, "learning_rate": 0.00029435481301660866, "loss": 0.4444, "step": 5330 }, { "gate_value": 0.10656416416168213, "icl_sequence_length": 78, "num_contexts": 3, "step": 5330 }, { "grad_norm": 0.13853947818279266, "learning_rate": 0.0002943210628256291, "loss": 0.4314, "step": 5340 }, { "gate_value": 0.10676518827676773, "icl_sequence_length": 86, "num_contexts": 3, "step": 5340 }, { "grad_norm": 0.0938250795006752, "learning_rate": 0.00029428721399256397, "loss": 0.4205, "step": 5350 }, { "gate_value": 0.10675190389156342, "icl_sequence_length": 72, "num_contexts": 3, "step": 5350 }, { "grad_norm": 0.11818115413188934, "learning_rate": 0.0002942532665405486, "loss": 0.4318, "step": 5360 }, { "gate_value": 0.10718467086553574, "icl_sequence_length": 88, "num_contexts": 3, "step": 5360 }, { "grad_norm": 0.12013489753007889, "learning_rate": 0.0002942192204927858, "loss": 0.4316, "step": 5370 }, { "gate_value": 0.10709141939878464, "icl_sequence_length": 72, "num_contexts": 3, "step": 5370 }, { "grad_norm": 0.08047773689031601, "learning_rate": 0.0002941850758725457, "loss": 0.4303, "step": 5380 }, { "gate_value": 0.10777509957551956, "icl_sequence_length": 82, "num_contexts": 3, "step": 5380 }, { "grad_norm": 0.11209923028945923, "learning_rate": 0.0002941508327031658, "loss": 0.4221, "step": 5390 }, { "gate_value": 0.1086001768708229, "icl_sequence_length": 80, "num_contexts": 3, "step": 5390 }, { "grad_norm": 0.08195088803768158, "learning_rate": 0.00029411649100805103, "loss": 0.4627, "step": 5400 }, { "gate_value": 0.1096612736582756, "icl_sequence_length": 78, "num_contexts": 3, "step": 5400 }, { "grad_norm": 0.05236297845840454, "learning_rate": 0.0002940820508106735, "loss": 0.4388, "step": 5410 }, { "gate_value": 0.10971608757972717, "icl_sequence_length": 92, "num_contexts": 3, "step": 5410 }, { "grad_norm": 0.0675286278128624, "learning_rate": 0.00029404751213457295, "loss": 0.4144, "step": 5420 }, { "gate_value": 0.1095128282904625, "icl_sequence_length": 90, "num_contexts": 3, "step": 5420 }, { "grad_norm": 0.05094461515545845, "learning_rate": 0.00029401287500335614, "loss": 0.4636, "step": 5430 }, { "gate_value": 0.10925433784723282, "icl_sequence_length": 88, "num_contexts": 3, "step": 5430 }, { "grad_norm": 0.03384535387158394, "learning_rate": 0.00029397813944069724, "loss": 0.4379, "step": 5440 }, { "gate_value": 0.1095501184463501, "icl_sequence_length": 88, "num_contexts": 3, "step": 5440 }, { "grad_norm": 0.055499665439128876, "learning_rate": 0.0002939433054703376, "loss": 0.4255, "step": 5450 }, { "gate_value": 0.1098218709230423, "icl_sequence_length": 90, "num_contexts": 3, "step": 5450 }, { "grad_norm": 0.07142087072134018, "learning_rate": 0.00029390837311608605, "loss": 0.4477, "step": 5460 }, { "gate_value": 0.10990960896015167, "icl_sequence_length": 66, "num_contexts": 3, "step": 5460 }, { "grad_norm": 0.08653406798839569, "learning_rate": 0.0002938733424018184, "loss": 0.4494, "step": 5470 }, { "gate_value": 0.10960792005062103, "icl_sequence_length": 76, "num_contexts": 3, "step": 5470 }, { "grad_norm": 0.10651569813489914, "learning_rate": 0.00029383821335147786, "loss": 0.4468, "step": 5480 }, { "gate_value": 0.10989756137132645, "icl_sequence_length": 90, "num_contexts": 3, "step": 5480 }, { "grad_norm": 0.036602914333343506, "learning_rate": 0.00029380298598907485, "loss": 0.4284, "step": 5490 }, { "gate_value": 0.10953273624181747, "icl_sequence_length": 76, "num_contexts": 3, "step": 5490 }, { "grad_norm": 0.04981496185064316, "learning_rate": 0.00029376766033868684, "loss": 0.4464, "step": 5500 }, { "gate_value": 0.10933338105678558, "icl_sequence_length": 74, "num_contexts": 3, "step": 5500 }, { "grad_norm": 0.03607625514268875, "learning_rate": 0.0002937322364244587, "loss": 0.4399, "step": 5510 }, { "gate_value": 0.11001531779766083, "icl_sequence_length": 92, "num_contexts": 3, "step": 5510 }, { "grad_norm": 0.048102978616952896, "learning_rate": 0.0002936967142706022, "loss": 0.4351, "step": 5520 }, { "gate_value": 0.1107465997338295, "icl_sequence_length": 84, "num_contexts": 3, "step": 5520 }, { "grad_norm": 0.07479461282491684, "learning_rate": 0.00029366109390139655, "loss": 0.4458, "step": 5530 }, { "gate_value": 0.11124894767999649, "icl_sequence_length": 62, "num_contexts": 3, "step": 5530 }, { "grad_norm": 0.07189257442951202, "learning_rate": 0.00029362537534118787, "loss": 0.4433, "step": 5540 }, { "gate_value": 0.11063691973686218, "icl_sequence_length": 88, "num_contexts": 3, "step": 5540 }, { "grad_norm": 0.047667935490608215, "learning_rate": 0.00029358955861438936, "loss": 0.4475, "step": 5550 }, { "gate_value": 0.11002617329359055, "icl_sequence_length": 62, "num_contexts": 3, "step": 5550 }, { "grad_norm": 0.0924759954214096, "learning_rate": 0.00029355364374548156, "loss": 0.4446, "step": 5560 }, { "gate_value": 0.10976750403642654, "icl_sequence_length": 76, "num_contexts": 3, "step": 5560 }, { "grad_norm": 0.06293515115976334, "learning_rate": 0.0002935176307590119, "loss": 0.4346, "step": 5570 }, { "gate_value": 0.10967667400836945, "icl_sequence_length": 86, "num_contexts": 3, "step": 5570 }, { "grad_norm": 0.06401614844799042, "learning_rate": 0.0002934815196795949, "loss": 0.4189, "step": 5580 }, { "gate_value": 0.1099473237991333, "icl_sequence_length": 86, "num_contexts": 3, "step": 5580 }, { "grad_norm": 0.09545626491308212, "learning_rate": 0.0002934453105319121, "loss": 0.4354, "step": 5590 }, { "gate_value": 0.11013666540384293, "icl_sequence_length": 82, "num_contexts": 3, "step": 5590 }, { "grad_norm": 0.04125199839472771, "learning_rate": 0.0002934090033407122, "loss": 0.4525, "step": 5600 }, { "gate_value": 0.1106431782245636, "icl_sequence_length": 88, "num_contexts": 3, "step": 5600 }, { "grad_norm": 0.04976991191506386, "learning_rate": 0.0002933725981308108, "loss": 0.436, "step": 5610 }, { "gate_value": 0.11142835021018982, "icl_sequence_length": 72, "num_contexts": 3, "step": 5610 }, { "grad_norm": 0.08596864342689514, "learning_rate": 0.0002933360949270905, "loss": 0.4333, "step": 5620 }, { "gate_value": 0.11180674284696579, "icl_sequence_length": 76, "num_contexts": 3, "step": 5620 }, { "grad_norm": 0.06628672033548355, "learning_rate": 0.0002932994937545009, "loss": 0.4294, "step": 5630 }, { "gate_value": 0.11271046847105026, "icl_sequence_length": 78, "num_contexts": 3, "step": 5630 }, { "grad_norm": 0.046628884971141815, "learning_rate": 0.0002932627946380585, "loss": 0.4305, "step": 5640 }, { "gate_value": 0.11313065141439438, "icl_sequence_length": 76, "num_contexts": 3, "step": 5640 }, { "grad_norm": 0.04577811062335968, "learning_rate": 0.0002932259976028469, "loss": 0.438, "step": 5650 }, { "gate_value": 0.11235907673835754, "icl_sequence_length": 82, "num_contexts": 3, "step": 5650 }, { "grad_norm": 0.09892956912517548, "learning_rate": 0.0002931891026740165, "loss": 0.4443, "step": 5660 }, { "gate_value": 0.1117844209074974, "icl_sequence_length": 80, "num_contexts": 3, "step": 5660 }, { "grad_norm": 0.04517688229680061, "learning_rate": 0.00029315210987678457, "loss": 0.4508, "step": 5670 }, { "gate_value": 0.11167077720165253, "icl_sequence_length": 96, "num_contexts": 3, "step": 5670 }, { "grad_norm": 0.08779658377170563, "learning_rate": 0.0002931150192364354, "loss": 0.4537, "step": 5680 }, { "gate_value": 0.11213234066963196, "icl_sequence_length": 82, "num_contexts": 3, "step": 5680 }, { "grad_norm": 0.13340982794761658, "learning_rate": 0.00029307783077832004, "loss": 0.4249, "step": 5690 }, { "gate_value": 0.11261526495218277, "icl_sequence_length": 82, "num_contexts": 3, "step": 5690 }, { "grad_norm": 0.052928730845451355, "learning_rate": 0.0002930405445278565, "loss": 0.4211, "step": 5700 }, { "gate_value": 0.1126897856593132, "icl_sequence_length": 92, "num_contexts": 3, "step": 5700 }, { "grad_norm": 0.06828558444976807, "learning_rate": 0.0002930031605105296, "loss": 0.4366, "step": 5710 }, { "gate_value": 0.11276299506425858, "icl_sequence_length": 78, "num_contexts": 3, "step": 5710 }, { "grad_norm": 0.04455326870083809, "learning_rate": 0.0002929656787518909, "loss": 0.4332, "step": 5720 }, { "gate_value": 0.11292887479066849, "icl_sequence_length": 58, "num_contexts": 3, "step": 5720 }, { "grad_norm": 0.14097444713115692, "learning_rate": 0.00029292809927755886, "loss": 0.4324, "step": 5730 }, { "gate_value": 0.11304690688848495, "icl_sequence_length": 76, "num_contexts": 3, "step": 5730 }, { "grad_norm": 0.03994961827993393, "learning_rate": 0.00029289042211321875, "loss": 0.4259, "step": 5740 }, { "gate_value": 0.11258487403392792, "icl_sequence_length": 86, "num_contexts": 3, "step": 5740 }, { "grad_norm": 0.03657664358615875, "learning_rate": 0.0002928526472846224, "loss": 0.4258, "step": 5750 }, { "gate_value": 0.1123301312327385, "icl_sequence_length": 90, "num_contexts": 3, "step": 5750 }, { "grad_norm": 0.07616100460290909, "learning_rate": 0.00029281477481758874, "loss": 0.4223, "step": 5760 }, { "gate_value": 0.11309382319450378, "icl_sequence_length": 88, "num_contexts": 3, "step": 5760 }, { "grad_norm": 0.08438175171613693, "learning_rate": 0.0002927768047380031, "loss": 0.4414, "step": 5770 }, { "gate_value": 0.11341430991888046, "icl_sequence_length": 86, "num_contexts": 3, "step": 5770 }, { "grad_norm": 0.03430478647351265, "learning_rate": 0.00029273873707181777, "loss": 0.4171, "step": 5780 }, { "gate_value": 0.11293953657150269, "icl_sequence_length": 76, "num_contexts": 3, "step": 5780 }, { "grad_norm": 0.08205977082252502, "learning_rate": 0.0002927005718450516, "loss": 0.4264, "step": 5790 }, { "gate_value": 0.11254054307937622, "icl_sequence_length": 76, "num_contexts": 3, "step": 5790 }, { "grad_norm": 0.07341016829013824, "learning_rate": 0.0002926623090837901, "loss": 0.4223, "step": 5800 }, { "gate_value": 0.11329421401023865, "icl_sequence_length": 82, "num_contexts": 3, "step": 5800 }, { "grad_norm": 0.06489834934473038, "learning_rate": 0.00029262394881418563, "loss": 0.4621, "step": 5810 }, { "gate_value": 0.11428499221801758, "icl_sequence_length": 86, "num_contexts": 3, "step": 5810 }, { "grad_norm": 0.1126764789223671, "learning_rate": 0.00029258549106245697, "loss": 0.4218, "step": 5820 }, { "gate_value": 0.11484973132610321, "icl_sequence_length": 86, "num_contexts": 3, "step": 5820 }, { "grad_norm": 0.04321017116308212, "learning_rate": 0.0002925469358548897, "loss": 0.4216, "step": 5830 }, { "gate_value": 0.11467601358890533, "icl_sequence_length": 64, "num_contexts": 3, "step": 5830 }, { "grad_norm": 0.12950527667999268, "learning_rate": 0.0002925082832178359, "loss": 0.4205, "step": 5840 }, { "gate_value": 0.11512420326471329, "icl_sequence_length": 88, "num_contexts": 3, "step": 5840 }, { "grad_norm": 0.03990564122796059, "learning_rate": 0.0002924695331777142, "loss": 0.4191, "step": 5850 }, { "gate_value": 0.11604516208171844, "icl_sequence_length": 80, "num_contexts": 3, "step": 5850 }, { "grad_norm": 0.03825264424085617, "learning_rate": 0.00029243068576101014, "loss": 0.4208, "step": 5860 }, { "gate_value": 0.1162797287106514, "icl_sequence_length": 76, "num_contexts": 3, "step": 5860 }, { "grad_norm": 0.16146664321422577, "learning_rate": 0.0002923917409942753, "loss": 0.4407, "step": 5870 }, { "gate_value": 0.11613252013921738, "icl_sequence_length": 90, "num_contexts": 3, "step": 5870 }, { "grad_norm": 0.06876587122678757, "learning_rate": 0.0002923526989041282, "loss": 0.4334, "step": 5880 }, { "gate_value": 0.1163325235247612, "icl_sequence_length": 82, "num_contexts": 3, "step": 5880 }, { "grad_norm": 0.12074162811040878, "learning_rate": 0.0002923135595172537, "loss": 0.4231, "step": 5890 }, { "gate_value": 0.11657026410102844, "icl_sequence_length": 78, "num_contexts": 3, "step": 5890 }, { "grad_norm": 0.04003237560391426, "learning_rate": 0.0002922743228604032, "loss": 0.4234, "step": 5900 }, { "gate_value": 0.11663009226322174, "icl_sequence_length": 64, "num_contexts": 3, "step": 5900 }, { "grad_norm": 0.04541704058647156, "learning_rate": 0.0002922349889603946, "loss": 0.4436, "step": 5910 }, { "gate_value": 0.11671672016382217, "icl_sequence_length": 78, "num_contexts": 3, "step": 5910 }, { "grad_norm": 0.06754721701145172, "learning_rate": 0.00029219555784411224, "loss": 0.4353, "step": 5920 }, { "gate_value": 0.1171480193734169, "icl_sequence_length": 68, "num_contexts": 3, "step": 5920 }, { "grad_norm": 0.04595010727643967, "learning_rate": 0.0002921560295385069, "loss": 0.4295, "step": 5930 }, { "gate_value": 0.11806542426347733, "icl_sequence_length": 72, "num_contexts": 3, "step": 5930 }, { "grad_norm": 0.04498855024576187, "learning_rate": 0.00029211640407059586, "loss": 0.437, "step": 5940 }, { "gate_value": 0.11824111640453339, "icl_sequence_length": 64, "num_contexts": 3, "step": 5940 }, { "grad_norm": 0.07534932345151901, "learning_rate": 0.0002920766814674627, "loss": 0.4199, "step": 5950 }, { "gate_value": 0.11814270913600922, "icl_sequence_length": 80, "num_contexts": 3, "step": 5950 }, { "grad_norm": 0.05595272779464722, "learning_rate": 0.00029203686175625747, "loss": 0.438, "step": 5960 }, { "gate_value": 0.11774881929159164, "icl_sequence_length": 70, "num_contexts": 3, "step": 5960 }, { "grad_norm": 0.0870402529835701, "learning_rate": 0.0002919969449641965, "loss": 0.4298, "step": 5970 }, { "gate_value": 0.1175171509385109, "icl_sequence_length": 76, "num_contexts": 3, "step": 5970 }, { "grad_norm": 0.03920748084783554, "learning_rate": 0.00029195693111856263, "loss": 0.4324, "step": 5980 }, { "gate_value": 0.11840686947107315, "icl_sequence_length": 64, "num_contexts": 3, "step": 5980 }, { "grad_norm": 0.10713187605142593, "learning_rate": 0.00029191682024670495, "loss": 0.4407, "step": 5990 }, { "gate_value": 0.11843453347682953, "icl_sequence_length": 72, "num_contexts": 3, "step": 5990 }, { "grad_norm": 0.1207190528512001, "learning_rate": 0.00029187661237603876, "loss": 0.423, "step": 6000 }, { "gate_value": 0.11844220012426376, "icl_sequence_length": 68, "num_contexts": 3, "step": 6000 }, { "grad_norm": 0.049522001296281815, "learning_rate": 0.0002918363075340459, "loss": 0.4376, "step": 6010 }, { "gate_value": 0.11861016601324081, "icl_sequence_length": 84, "num_contexts": 3, "step": 6010 }, { "grad_norm": 0.05734136328101158, "learning_rate": 0.00029179590574827426, "loss": 0.4061, "step": 6020 }, { "gate_value": 0.11852915585041046, "icl_sequence_length": 72, "num_contexts": 3, "step": 6020 }, { "grad_norm": 0.09304312616586685, "learning_rate": 0.00029175540704633803, "loss": 0.4322, "step": 6030 }, { "gate_value": 0.11866278946399689, "icl_sequence_length": 70, "num_contexts": 3, "step": 6030 }, { "grad_norm": 0.05417775362730026, "learning_rate": 0.00029171481145591786, "loss": 0.422, "step": 6040 }, { "gate_value": 0.11929836869239807, "icl_sequence_length": 90, "num_contexts": 3, "step": 6040 }, { "grad_norm": 0.07325364649295807, "learning_rate": 0.00029167411900476027, "loss": 0.4098, "step": 6050 }, { "gate_value": 0.11958464235067368, "icl_sequence_length": 72, "num_contexts": 3, "step": 6050 }, { "grad_norm": 0.10998177528381348, "learning_rate": 0.0002916333297206783, "loss": 0.424, "step": 6060 }, { "gate_value": 0.11943252384662628, "icl_sequence_length": 68, "num_contexts": 3, "step": 6060 }, { "grad_norm": 0.07327587902545929, "learning_rate": 0.00029159244363155095, "loss": 0.4154, "step": 6070 }, { "gate_value": 0.11973727494478226, "icl_sequence_length": 62, "num_contexts": 3, "step": 6070 }, { "grad_norm": 0.047259047627449036, "learning_rate": 0.0002915514607653235, "loss": 0.4261, "step": 6080 }, { "gate_value": 0.12109141051769257, "icl_sequence_length": 80, "num_contexts": 3, "step": 6080 }, { "grad_norm": 0.051075082272291183, "learning_rate": 0.0002915103811500074, "loss": 0.4125, "step": 6090 }, { "gate_value": 0.12103540450334549, "icl_sequence_length": 90, "num_contexts": 3, "step": 6090 }, { "grad_norm": 0.055391453206539154, "learning_rate": 0.00029146920481368016, "loss": 0.4215, "step": 6100 }, { "gate_value": 0.12145233154296875, "icl_sequence_length": 78, "num_contexts": 3, "step": 6100 }, { "grad_norm": 0.07134224474430084, "learning_rate": 0.0002914279317844854, "loss": 0.4245, "step": 6110 }, { "gate_value": 0.12148108333349228, "icl_sequence_length": 76, "num_contexts": 3, "step": 6110 }, { "grad_norm": 0.16808076202869415, "learning_rate": 0.0002913865620906328, "loss": 0.4159, "step": 6120 }, { "gate_value": 0.12126853317022324, "icl_sequence_length": 86, "num_contexts": 3, "step": 6120 }, { "grad_norm": 0.22057221829891205, "learning_rate": 0.00029134509576039824, "loss": 0.4278, "step": 6130 }, { "gate_value": 0.12128766626119614, "icl_sequence_length": 82, "num_contexts": 3, "step": 6130 }, { "grad_norm": 0.09868735820055008, "learning_rate": 0.0002913035328221236, "loss": 0.4194, "step": 6140 }, { "gate_value": 0.12114259600639343, "icl_sequence_length": 68, "num_contexts": 3, "step": 6140 }, { "grad_norm": 0.08045390993356705, "learning_rate": 0.0002912618733042166, "loss": 0.4242, "step": 6150 }, { "gate_value": 0.12184974551200867, "icl_sequence_length": 78, "num_contexts": 3, "step": 6150 }, { "grad_norm": 0.04861131310462952, "learning_rate": 0.00029122011723515124, "loss": 0.435, "step": 6160 }, { "gate_value": 0.1218714788556099, "icl_sequence_length": 84, "num_contexts": 3, "step": 6160 }, { "grad_norm": 0.09223391115665436, "learning_rate": 0.00029117826464346736, "loss": 0.429, "step": 6170 }, { "gate_value": 0.1220831573009491, "icl_sequence_length": 88, "num_contexts": 3, "step": 6170 }, { "grad_norm": 0.15670324862003326, "learning_rate": 0.00029113631555777083, "loss": 0.4133, "step": 6180 }, { "gate_value": 0.12261885404586792, "icl_sequence_length": 66, "num_contexts": 3, "step": 6180 }, { "grad_norm": 0.053096942603588104, "learning_rate": 0.0002910942700067335, "loss": 0.4304, "step": 6190 }, { "gate_value": 0.12274584919214249, "icl_sequence_length": 76, "num_contexts": 3, "step": 6190 }, { "grad_norm": 0.10016176104545593, "learning_rate": 0.000291052128019093, "loss": 0.4395, "step": 6200 }, { "gate_value": 0.12318729609251022, "icl_sequence_length": 76, "num_contexts": 3, "step": 6200 }, { "grad_norm": 0.21103353798389435, "learning_rate": 0.0002910098896236531, "loss": 0.4286, "step": 6210 }, { "gate_value": 0.12366920709609985, "icl_sequence_length": 88, "num_contexts": 3, "step": 6210 }, { "grad_norm": 0.08487042039632797, "learning_rate": 0.0002909675548492832, "loss": 0.4232, "step": 6220 }, { "gate_value": 0.12395383417606354, "icl_sequence_length": 86, "num_contexts": 3, "step": 6220 }, { "grad_norm": 0.055605944246053696, "learning_rate": 0.0002909251237249189, "loss": 0.4357, "step": 6230 }, { "gate_value": 0.12388145923614502, "icl_sequence_length": 70, "num_contexts": 3, "step": 6230 }, { "grad_norm": 0.09035728126764297, "learning_rate": 0.00029088259627956133, "loss": 0.4323, "step": 6240 }, { "gate_value": 0.12410308420658112, "icl_sequence_length": 76, "num_contexts": 3, "step": 6240 }, { "grad_norm": 0.05201800540089607, "learning_rate": 0.00029083997254227765, "loss": 0.4266, "step": 6250 }, { "gate_value": 0.1233508288860321, "icl_sequence_length": 64, "num_contexts": 3, "step": 6250 }, { "grad_norm": 0.061280108988285065, "learning_rate": 0.0002907972525422008, "loss": 0.4351, "step": 6260 }, { "gate_value": 0.12278716266155243, "icl_sequence_length": 90, "num_contexts": 3, "step": 6260 }, { "grad_norm": 0.05171182006597519, "learning_rate": 0.00029075443630852945, "loss": 0.421, "step": 6270 }, { "gate_value": 0.12314655631780624, "icl_sequence_length": 78, "num_contexts": 3, "step": 6270 }, { "grad_norm": 0.0474497452378273, "learning_rate": 0.00029071152387052815, "loss": 0.4292, "step": 6280 }, { "gate_value": 0.12386977672576904, "icl_sequence_length": 90, "num_contexts": 3, "step": 6280 }, { "grad_norm": 0.06959555298089981, "learning_rate": 0.0002906685152575271, "loss": 0.4106, "step": 6290 }, { "gate_value": 0.12407379597425461, "icl_sequence_length": 84, "num_contexts": 3, "step": 6290 }, { "grad_norm": 0.04566885530948639, "learning_rate": 0.00029062541049892227, "loss": 0.4241, "step": 6300 }, { "gate_value": 0.1249006986618042, "icl_sequence_length": 86, "num_contexts": 3, "step": 6300 }, { "grad_norm": 0.052750058472156525, "learning_rate": 0.0002905822096241754, "loss": 0.4299, "step": 6310 }, { "gate_value": 0.12427221238613129, "icl_sequence_length": 90, "num_contexts": 3, "step": 6310 }, { "grad_norm": 0.03595830127596855, "learning_rate": 0.0002905389126628139, "loss": 0.4322, "step": 6320 }, { "gate_value": 0.12405462563037872, "icl_sequence_length": 78, "num_contexts": 3, "step": 6320 }, { "grad_norm": 0.03769196197390556, "learning_rate": 0.0002904955196444307, "loss": 0.4129, "step": 6330 }, { "gate_value": 0.12411148101091385, "icl_sequence_length": 68, "num_contexts": 3, "step": 6330 }, { "grad_norm": 0.12260931730270386, "learning_rate": 0.0002904520305986847, "loss": 0.4202, "step": 6340 }, { "gate_value": 0.12482518702745438, "icl_sequence_length": 72, "num_contexts": 3, "step": 6340 }, { "grad_norm": 0.09134802967309952, "learning_rate": 0.00029040844555530015, "loss": 0.4243, "step": 6350 }, { "gate_value": 0.12580612301826477, "icl_sequence_length": 70, "num_contexts": 3, "step": 6350 }, { "grad_norm": 0.14725758135318756, "learning_rate": 0.00029036476454406704, "loss": 0.426, "step": 6360 }, { "gate_value": 0.12593281269073486, "icl_sequence_length": 72, "num_contexts": 3, "step": 6360 }, { "grad_norm": 0.11491762846708298, "learning_rate": 0.0002903209875948409, "loss": 0.4294, "step": 6370 }, { "gate_value": 0.12502771615982056, "icl_sequence_length": 58, "num_contexts": 3, "step": 6370 }, { "grad_norm": 0.0558735616505146, "learning_rate": 0.0002902771147375429, "loss": 0.4401, "step": 6380 }, { "gate_value": 0.12525469064712524, "icl_sequence_length": 78, "num_contexts": 3, "step": 6380 }, { "grad_norm": 0.07469120621681213, "learning_rate": 0.0002902331460021597, "loss": 0.4246, "step": 6390 }, { "gate_value": 0.12590870261192322, "icl_sequence_length": 84, "num_contexts": 3, "step": 6390 }, { "grad_norm": 0.11152400076389313, "learning_rate": 0.00029018908141874354, "loss": 0.4326, "step": 6400 }, { "gate_value": 0.12628869712352753, "icl_sequence_length": 64, "num_contexts": 3, "step": 6400 }, { "grad_norm": 0.1487109363079071, "learning_rate": 0.0002901449210174122, "loss": 0.4229, "step": 6410 }, { "gate_value": 0.12687274813652039, "icl_sequence_length": 80, "num_contexts": 3, "step": 6410 }, { "grad_norm": 0.08015327900648117, "learning_rate": 0.00029010066482834874, "loss": 0.4251, "step": 6420 }, { "gate_value": 0.12663473188877106, "icl_sequence_length": 78, "num_contexts": 3, "step": 6420 }, { "grad_norm": 0.06717225909233093, "learning_rate": 0.00029005631288180197, "loss": 0.4367, "step": 6430 }, { "gate_value": 0.12608718872070312, "icl_sequence_length": 86, "num_contexts": 3, "step": 6430 }, { "grad_norm": 0.3730805516242981, "learning_rate": 0.000290011865208086, "loss": 0.4288, "step": 6440 }, { "gate_value": 0.1260022222995758, "icl_sequence_length": 62, "num_contexts": 3, "step": 6440 }, { "grad_norm": 0.05443867668509483, "learning_rate": 0.0002899673218375804, "loss": 0.4078, "step": 6450 }, { "gate_value": 0.126522958278656, "icl_sequence_length": 82, "num_contexts": 3, "step": 6450 }, { "grad_norm": 0.06876882910728455, "learning_rate": 0.00028992268280073015, "loss": 0.4211, "step": 6460 }, { "gate_value": 0.12633737921714783, "icl_sequence_length": 86, "num_contexts": 3, "step": 6460 }, { "grad_norm": 0.035338129848241806, "learning_rate": 0.00028987794812804555, "loss": 0.4314, "step": 6470 }, { "gate_value": 0.12628173828125, "icl_sequence_length": 76, "num_contexts": 3, "step": 6470 }, { "grad_norm": 0.04577523469924927, "learning_rate": 0.00028983311785010237, "loss": 0.4155, "step": 6480 }, { "gate_value": 0.12714733183383942, "icl_sequence_length": 80, "num_contexts": 3, "step": 6480 }, { "grad_norm": 0.16886325180530548, "learning_rate": 0.0002897881919975417, "loss": 0.4331, "step": 6490 }, { "gate_value": 0.1283843070268631, "icl_sequence_length": 76, "num_contexts": 3, "step": 6490 }, { "grad_norm": 0.04098428413271904, "learning_rate": 0.00028974317060106997, "loss": 0.4384, "step": 6500 }, { "gate_value": 0.12877629697322845, "icl_sequence_length": 58, "num_contexts": 3, "step": 6500 }, { "grad_norm": 0.04261775314807892, "learning_rate": 0.0002896980536914588, "loss": 0.4219, "step": 6510 }, { "gate_value": 0.12851965427398682, "icl_sequence_length": 68, "num_contexts": 3, "step": 6510 }, { "grad_norm": 0.09274916350841522, "learning_rate": 0.0002896528412995452, "loss": 0.4393, "step": 6520 }, { "gate_value": 0.12854821979999542, "icl_sequence_length": 64, "num_contexts": 3, "step": 6520 }, { "grad_norm": 0.04320213571190834, "learning_rate": 0.00028960753345623144, "loss": 0.4236, "step": 6530 }, { "gate_value": 0.1293092519044876, "icl_sequence_length": 90, "num_contexts": 3, "step": 6530 }, { "grad_norm": 0.07464089244604111, "learning_rate": 0.000289562130192485, "loss": 0.4226, "step": 6540 }, { "gate_value": 0.12948086857795715, "icl_sequence_length": 72, "num_contexts": 3, "step": 6540 }, { "grad_norm": 0.0393485464155674, "learning_rate": 0.0002895166315393385, "loss": 0.4291, "step": 6550 }, { "gate_value": 0.1301802545785904, "icl_sequence_length": 62, "num_contexts": 3, "step": 6550 }, { "grad_norm": 0.062148161232471466, "learning_rate": 0.00028947103752788994, "loss": 0.4134, "step": 6560 }, { "gate_value": 0.13075850903987885, "icl_sequence_length": 78, "num_contexts": 3, "step": 6560 }, { "grad_norm": 0.04033559188246727, "learning_rate": 0.00028942534818930237, "loss": 0.42, "step": 6570 }, { "gate_value": 0.13082143664360046, "icl_sequence_length": 78, "num_contexts": 3, "step": 6570 }, { "grad_norm": 0.045840464532375336, "learning_rate": 0.00028937956355480404, "loss": 0.4349, "step": 6580 }, { "gate_value": 0.13069741427898407, "icl_sequence_length": 72, "num_contexts": 3, "step": 6580 }, { "grad_norm": 0.059474021196365356, "learning_rate": 0.00028933368365568823, "loss": 0.4269, "step": 6590 }, { "gate_value": 0.13073182106018066, "icl_sequence_length": 78, "num_contexts": 3, "step": 6590 }, { "grad_norm": 0.04966716840863228, "learning_rate": 0.0002892877085233135, "loss": 0.4253, "step": 6600 }, { "gate_value": 0.1308085173368454, "icl_sequence_length": 76, "num_contexts": 3, "step": 6600 }, { "grad_norm": 0.04608655348420143, "learning_rate": 0.0002892416381891034, "loss": 0.4356, "step": 6610 }, { "gate_value": 0.13023337721824646, "icl_sequence_length": 82, "num_contexts": 3, "step": 6610 }, { "grad_norm": 0.05291171744465828, "learning_rate": 0.0002891954726845466, "loss": 0.4167, "step": 6620 }, { "gate_value": 0.1300569474697113, "icl_sequence_length": 66, "num_contexts": 3, "step": 6620 }, { "grad_norm": 0.08100378513336182, "learning_rate": 0.0002891492120411967, "loss": 0.4268, "step": 6630 }, { "gate_value": 0.13093367218971252, "icl_sequence_length": 94, "num_contexts": 3, "step": 6630 }, { "grad_norm": 0.047419607639312744, "learning_rate": 0.00028910285629067255, "loss": 0.416, "step": 6640 }, { "gate_value": 0.13165777921676636, "icl_sequence_length": 70, "num_contexts": 3, "step": 6640 }, { "grad_norm": 0.34848541021347046, "learning_rate": 0.0002890564054646577, "loss": 0.4324, "step": 6650 }, { "gate_value": 0.13223788142204285, "icl_sequence_length": 68, "num_contexts": 3, "step": 6650 }, { "grad_norm": 0.05068269744515419, "learning_rate": 0.000289009859594901, "loss": 0.4246, "step": 6660 }, { "gate_value": 0.1314765065908432, "icl_sequence_length": 76, "num_contexts": 3, "step": 6660 }, { "grad_norm": 0.12157876789569855, "learning_rate": 0.00028896321871321604, "loss": 0.4196, "step": 6670 }, { "gate_value": 0.13127507269382477, "icl_sequence_length": 84, "num_contexts": 3, "step": 6670 }, { "grad_norm": 0.06361397355794907, "learning_rate": 0.0002889164828514814, "loss": 0.4387, "step": 6680 }, { "gate_value": 0.1312800794839859, "icl_sequence_length": 88, "num_contexts": 3, "step": 6680 }, { "grad_norm": 0.08461131155490875, "learning_rate": 0.00028886965204164065, "loss": 0.4159, "step": 6690 }, { "gate_value": 0.13146339356899261, "icl_sequence_length": 82, "num_contexts": 3, "step": 6690 }, { "grad_norm": 0.11955706030130386, "learning_rate": 0.0002888227263157022, "loss": 0.4184, "step": 6700 }, { "gate_value": 0.13245995342731476, "icl_sequence_length": 60, "num_contexts": 3, "step": 6700 }, { "grad_norm": 0.2205382138490677, "learning_rate": 0.00028877570570573936, "loss": 0.4241, "step": 6710 }, { "gate_value": 0.13298162817955017, "icl_sequence_length": 90, "num_contexts": 3, "step": 6710 }, { "grad_norm": 0.07158178836107254, "learning_rate": 0.0002887285902438902, "loss": 0.4351, "step": 6720 }, { "gate_value": 0.13305675983428955, "icl_sequence_length": 86, "num_contexts": 3, "step": 6720 }, { "grad_norm": 0.08344139903783798, "learning_rate": 0.0002886813799623578, "loss": 0.4173, "step": 6730 }, { "gate_value": 0.132954940199852, "icl_sequence_length": 84, "num_contexts": 3, "step": 6730 }, { "grad_norm": 0.061295922845602036, "learning_rate": 0.0002886340748934098, "loss": 0.4237, "step": 6740 }, { "gate_value": 0.13344383239746094, "icl_sequence_length": 84, "num_contexts": 3, "step": 6740 }, { "grad_norm": 0.1391419768333435, "learning_rate": 0.0002885866750693789, "loss": 0.4175, "step": 6750 }, { "gate_value": 0.13365666568279266, "icl_sequence_length": 76, "num_contexts": 3, "step": 6750 }, { "grad_norm": 0.08864652365446091, "learning_rate": 0.0002885391805226624, "loss": 0.422, "step": 6760 }, { "gate_value": 0.13373038172721863, "icl_sequence_length": 74, "num_contexts": 3, "step": 6760 }, { "grad_norm": 0.04955292120575905, "learning_rate": 0.0002884915912857223, "loss": 0.3975, "step": 6770 }, { "gate_value": 0.13424073159694672, "icl_sequence_length": 88, "num_contexts": 3, "step": 6770 }, { "grad_norm": 0.11102047562599182, "learning_rate": 0.0002884439073910855, "loss": 0.4205, "step": 6780 }, { "gate_value": 0.1342160701751709, "icl_sequence_length": 80, "num_contexts": 3, "step": 6780 }, { "grad_norm": 0.09720852971076965, "learning_rate": 0.00028839612887134346, "loss": 0.4218, "step": 6790 }, { "gate_value": 0.13402903079986572, "icl_sequence_length": 66, "num_contexts": 3, "step": 6790 }, { "grad_norm": 0.04908115044236183, "learning_rate": 0.0002883482557591523, "loss": 0.4236, "step": 6800 }, { "gate_value": 0.13415250182151794, "icl_sequence_length": 78, "num_contexts": 3, "step": 6800 }, { "grad_norm": 0.072776198387146, "learning_rate": 0.00028830028808723285, "loss": 0.4377, "step": 6810 }, { "gate_value": 0.1337648630142212, "icl_sequence_length": 66, "num_contexts": 3, "step": 6810 }, { "grad_norm": 0.04838380590081215, "learning_rate": 0.00028825222588837063, "loss": 0.4093, "step": 6820 }, { "gate_value": 0.13404372334480286, "icl_sequence_length": 94, "num_contexts": 3, "step": 6820 }, { "grad_norm": 0.04791904240846634, "learning_rate": 0.0002882040691954157, "loss": 0.4106, "step": 6830 }, { "gate_value": 0.13330905139446259, "icl_sequence_length": 68, "num_contexts": 3, "step": 6830 }, { "grad_norm": 0.13278383016586304, "learning_rate": 0.0002881558180412826, "loss": 0.4299, "step": 6840 }, { "gate_value": 0.13309334218502045, "icl_sequence_length": 84, "num_contexts": 3, "step": 6840 }, { "grad_norm": 0.05918443202972412, "learning_rate": 0.0002881074724589506, "loss": 0.4255, "step": 6850 }, { "gate_value": 0.13352347910404205, "icl_sequence_length": 74, "num_contexts": 3, "step": 6850 }, { "grad_norm": 0.08290646970272064, "learning_rate": 0.00028805903248146344, "loss": 0.4237, "step": 6860 }, { "gate_value": 0.1338224560022354, "icl_sequence_length": 60, "num_contexts": 3, "step": 6860 }, { "grad_norm": 0.03945367410778999, "learning_rate": 0.00028801049814192945, "loss": 0.403, "step": 6870 }, { "gate_value": 0.13464081287384033, "icl_sequence_length": 72, "num_contexts": 3, "step": 6870 }, { "grad_norm": 0.06238861754536629, "learning_rate": 0.0002879618694735213, "loss": 0.4148, "step": 6880 }, { "gate_value": 0.13544297218322754, "icl_sequence_length": 86, "num_contexts": 3, "step": 6880 }, { "grad_norm": 0.06959348171949387, "learning_rate": 0.00028791314650947626, "loss": 0.4279, "step": 6890 }, { "gate_value": 0.1356019675731659, "icl_sequence_length": 76, "num_contexts": 3, "step": 6890 }, { "grad_norm": 0.1363728791475296, "learning_rate": 0.00028786432928309605, "loss": 0.3996, "step": 6900 }, { "gate_value": 0.13566221296787262, "icl_sequence_length": 74, "num_contexts": 3, "step": 6900 }, { "grad_norm": 0.1169843077659607, "learning_rate": 0.00028781541782774676, "loss": 0.4206, "step": 6910 }, { "gate_value": 0.13622631132602692, "icl_sequence_length": 74, "num_contexts": 3, "step": 6910 }, { "grad_norm": 0.22626587748527527, "learning_rate": 0.0002877664121768589, "loss": 0.4174, "step": 6920 }, { "gate_value": 0.1364203840494156, "icl_sequence_length": 82, "num_contexts": 3, "step": 6920 }, { "grad_norm": 0.05224921554327011, "learning_rate": 0.00028771731236392736, "loss": 0.4334, "step": 6930 }, { "gate_value": 0.13636702299118042, "icl_sequence_length": 74, "num_contexts": 3, "step": 6930 }, { "grad_norm": 0.07503295689821243, "learning_rate": 0.00028766811842251147, "loss": 0.4216, "step": 6940 }, { "gate_value": 0.13606546819210052, "icl_sequence_length": 88, "num_contexts": 3, "step": 6940 }, { "grad_norm": 0.09540563076734543, "learning_rate": 0.0002876188303862347, "loss": 0.4383, "step": 6950 }, { "gate_value": 0.13603177666664124, "icl_sequence_length": 84, "num_contexts": 3, "step": 6950 }, { "grad_norm": 0.0879683643579483, "learning_rate": 0.00028756944828878505, "loss": 0.4354, "step": 6960 }, { "gate_value": 0.1359747350215912, "icl_sequence_length": 64, "num_contexts": 3, "step": 6960 }, { "grad_norm": 0.12022677063941956, "learning_rate": 0.0002875199721639147, "loss": 0.4248, "step": 6970 }, { "gate_value": 0.13565360009670258, "icl_sequence_length": 76, "num_contexts": 3, "step": 6970 }, { "grad_norm": 0.16428843140602112, "learning_rate": 0.0002874704020454401, "loss": 0.4288, "step": 6980 }, { "gate_value": 0.13588224351406097, "icl_sequence_length": 86, "num_contexts": 3, "step": 6980 }, { "grad_norm": 0.099467933177948, "learning_rate": 0.000287420737967242, "loss": 0.4192, "step": 6990 }, { "gate_value": 0.13614501059055328, "icl_sequence_length": 78, "num_contexts": 3, "step": 6990 }, { "grad_norm": 0.19383247196674347, "learning_rate": 0.00028737097996326533, "loss": 0.4215, "step": 7000 }, { "gate_value": 0.1362755447626114, "icl_sequence_length": 66, "num_contexts": 3, "step": 7000 }, { "grad_norm": 0.056437231600284576, "learning_rate": 0.0002873211280675191, "loss": 0.4208, "step": 7010 }, { "gate_value": 0.13694988191127777, "icl_sequence_length": 90, "num_contexts": 3, "step": 7010 }, { "grad_norm": 0.14202746748924255, "learning_rate": 0.0002872711823140768, "loss": 0.4161, "step": 7020 }, { "gate_value": 0.13738933205604553, "icl_sequence_length": 82, "num_contexts": 3, "step": 7020 }, { "grad_norm": 0.07661598175764084, "learning_rate": 0.0002872211427370756, "loss": 0.4003, "step": 7030 }, { "gate_value": 0.13746806979179382, "icl_sequence_length": 68, "num_contexts": 3, "step": 7030 }, { "grad_norm": 0.0649000033736229, "learning_rate": 0.00028717100937071744, "loss": 0.4194, "step": 7040 }, { "gate_value": 0.13786818087100983, "icl_sequence_length": 92, "num_contexts": 3, "step": 7040 }, { "grad_norm": 0.09948837757110596, "learning_rate": 0.0002871207822492678, "loss": 0.4135, "step": 7050 }, { "gate_value": 0.13835233449935913, "icl_sequence_length": 82, "num_contexts": 3, "step": 7050 }, { "grad_norm": 0.058601900935173035, "learning_rate": 0.0002870704614070564, "loss": 0.4143, "step": 7060 }, { "gate_value": 0.13775308430194855, "icl_sequence_length": 64, "num_contexts": 3, "step": 7060 }, { "grad_norm": 0.06780196726322174, "learning_rate": 0.0002870200468784771, "loss": 0.4234, "step": 7070 }, { "gate_value": 0.13795600831508636, "icl_sequence_length": 72, "num_contexts": 3, "step": 7070 }, { "grad_norm": 0.38374415040016174, "learning_rate": 0.00028696953869798784, "loss": 0.441, "step": 7080 }, { "gate_value": 0.13810260593891144, "icl_sequence_length": 90, "num_contexts": 3, "step": 7080 }, { "grad_norm": 0.09250040352344513, "learning_rate": 0.00028691893690011044, "loss": 0.4311, "step": 7090 }, { "gate_value": 0.13826362788677216, "icl_sequence_length": 84, "num_contexts": 3, "step": 7090 }, { "grad_norm": 0.07739558070898056, "learning_rate": 0.00028686824151943067, "loss": 0.409, "step": 7100 }, { "gate_value": 0.13859573006629944, "icl_sequence_length": 88, "num_contexts": 3, "step": 7100 }, { "grad_norm": 0.055045321583747864, "learning_rate": 0.0002868174525905985, "loss": 0.4195, "step": 7110 }, { "gate_value": 0.13870957493782043, "icl_sequence_length": 72, "num_contexts": 3, "step": 7110 }, { "grad_norm": 0.07911080121994019, "learning_rate": 0.0002867665701483275, "loss": 0.4032, "step": 7120 }, { "gate_value": 0.1389653980731964, "icl_sequence_length": 80, "num_contexts": 3, "step": 7120 }, { "grad_norm": 0.10772552341222763, "learning_rate": 0.0002867155942273955, "loss": 0.4077, "step": 7130 }, { "gate_value": 0.1391928791999817, "icl_sequence_length": 74, "num_contexts": 3, "step": 7130 }, { "grad_norm": 0.10482678562402725, "learning_rate": 0.00028666452486264397, "loss": 0.3997, "step": 7140 }, { "gate_value": 0.13888011872768402, "icl_sequence_length": 60, "num_contexts": 3, "step": 7140 }, { "grad_norm": 0.1086043193936348, "learning_rate": 0.00028661336208897834, "loss": 0.4302, "step": 7150 }, { "gate_value": 0.1387721598148346, "icl_sequence_length": 80, "num_contexts": 3, "step": 7150 }, { "grad_norm": 0.04895766079425812, "learning_rate": 0.00028656210594136795, "loss": 0.4257, "step": 7160 }, { "gate_value": 0.13895726203918457, "icl_sequence_length": 88, "num_contexts": 3, "step": 7160 }, { "grad_norm": 0.11810770630836487, "learning_rate": 0.00028651075645484583, "loss": 0.434, "step": 7170 }, { "gate_value": 0.1390237957239151, "icl_sequence_length": 70, "num_contexts": 3, "step": 7170 }, { "grad_norm": 0.05848957598209381, "learning_rate": 0.00028645931366450895, "loss": 0.4324, "step": 7180 }, { "gate_value": 0.1383313685655594, "icl_sequence_length": 66, "num_contexts": 3, "step": 7180 }, { "grad_norm": 0.160471111536026, "learning_rate": 0.0002864077776055178, "loss": 0.4032, "step": 7190 }, { "gate_value": 0.13848836719989777, "icl_sequence_length": 82, "num_contexts": 3, "step": 7190 }, { "grad_norm": 0.048046983778476715, "learning_rate": 0.000286356148313097, "loss": 0.4204, "step": 7200 }, { "gate_value": 0.13866883516311646, "icl_sequence_length": 70, "num_contexts": 3, "step": 7200 }, { "grad_norm": 0.2954862117767334, "learning_rate": 0.0002863044258225346, "loss": 0.4167, "step": 7210 }, { "gate_value": 0.1384095996618271, "icl_sequence_length": 58, "num_contexts": 3, "step": 7210 }, { "grad_norm": 0.05166821926832199, "learning_rate": 0.0002862526101691824, "loss": 0.4328, "step": 7220 }, { "gate_value": 0.1390955001115799, "icl_sequence_length": 76, "num_contexts": 3, "step": 7220 }, { "grad_norm": 0.05716368183493614, "learning_rate": 0.0002862007013884559, "loss": 0.4287, "step": 7230 }, { "gate_value": 0.13900712132453918, "icl_sequence_length": 64, "num_contexts": 3, "step": 7230 }, { "grad_norm": 0.13214118778705597, "learning_rate": 0.0002861486995158343, "loss": 0.4218, "step": 7240 }, { "gate_value": 0.13993008434772491, "icl_sequence_length": 84, "num_contexts": 3, "step": 7240 }, { "grad_norm": 0.052904047071933746, "learning_rate": 0.00028609660458686045, "loss": 0.4396, "step": 7250 }, { "gate_value": 0.14042124152183533, "icl_sequence_length": 90, "num_contexts": 3, "step": 7250 }, { "grad_norm": 0.13447511196136475, "learning_rate": 0.00028604441663714064, "loss": 0.4158, "step": 7260 }, { "gate_value": 0.14117123186588287, "icl_sequence_length": 88, "num_contexts": 3, "step": 7260 }, { "grad_norm": 0.06425243616104126, "learning_rate": 0.00028599213570234486, "loss": 0.4152, "step": 7270 }, { "gate_value": 0.14089760184288025, "icl_sequence_length": 76, "num_contexts": 3, "step": 7270 }, { "grad_norm": 0.053561557084321976, "learning_rate": 0.0002859397618182067, "loss": 0.4311, "step": 7280 }, { "gate_value": 0.140683114528656, "icl_sequence_length": 94, "num_contexts": 3, "step": 7280 }, { "grad_norm": 0.1397479921579361, "learning_rate": 0.0002858872950205231, "loss": 0.4313, "step": 7290 }, { "gate_value": 0.1405588686466217, "icl_sequence_length": 82, "num_contexts": 3, "step": 7290 }, { "grad_norm": 0.04807320237159729, "learning_rate": 0.0002858347353451548, "loss": 0.4294, "step": 7300 }, { "gate_value": 0.14052005112171173, "icl_sequence_length": 88, "num_contexts": 3, "step": 7300 }, { "grad_norm": 0.055733080953359604, "learning_rate": 0.0002857820828280257, "loss": 0.4211, "step": 7310 }, { "gate_value": 0.140594482421875, "icl_sequence_length": 82, "num_contexts": 3, "step": 7310 }, { "grad_norm": 0.048516325652599335, "learning_rate": 0.00028572933750512327, "loss": 0.4204, "step": 7320 }, { "gate_value": 0.14103937149047852, "icl_sequence_length": 82, "num_contexts": 3, "step": 7320 }, { "grad_norm": 0.05917530134320259, "learning_rate": 0.00028567649941249856, "loss": 0.4381, "step": 7330 }, { "gate_value": 0.14163275063037872, "icl_sequence_length": 84, "num_contexts": 3, "step": 7330 }, { "grad_norm": 0.06968928873538971, "learning_rate": 0.00028562356858626584, "loss": 0.4161, "step": 7340 }, { "gate_value": 0.141965851187706, "icl_sequence_length": 90, "num_contexts": 3, "step": 7340 }, { "grad_norm": 0.07425074279308319, "learning_rate": 0.0002855705450626028, "loss": 0.4276, "step": 7350 }, { "gate_value": 0.14220909774303436, "icl_sequence_length": 80, "num_contexts": 3, "step": 7350 }, { "grad_norm": 0.057516731321811676, "learning_rate": 0.00028551742887775064, "loss": 0.4164, "step": 7360 }, { "gate_value": 0.14302530884742737, "icl_sequence_length": 76, "num_contexts": 3, "step": 7360 }, { "grad_norm": 0.1238698661327362, "learning_rate": 0.0002854642200680137, "loss": 0.4192, "step": 7370 }, { "gate_value": 0.1436849981546402, "icl_sequence_length": 90, "num_contexts": 3, "step": 7370 }, { "grad_norm": 0.056957390159368515, "learning_rate": 0.00028541091866975967, "loss": 0.4155, "step": 7380 }, { "gate_value": 0.14387303590774536, "icl_sequence_length": 60, "num_contexts": 3, "step": 7380 }, { "grad_norm": 0.04767700284719467, "learning_rate": 0.0002853575247194195, "loss": 0.4141, "step": 7390 }, { "gate_value": 0.14421603083610535, "icl_sequence_length": 92, "num_contexts": 3, "step": 7390 }, { "grad_norm": 0.0633358284831047, "learning_rate": 0.0002853040382534876, "loss": 0.4307, "step": 7400 }, { "gate_value": 0.14420627057552338, "icl_sequence_length": 90, "num_contexts": 3, "step": 7400 }, { "grad_norm": 0.06384849548339844, "learning_rate": 0.0002852504593085214, "loss": 0.4199, "step": 7410 }, { "gate_value": 0.14376404881477356, "icl_sequence_length": 76, "num_contexts": 3, "step": 7410 }, { "grad_norm": 0.05320499837398529, "learning_rate": 0.0002851967879211416, "loss": 0.421, "step": 7420 }, { "gate_value": 0.1437148004770279, "icl_sequence_length": 84, "num_contexts": 3, "step": 7420 }, { "grad_norm": 0.038637008517980576, "learning_rate": 0.0002851430241280321, "loss": 0.4026, "step": 7430 }, { "gate_value": 0.1436859667301178, "icl_sequence_length": 76, "num_contexts": 3, "step": 7430 }, { "grad_norm": 0.10469973087310791, "learning_rate": 0.0002850891679659399, "loss": 0.4215, "step": 7440 }, { "gate_value": 0.14389364421367645, "icl_sequence_length": 74, "num_contexts": 3, "step": 7440 }, { "grad_norm": 0.055326368659734726, "learning_rate": 0.0002850352194716752, "loss": 0.4067, "step": 7450 }, { "gate_value": 0.1442284882068634, "icl_sequence_length": 60, "num_contexts": 3, "step": 7450 }, { "grad_norm": 0.08267883211374283, "learning_rate": 0.00028498117868211133, "loss": 0.4256, "step": 7460 }, { "gate_value": 0.14436683058738708, "icl_sequence_length": 94, "num_contexts": 3, "step": 7460 }, { "grad_norm": 0.0570523627102375, "learning_rate": 0.00028492704563418467, "loss": 0.4225, "step": 7470 }, { "gate_value": 0.14424049854278564, "icl_sequence_length": 84, "num_contexts": 3, "step": 7470 }, { "grad_norm": 0.10873137414455414, "learning_rate": 0.00028487282036489454, "loss": 0.4094, "step": 7480 }, { "gate_value": 0.14400343596935272, "icl_sequence_length": 78, "num_contexts": 3, "step": 7480 }, { "grad_norm": 0.06366758048534393, "learning_rate": 0.0002848185029113034, "loss": 0.414, "step": 7490 }, { "gate_value": 0.1446097046136856, "icl_sequence_length": 88, "num_contexts": 3, "step": 7490 }, { "grad_norm": 0.07260235399007797, "learning_rate": 0.00028476409331053694, "loss": 0.4108, "step": 7500 }, { "gate_value": 0.14570096135139465, "icl_sequence_length": 88, "num_contexts": 3, "step": 7500 }, { "grad_norm": 0.0475568063557148, "learning_rate": 0.00028470959159978334, "loss": 0.4275, "step": 7510 }, { "gate_value": 0.1455191820859909, "icl_sequence_length": 84, "num_contexts": 3, "step": 7510 }, { "grad_norm": 0.06414615362882614, "learning_rate": 0.0002846549978162941, "loss": 0.4183, "step": 7520 }, { "gate_value": 0.14565055072307587, "icl_sequence_length": 64, "num_contexts": 3, "step": 7520 }, { "grad_norm": 0.08015228062868118, "learning_rate": 0.0002846003119973837, "loss": 0.4263, "step": 7530 }, { "gate_value": 0.14574910700321198, "icl_sequence_length": 86, "num_contexts": 3, "step": 7530 }, { "grad_norm": 0.04416521638631821, "learning_rate": 0.00028454553418042915, "loss": 0.4165, "step": 7540 }, { "gate_value": 0.14608663320541382, "icl_sequence_length": 84, "num_contexts": 3, "step": 7540 }, { "grad_norm": 0.1554374396800995, "learning_rate": 0.00028449066440287065, "loss": 0.4153, "step": 7550 }, { "gate_value": 0.14639393985271454, "icl_sequence_length": 56, "num_contexts": 3, "step": 7550 }, { "grad_norm": 0.07977347075939178, "learning_rate": 0.0002844357027022113, "loss": 0.4235, "step": 7560 }, { "gate_value": 0.1465524137020111, "icl_sequence_length": 88, "num_contexts": 3, "step": 7560 }, { "grad_norm": 0.09290315210819244, "learning_rate": 0.00028438064911601673, "loss": 0.4218, "step": 7570 }, { "gate_value": 0.14587721228599548, "icl_sequence_length": 76, "num_contexts": 3, "step": 7570 }, { "grad_norm": 0.04635458439588547, "learning_rate": 0.00028432550368191566, "loss": 0.429, "step": 7580 }, { "gate_value": 0.1461162567138672, "icl_sequence_length": 92, "num_contexts": 3, "step": 7580 }, { "grad_norm": 0.04780597984790802, "learning_rate": 0.0002842702664375994, "loss": 0.4112, "step": 7590 }, { "gate_value": 0.1462957113981247, "icl_sequence_length": 90, "num_contexts": 3, "step": 7590 }, { "grad_norm": 0.08960922062397003, "learning_rate": 0.0002842149374208222, "loss": 0.4171, "step": 7600 }, { "gate_value": 0.14661051332950592, "icl_sequence_length": 88, "num_contexts": 3, "step": 7600 }, { "grad_norm": 0.074715755879879, "learning_rate": 0.00028415951666940076, "loss": 0.4247, "step": 7610 }, { "gate_value": 0.14733080565929413, "icl_sequence_length": 82, "num_contexts": 3, "step": 7610 }, { "grad_norm": 0.044907812029123306, "learning_rate": 0.00028410400422121477, "loss": 0.3986, "step": 7620 }, { "gate_value": 0.14742633700370789, "icl_sequence_length": 82, "num_contexts": 3, "step": 7620 }, { "grad_norm": 0.09000971168279648, "learning_rate": 0.00028404840011420643, "loss": 0.4254, "step": 7630 }, { "gate_value": 0.14711321890354156, "icl_sequence_length": 82, "num_contexts": 3, "step": 7630 }, { "grad_norm": 0.07416932284832001, "learning_rate": 0.00028399270438638055, "loss": 0.3919, "step": 7640 }, { "gate_value": 0.14702820777893066, "icl_sequence_length": 80, "num_contexts": 3, "step": 7640 }, { "grad_norm": 0.11321912705898285, "learning_rate": 0.00028393691707580477, "loss": 0.4133, "step": 7650 }, { "gate_value": 0.14669635891914368, "icl_sequence_length": 70, "num_contexts": 3, "step": 7650 }, { "grad_norm": 0.09754902869462967, "learning_rate": 0.00028388103822060907, "loss": 0.4216, "step": 7660 }, { "gate_value": 0.14687460660934448, "icl_sequence_length": 80, "num_contexts": 3, "step": 7660 }, { "grad_norm": 0.07661180943250656, "learning_rate": 0.0002838250678589862, "loss": 0.412, "step": 7670 }, { "gate_value": 0.14798735082149506, "icl_sequence_length": 76, "num_contexts": 3, "step": 7670 }, { "grad_norm": 0.13970467448234558, "learning_rate": 0.0002837690060291913, "loss": 0.4249, "step": 7680 }, { "gate_value": 0.14825834333896637, "icl_sequence_length": 90, "num_contexts": 3, "step": 7680 }, { "grad_norm": 0.13393306732177734, "learning_rate": 0.0002837128527695422, "loss": 0.4272, "step": 7690 }, { "gate_value": 0.1480855494737625, "icl_sequence_length": 82, "num_contexts": 3, "step": 7690 }, { "grad_norm": 0.04212678596377373, "learning_rate": 0.00028365660811841903, "loss": 0.4281, "step": 7700 }, { "gate_value": 0.147964209318161, "icl_sequence_length": 66, "num_contexts": 3, "step": 7700 }, { "grad_norm": 0.046316344290971756, "learning_rate": 0.00028360027211426456, "loss": 0.3971, "step": 7710 }, { "gate_value": 0.14794059097766876, "icl_sequence_length": 88, "num_contexts": 3, "step": 7710 }, { "grad_norm": 0.10596490651369095, "learning_rate": 0.00028354384479558384, "loss": 0.4446, "step": 7720 }, { "gate_value": 0.14815928041934967, "icl_sequence_length": 82, "num_contexts": 3, "step": 7720 }, { "grad_norm": 0.0596347413957119, "learning_rate": 0.0002834873262009444, "loss": 0.4253, "step": 7730 }, { "gate_value": 0.14768116176128387, "icl_sequence_length": 66, "num_contexts": 3, "step": 7730 }, { "grad_norm": 0.08078193664550781, "learning_rate": 0.0002834307163689763, "loss": 0.4267, "step": 7740 }, { "gate_value": 0.1484566181898117, "icl_sequence_length": 84, "num_contexts": 3, "step": 7740 }, { "grad_norm": 0.11879950016736984, "learning_rate": 0.0002833740153383717, "loss": 0.418, "step": 7750 }, { "gate_value": 0.14842697978019714, "icl_sequence_length": 72, "num_contexts": 3, "step": 7750 }, { "grad_norm": 0.06142803281545639, "learning_rate": 0.0002833172231478853, "loss": 0.4145, "step": 7760 }, { "gate_value": 0.1485043615102768, "icl_sequence_length": 72, "num_contexts": 3, "step": 7760 }, { "grad_norm": 0.046991124749183655, "learning_rate": 0.0002832603398363339, "loss": 0.4237, "step": 7770 }, { "gate_value": 0.14781975746154785, "icl_sequence_length": 88, "num_contexts": 3, "step": 7770 }, { "grad_norm": 0.08748679608106613, "learning_rate": 0.00028320336544259686, "loss": 0.4058, "step": 7780 }, { "gate_value": 0.14809411764144897, "icl_sequence_length": 70, "num_contexts": 3, "step": 7780 }, { "grad_norm": 0.12719684839248657, "learning_rate": 0.0002831463000056156, "loss": 0.4112, "step": 7790 }, { "gate_value": 0.14806553721427917, "icl_sequence_length": 88, "num_contexts": 3, "step": 7790 }, { "grad_norm": 0.08230043202638626, "learning_rate": 0.00028308914356439365, "loss": 0.41, "step": 7800 }, { "gate_value": 0.1477605700492859, "icl_sequence_length": 70, "num_contexts": 3, "step": 7800 }, { "grad_norm": 0.053649622946977615, "learning_rate": 0.00028303189615799714, "loss": 0.4172, "step": 7810 }, { "gate_value": 0.148033007979393, "icl_sequence_length": 76, "num_contexts": 3, "step": 7810 }, { "grad_norm": 0.12399522960186005, "learning_rate": 0.00028297455782555394, "loss": 0.3981, "step": 7820 }, { "gate_value": 0.148561492562294, "icl_sequence_length": 76, "num_contexts": 3, "step": 7820 }, { "grad_norm": 0.07105452567338943, "learning_rate": 0.00028291712860625443, "loss": 0.4428, "step": 7830 }, { "gate_value": 0.14870916306972504, "icl_sequence_length": 76, "num_contexts": 3, "step": 7830 }, { "grad_norm": 0.04318685829639435, "learning_rate": 0.00028285960853935085, "loss": 0.4221, "step": 7840 }, { "gate_value": 0.14865200221538544, "icl_sequence_length": 64, "num_contexts": 3, "step": 7840 }, { "grad_norm": 0.0866786539554596, "learning_rate": 0.00028280199766415756, "loss": 0.4105, "step": 7850 }, { "gate_value": 0.14928407967090607, "icl_sequence_length": 82, "num_contexts": 3, "step": 7850 }, { "grad_norm": 0.10774104297161102, "learning_rate": 0.00028274429602005117, "loss": 0.4345, "step": 7860 }, { "gate_value": 0.15012405812740326, "icl_sequence_length": 82, "num_contexts": 3, "step": 7860 }, { "grad_norm": 0.08425111323595047, "learning_rate": 0.0002826865036464701, "loss": 0.4122, "step": 7870 }, { "gate_value": 0.15047207474708557, "icl_sequence_length": 80, "num_contexts": 3, "step": 7870 }, { "grad_norm": 0.09827083349227905, "learning_rate": 0.00028262862058291496, "loss": 0.4299, "step": 7880 }, { "gate_value": 0.1503888964653015, "icl_sequence_length": 82, "num_contexts": 3, "step": 7880 }, { "grad_norm": 0.12378929555416107, "learning_rate": 0.0002825706468689483, "loss": 0.4179, "step": 7890 }, { "gate_value": 0.15025150775909424, "icl_sequence_length": 82, "num_contexts": 3, "step": 7890 }, { "grad_norm": 0.11086408793926239, "learning_rate": 0.00028251258254419453, "loss": 0.4237, "step": 7900 }, { "gate_value": 0.15031926333904266, "icl_sequence_length": 68, "num_contexts": 3, "step": 7900 }, { "grad_norm": 0.06256461143493652, "learning_rate": 0.00028245442764834015, "loss": 0.4085, "step": 7910 }, { "gate_value": 0.15006960928440094, "icl_sequence_length": 86, "num_contexts": 3, "step": 7910 }, { "grad_norm": 0.1430857628583908, "learning_rate": 0.0002823961822211334, "loss": 0.4089, "step": 7920 }, { "gate_value": 0.15034839510917664, "icl_sequence_length": 80, "num_contexts": 3, "step": 7920 }, { "grad_norm": 0.0887841060757637, "learning_rate": 0.0002823378463023845, "loss": 0.4277, "step": 7930 }, { "gate_value": 0.151446133852005, "icl_sequence_length": 92, "num_contexts": 3, "step": 7930 }, { "grad_norm": 0.06118936464190483, "learning_rate": 0.00028227941993196564, "loss": 0.4175, "step": 7940 }, { "gate_value": 0.1517830491065979, "icl_sequence_length": 82, "num_contexts": 3, "step": 7940 }, { "grad_norm": 0.16387613117694855, "learning_rate": 0.0002822209031498105, "loss": 0.4054, "step": 7950 }, { "gate_value": 0.15202456712722778, "icl_sequence_length": 82, "num_contexts": 3, "step": 7950 }, { "grad_norm": 0.046996645629405975, "learning_rate": 0.0002821622959959148, "loss": 0.4169, "step": 7960 }, { "gate_value": 0.15204818546772003, "icl_sequence_length": 76, "num_contexts": 3, "step": 7960 }, { "grad_norm": 0.15682649612426758, "learning_rate": 0.00028210359851033604, "loss": 0.4187, "step": 7970 }, { "gate_value": 0.15276777744293213, "icl_sequence_length": 88, "num_contexts": 3, "step": 7970 }, { "grad_norm": 0.15045514702796936, "learning_rate": 0.0002820448107331934, "loss": 0.409, "step": 7980 }, { "gate_value": 0.15333126485347748, "icl_sequence_length": 74, "num_contexts": 3, "step": 7980 }, { "grad_norm": 0.06689872592687607, "learning_rate": 0.0002819859327046677, "loss": 0.4305, "step": 7990 }, { "gate_value": 0.1544475108385086, "icl_sequence_length": 82, "num_contexts": 3, "step": 7990 }, { "grad_norm": 0.09526379406452179, "learning_rate": 0.0002819269644650015, "loss": 0.4142, "step": 8000 }, { "gate_value": 0.15457449853420258, "icl_sequence_length": 60, "num_contexts": 3, "step": 8000 }, { "grad_norm": 0.10891684144735336, "learning_rate": 0.0002818679060544991, "loss": 0.4212, "step": 8010 }, { "gate_value": 0.15380233526229858, "icl_sequence_length": 80, "num_contexts": 3, "step": 8010 }, { "grad_norm": 0.12320034205913544, "learning_rate": 0.0002818087575135264, "loss": 0.4263, "step": 8020 }, { "gate_value": 0.15353891253471375, "icl_sequence_length": 86, "num_contexts": 3, "step": 8020 }, { "grad_norm": 0.09334403276443481, "learning_rate": 0.0002817495188825108, "loss": 0.405, "step": 8030 }, { "gate_value": 0.1534106582403183, "icl_sequence_length": 80, "num_contexts": 3, "step": 8030 }, { "grad_norm": 0.0722557082772255, "learning_rate": 0.00028169019020194135, "loss": 0.4269, "step": 8040 }, { "gate_value": 0.15350815653800964, "icl_sequence_length": 84, "num_contexts": 3, "step": 8040 }, { "grad_norm": 0.060502052307128906, "learning_rate": 0.00028163077151236864, "loss": 0.4046, "step": 8050 }, { "gate_value": 0.15341298282146454, "icl_sequence_length": 82, "num_contexts": 3, "step": 8050 }, { "grad_norm": 0.1101309210062027, "learning_rate": 0.00028157126285440485, "loss": 0.419, "step": 8060 }, { "gate_value": 0.15298579633235931, "icl_sequence_length": 82, "num_contexts": 3, "step": 8060 }, { "grad_norm": 0.07704024761915207, "learning_rate": 0.0002815116642687236, "loss": 0.4242, "step": 8070 }, { "gate_value": 0.15233850479125977, "icl_sequence_length": 78, "num_contexts": 3, "step": 8070 }, { "grad_norm": 0.08202876150608063, "learning_rate": 0.0002814519757960598, "loss": 0.4034, "step": 8080 }, { "gate_value": 0.1520369052886963, "icl_sequence_length": 78, "num_contexts": 3, "step": 8080 }, { "grad_norm": 0.07245134562253952, "learning_rate": 0.0002813921974772101, "loss": 0.4011, "step": 8090 }, { "gate_value": 0.1525239646434784, "icl_sequence_length": 70, "num_contexts": 3, "step": 8090 }, { "grad_norm": 0.48970121145248413, "learning_rate": 0.00028133232935303234, "loss": 0.4046, "step": 8100 }, { "gate_value": 0.1524377316236496, "icl_sequence_length": 84, "num_contexts": 3, "step": 8100 }, { "grad_norm": 0.8370404243469238, "learning_rate": 0.0002812723714644459, "loss": 0.4219, "step": 8110 }, { "gate_value": 0.15254035592079163, "icl_sequence_length": 72, "num_contexts": 3, "step": 8110 }, { "grad_norm": 0.1069856658577919, "learning_rate": 0.0002812123238524314, "loss": 0.4224, "step": 8120 }, { "gate_value": 0.1529160439968109, "icl_sequence_length": 86, "num_contexts": 3, "step": 8120 }, { "grad_norm": 0.10440300405025482, "learning_rate": 0.00028115218655803075, "loss": 0.4176, "step": 8130 }, { "gate_value": 0.1537504643201828, "icl_sequence_length": 78, "num_contexts": 3, "step": 8130 }, { "grad_norm": 0.1068665012717247, "learning_rate": 0.0002810919596223474, "loss": 0.3981, "step": 8140 }, { "gate_value": 0.15441353619098663, "icl_sequence_length": 70, "num_contexts": 3, "step": 8140 }, { "grad_norm": 0.21673692762851715, "learning_rate": 0.0002810316430865456, "loss": 0.4008, "step": 8150 }, { "gate_value": 0.15410266816616058, "icl_sequence_length": 90, "num_contexts": 3, "step": 8150 }, { "grad_norm": 0.17246730625629425, "learning_rate": 0.0002809712369918514, "loss": 0.4105, "step": 8160 }, { "gate_value": 0.15456920862197876, "icl_sequence_length": 86, "num_contexts": 3, "step": 8160 }, { "grad_norm": 0.22932332754135132, "learning_rate": 0.0002809107413795517, "loss": 0.4179, "step": 8170 }, { "gate_value": 0.15458108484745026, "icl_sequence_length": 80, "num_contexts": 3, "step": 8170 }, { "grad_norm": 0.10797584801912308, "learning_rate": 0.0002808501562909947, "loss": 0.4197, "step": 8180 }, { "gate_value": 0.1545572131872177, "icl_sequence_length": 88, "num_contexts": 3, "step": 8180 }, { "grad_norm": 0.19922660291194916, "learning_rate": 0.0002807894817675897, "loss": 0.4092, "step": 8190 }, { "gate_value": 0.15465989708900452, "icl_sequence_length": 80, "num_contexts": 3, "step": 8190 }, { "grad_norm": 0.19957950711250305, "learning_rate": 0.00028072871785080717, "loss": 0.422, "step": 8200 }, { "gate_value": 0.15546606481075287, "icl_sequence_length": 64, "num_contexts": 3, "step": 8200 }, { "grad_norm": 0.5608637928962708, "learning_rate": 0.00028066786458217865, "loss": 0.4062, "step": 8210 }, { "gate_value": 0.1552291363477707, "icl_sequence_length": 80, "num_contexts": 3, "step": 8210 }, { "grad_norm": 0.07581547647714615, "learning_rate": 0.0002806069220032969, "loss": 0.4056, "step": 8220 }, { "gate_value": 0.15533456206321716, "icl_sequence_length": 78, "num_contexts": 3, "step": 8220 }, { "grad_norm": 0.07695662975311279, "learning_rate": 0.0002805458901558154, "loss": 0.4266, "step": 8230 }, { "gate_value": 0.15545304119586945, "icl_sequence_length": 72, "num_contexts": 3, "step": 8230 }, { "grad_norm": 0.13097840547561646, "learning_rate": 0.00028048476908144903, "loss": 0.4256, "step": 8240 }, { "gate_value": 0.15602801740169525, "icl_sequence_length": 78, "num_contexts": 3, "step": 8240 }, { "grad_norm": 0.21157461404800415, "learning_rate": 0.00028042355882197336, "loss": 0.42, "step": 8250 }, { "gate_value": 0.15621256828308105, "icl_sequence_length": 86, "num_contexts": 3, "step": 8250 }, { "grad_norm": 0.21507303416728973, "learning_rate": 0.0002803622594192251, "loss": 0.4267, "step": 8260 }, { "gate_value": 0.1561337411403656, "icl_sequence_length": 82, "num_contexts": 3, "step": 8260 }, { "grad_norm": 0.08249134570360184, "learning_rate": 0.00028030087091510174, "loss": 0.4033, "step": 8270 }, { "gate_value": 0.15637017786502838, "icl_sequence_length": 78, "num_contexts": 3, "step": 8270 }, { "grad_norm": 0.09764933586120605, "learning_rate": 0.0002802393933515618, "loss": 0.4019, "step": 8280 }, { "gate_value": 0.15595419704914093, "icl_sequence_length": 92, "num_contexts": 3, "step": 8280 }, { "grad_norm": 0.31223252415657043, "learning_rate": 0.00028017782677062456, "loss": 0.4018, "step": 8290 }, { "gate_value": 0.1560358852148056, "icl_sequence_length": 70, "num_contexts": 3, "step": 8290 }, { "grad_norm": 0.6203545331954956, "learning_rate": 0.0002801161712143702, "loss": 0.4075, "step": 8300 }, { "gate_value": 0.15645352005958557, "icl_sequence_length": 90, "num_contexts": 3, "step": 8300 }, { "grad_norm": 0.19463717937469482, "learning_rate": 0.0002800544267249398, "loss": 0.4133, "step": 8310 }, { "gate_value": 0.1570998579263687, "icl_sequence_length": 86, "num_contexts": 3, "step": 8310 }, { "grad_norm": 0.09994322806596756, "learning_rate": 0.00027999259334453503, "loss": 0.4043, "step": 8320 }, { "gate_value": 0.15745465457439423, "icl_sequence_length": 82, "num_contexts": 3, "step": 8320 }, { "grad_norm": 0.07742167264223099, "learning_rate": 0.0002799306711154185, "loss": 0.4041, "step": 8330 }, { "gate_value": 0.15763244032859802, "icl_sequence_length": 90, "num_contexts": 3, "step": 8330 }, { "grad_norm": 0.16606910526752472, "learning_rate": 0.0002798686600799134, "loss": 0.4279, "step": 8340 }, { "gate_value": 0.15786494314670563, "icl_sequence_length": 76, "num_contexts": 3, "step": 8340 }, { "grad_norm": 0.10738826543092728, "learning_rate": 0.00027980656028040373, "loss": 0.4096, "step": 8350 }, { "gate_value": 0.15786395967006683, "icl_sequence_length": 72, "num_contexts": 3, "step": 8350 }, { "grad_norm": 0.15681566298007965, "learning_rate": 0.0002797443717593341, "loss": 0.4092, "step": 8360 }, { "gate_value": 0.15748639404773712, "icl_sequence_length": 86, "num_contexts": 3, "step": 8360 }, { "grad_norm": 0.19968372583389282, "learning_rate": 0.0002796820945592098, "loss": 0.4107, "step": 8370 }, { "gate_value": 0.1574399322271347, "icl_sequence_length": 80, "num_contexts": 3, "step": 8370 }, { "grad_norm": 0.08654113113880157, "learning_rate": 0.00027961972872259675, "loss": 0.4032, "step": 8380 }, { "gate_value": 0.1580353081226349, "icl_sequence_length": 62, "num_contexts": 3, "step": 8380 }, { "grad_norm": 0.17560425400733948, "learning_rate": 0.0002795572742921213, "loss": 0.4045, "step": 8390 }, { "gate_value": 0.15868164598941803, "icl_sequence_length": 74, "num_contexts": 3, "step": 8390 }, { "grad_norm": 0.14503097534179688, "learning_rate": 0.0002794947313104705, "loss": 0.4134, "step": 8400 }, { "gate_value": 0.15872491896152496, "icl_sequence_length": 76, "num_contexts": 3, "step": 8400 }, { "grad_norm": 0.5805841684341431, "learning_rate": 0.00027943209982039195, "loss": 0.4269, "step": 8410 }, { "gate_value": 0.15834017097949982, "icl_sequence_length": 82, "num_contexts": 3, "step": 8410 }, { "grad_norm": 0.26208043098449707, "learning_rate": 0.0002793693798646937, "loss": 0.4062, "step": 8420 }, { "gate_value": 0.15807278454303741, "icl_sequence_length": 74, "num_contexts": 3, "step": 8420 }, { "grad_norm": 0.13806024193763733, "learning_rate": 0.00027930657148624407, "loss": 0.4181, "step": 8430 }, { "gate_value": 0.1582920253276825, "icl_sequence_length": 94, "num_contexts": 3, "step": 8430 }, { "grad_norm": 0.1110081896185875, "learning_rate": 0.0002792436747279722, "loss": 0.411, "step": 8440 }, { "gate_value": 0.159070685505867, "icl_sequence_length": 70, "num_contexts": 3, "step": 8440 }, { "grad_norm": 0.16463018953800201, "learning_rate": 0.0002791806896328673, "loss": 0.4149, "step": 8450 }, { "gate_value": 0.15995164215564728, "icl_sequence_length": 80, "num_contexts": 3, "step": 8450 }, { "grad_norm": 0.3234344720840454, "learning_rate": 0.0002791176162439792, "loss": 0.4171, "step": 8460 }, { "gate_value": 0.160800963640213, "icl_sequence_length": 88, "num_contexts": 3, "step": 8460 }, { "grad_norm": 0.14832444489002228, "learning_rate": 0.0002790544546044179, "loss": 0.4228, "step": 8470 }, { "gate_value": 0.16142258048057556, "icl_sequence_length": 78, "num_contexts": 3, "step": 8470 }, { "grad_norm": 0.14149171113967896, "learning_rate": 0.00027899120475735373, "loss": 0.4124, "step": 8480 }, { "gate_value": 0.16140305995941162, "icl_sequence_length": 74, "num_contexts": 3, "step": 8480 }, { "grad_norm": 0.4263879656791687, "learning_rate": 0.00027892786674601745, "loss": 0.4111, "step": 8490 }, { "gate_value": 0.16139687597751617, "icl_sequence_length": 90, "num_contexts": 3, "step": 8490 }, { "grad_norm": 0.23564587533473969, "learning_rate": 0.0002788644406137, "loss": 0.4178, "step": 8500 }, { "gate_value": 0.16113747656345367, "icl_sequence_length": 92, "num_contexts": 3, "step": 8500 }, { "grad_norm": 0.10173361003398895, "learning_rate": 0.00027880092640375243, "loss": 0.4209, "step": 8510 }, { "gate_value": 0.16043519973754883, "icl_sequence_length": 82, "num_contexts": 3, "step": 8510 }, { "grad_norm": 0.34818899631500244, "learning_rate": 0.00027873732415958626, "loss": 0.4267, "step": 8520 }, { "gate_value": 0.16010433435440063, "icl_sequence_length": 70, "num_contexts": 3, "step": 8520 }, { "grad_norm": 0.12077205628156662, "learning_rate": 0.0002786736339246729, "loss": 0.4135, "step": 8530 }, { "gate_value": 0.16009613871574402, "icl_sequence_length": 68, "num_contexts": 3, "step": 8530 }, { "grad_norm": 0.4177238345146179, "learning_rate": 0.0002786098557425441, "loss": 0.4262, "step": 8540 }, { "gate_value": 0.16055171191692352, "icl_sequence_length": 86, "num_contexts": 3, "step": 8540 }, { "grad_norm": 0.12868152558803558, "learning_rate": 0.0002785459896567916, "loss": 0.3945, "step": 8550 }, { "gate_value": 0.16079725325107574, "icl_sequence_length": 88, "num_contexts": 3, "step": 8550 }, { "grad_norm": 0.12423283606767654, "learning_rate": 0.0002784820357110673, "loss": 0.4073, "step": 8560 }, { "gate_value": 0.16120773553848267, "icl_sequence_length": 82, "num_contexts": 3, "step": 8560 }, { "grad_norm": 0.16405817866325378, "learning_rate": 0.00027841799394908313, "loss": 0.4028, "step": 8570 }, { "gate_value": 0.16189523041248322, "icl_sequence_length": 74, "num_contexts": 3, "step": 8570 }, { "grad_norm": 0.11654902249574661, "learning_rate": 0.0002783538644146109, "loss": 0.4131, "step": 8580 }, { "gate_value": 0.16290602087974548, "icl_sequence_length": 74, "num_contexts": 3, "step": 8580 }, { "grad_norm": 0.11591009795665741, "learning_rate": 0.00027828964715148277, "loss": 0.4171, "step": 8590 }, { "gate_value": 0.16335241496562958, "icl_sequence_length": 70, "num_contexts": 3, "step": 8590 }, { "grad_norm": 0.3086751699447632, "learning_rate": 0.0002782253422035905, "loss": 0.3931, "step": 8600 }, { "gate_value": 0.1638471484184265, "icl_sequence_length": 76, "num_contexts": 3, "step": 8600 }, { "grad_norm": 0.4067281484603882, "learning_rate": 0.00027816094961488586, "loss": 0.4236, "step": 8610 }, { "gate_value": 0.16402751207351685, "icl_sequence_length": 78, "num_contexts": 3, "step": 8610 }, { "grad_norm": 0.4020434021949768, "learning_rate": 0.00027809646942938065, "loss": 0.4152, "step": 8620 }, { "gate_value": 0.16374509036540985, "icl_sequence_length": 80, "num_contexts": 3, "step": 8620 }, { "grad_norm": 0.11254820227622986, "learning_rate": 0.0002780319016911465, "loss": 0.3941, "step": 8630 }, { "gate_value": 0.16366678476333618, "icl_sequence_length": 78, "num_contexts": 3, "step": 8630 }, { "grad_norm": 0.2048700600862503, "learning_rate": 0.00027796724644431483, "loss": 0.428, "step": 8640 }, { "gate_value": 0.1640690714120865, "icl_sequence_length": 78, "num_contexts": 3, "step": 8640 }, { "grad_norm": 0.10903961211442947, "learning_rate": 0.0002779025037330768, "loss": 0.415, "step": 8650 }, { "gate_value": 0.1633603423833847, "icl_sequence_length": 82, "num_contexts": 3, "step": 8650 }, { "grad_norm": 0.17407450079917908, "learning_rate": 0.00027783767360168356, "loss": 0.4091, "step": 8660 }, { "gate_value": 0.16377109289169312, "icl_sequence_length": 94, "num_contexts": 3, "step": 8660 }, { "grad_norm": 0.18325786292552948, "learning_rate": 0.00027777275609444587, "loss": 0.4159, "step": 8670 }, { "gate_value": 0.16425953805446625, "icl_sequence_length": 84, "num_contexts": 3, "step": 8670 }, { "grad_norm": 0.957232654094696, "learning_rate": 0.0002777077512557342, "loss": 0.4066, "step": 8680 }, { "gate_value": 0.1641382873058319, "icl_sequence_length": 88, "num_contexts": 3, "step": 8680 }, { "grad_norm": 0.2491225153207779, "learning_rate": 0.0002776426591299787, "loss": 0.4108, "step": 8690 }, { "gate_value": 0.16421766579151154, "icl_sequence_length": 84, "num_contexts": 3, "step": 8690 }, { "grad_norm": 3.7188527584075928, "learning_rate": 0.00027757747976166935, "loss": 0.4058, "step": 8700 }, { "gate_value": 0.16427725553512573, "icl_sequence_length": 84, "num_contexts": 3, "step": 8700 }, { "grad_norm": 0.4891299307346344, "learning_rate": 0.00027751221319535557, "loss": 0.4075, "step": 8710 }, { "gate_value": 0.16449934244155884, "icl_sequence_length": 68, "num_contexts": 3, "step": 8710 }, { "grad_norm": 0.20242206752300262, "learning_rate": 0.0002774468594756464, "loss": 0.4026, "step": 8720 }, { "gate_value": 0.1650473028421402, "icl_sequence_length": 70, "num_contexts": 3, "step": 8720 }, { "grad_norm": 0.14760473370552063, "learning_rate": 0.0002773814186472106, "loss": 0.3994, "step": 8730 }, { "gate_value": 0.16507603228092194, "icl_sequence_length": 80, "num_contexts": 3, "step": 8730 }, { "grad_norm": 0.14821743965148926, "learning_rate": 0.00027731589075477624, "loss": 0.4118, "step": 8740 }, { "gate_value": 0.1656453013420105, "icl_sequence_length": 84, "num_contexts": 3, "step": 8740 }, { "grad_norm": 0.7819263935089111, "learning_rate": 0.00027725027584313104, "loss": 0.399, "step": 8750 }, { "gate_value": 0.1657910943031311, "icl_sequence_length": 92, "num_contexts": 3, "step": 8750 }, { "grad_norm": 0.29498928785324097, "learning_rate": 0.0002771845739571222, "loss": 0.4329, "step": 8760 }, { "gate_value": 0.16638685762882233, "icl_sequence_length": 86, "num_contexts": 3, "step": 8760 }, { "grad_norm": 0.2815771996974945, "learning_rate": 0.0002771187851416564, "loss": 0.4159, "step": 8770 }, { "gate_value": 0.1663895696401596, "icl_sequence_length": 66, "num_contexts": 3, "step": 8770 }, { "grad_norm": 0.11321847885847092, "learning_rate": 0.0002770529094416996, "loss": 0.4172, "step": 8780 }, { "gate_value": 0.16677428781986237, "icl_sequence_length": 90, "num_contexts": 3, "step": 8780 }, { "grad_norm": 0.1718282252550125, "learning_rate": 0.0002769869469022772, "loss": 0.4133, "step": 8790 }, { "gate_value": 0.16660121083259583, "icl_sequence_length": 70, "num_contexts": 3, "step": 8790 }, { "grad_norm": 0.3674944043159485, "learning_rate": 0.000276920897568474, "loss": 0.4151, "step": 8800 }, { "gate_value": 0.16616857051849365, "icl_sequence_length": 88, "num_contexts": 3, "step": 8800 }, { "grad_norm": 0.262675940990448, "learning_rate": 0.00027685476148543416, "loss": 0.4092, "step": 8810 }, { "gate_value": 0.16660258173942566, "icl_sequence_length": 70, "num_contexts": 3, "step": 8810 }, { "grad_norm": 0.4975556433200836, "learning_rate": 0.00027678853869836096, "loss": 0.4073, "step": 8820 }, { "gate_value": 0.16717791557312012, "icl_sequence_length": 68, "num_contexts": 3, "step": 8820 }, { "grad_norm": 0.2040899395942688, "learning_rate": 0.0002767222292525171, "loss": 0.4238, "step": 8830 }, { "gate_value": 0.16736888885498047, "icl_sequence_length": 76, "num_contexts": 3, "step": 8830 }, { "grad_norm": 0.1558050960302353, "learning_rate": 0.00027665583319322454, "loss": 0.4052, "step": 8840 }, { "gate_value": 0.1675145924091339, "icl_sequence_length": 90, "num_contexts": 3, "step": 8840 }, { "grad_norm": 0.6124147176742554, "learning_rate": 0.0002765893505658642, "loss": 0.4167, "step": 8850 }, { "gate_value": 0.16753420233726501, "icl_sequence_length": 82, "num_contexts": 3, "step": 8850 }, { "grad_norm": 0.25529617071151733, "learning_rate": 0.00027652278141587647, "loss": 0.4245, "step": 8860 }, { "gate_value": 0.16686958074569702, "icl_sequence_length": 76, "num_contexts": 3, "step": 8860 }, { "grad_norm": 0.11387065052986145, "learning_rate": 0.00027645612578876066, "loss": 0.4092, "step": 8870 }, { "gate_value": 0.16669636964797974, "icl_sequence_length": 72, "num_contexts": 3, "step": 8870 }, { "grad_norm": 0.0918634682893753, "learning_rate": 0.00027638938373007526, "loss": 0.4073, "step": 8880 }, { "gate_value": 0.1666368544101715, "icl_sequence_length": 80, "num_contexts": 3, "step": 8880 }, { "grad_norm": 0.12662643194198608, "learning_rate": 0.00027632255528543787, "loss": 0.4127, "step": 8890 }, { "gate_value": 0.1672160029411316, "icl_sequence_length": 80, "num_contexts": 3, "step": 8890 }, { "grad_norm": 0.10079085826873779, "learning_rate": 0.00027625564050052517, "loss": 0.4111, "step": 8900 }, { "gate_value": 0.16768445074558258, "icl_sequence_length": 86, "num_contexts": 3, "step": 8900 }, { "grad_norm": 0.10930916666984558, "learning_rate": 0.0002761886394210726, "loss": 0.4033, "step": 8910 }, { "gate_value": 0.16912949085235596, "icl_sequence_length": 68, "num_contexts": 3, "step": 8910 }, { "grad_norm": 0.12985733151435852, "learning_rate": 0.00027612155209287494, "loss": 0.3969, "step": 8920 }, { "gate_value": 0.16979645192623138, "icl_sequence_length": 62, "num_contexts": 3, "step": 8920 }, { "grad_norm": 0.09886956959962845, "learning_rate": 0.0002760543785617857, "loss": 0.4102, "step": 8930 }, { "gate_value": 0.1697724461555481, "icl_sequence_length": 76, "num_contexts": 3, "step": 8930 }, { "grad_norm": 0.3233407735824585, "learning_rate": 0.0002759871188737173, "loss": 0.4139, "step": 8940 }, { "gate_value": 0.17055948078632355, "icl_sequence_length": 86, "num_contexts": 3, "step": 8940 }, { "grad_norm": 0.15284964442253113, "learning_rate": 0.0002759197730746411, "loss": 0.4091, "step": 8950 }, { "gate_value": 0.1704828441143036, "icl_sequence_length": 78, "num_contexts": 3, "step": 8950 }, { "grad_norm": 1.4669501781463623, "learning_rate": 0.0002758523412105874, "loss": 0.4177, "step": 8960 }, { "gate_value": 0.1702096164226532, "icl_sequence_length": 88, "num_contexts": 3, "step": 8960 }, { "grad_norm": 0.29607024788856506, "learning_rate": 0.00027578482332764516, "loss": 0.3793, "step": 8970 }, { "gate_value": 0.1705409288406372, "icl_sequence_length": 82, "num_contexts": 3, "step": 8970 }, { "grad_norm": 1.1997206211090088, "learning_rate": 0.0002757172194719623, "loss": 0.4155, "step": 8980 }, { "gate_value": 0.17043624818325043, "icl_sequence_length": 80, "num_contexts": 3, "step": 8980 }, { "grad_norm": 1.4481717348098755, "learning_rate": 0.00027564952968974534, "loss": 0.4089, "step": 8990 }, { "gate_value": 0.17082171142101288, "icl_sequence_length": 88, "num_contexts": 3, "step": 8990 }, { "grad_norm": 0.1889583021402359, "learning_rate": 0.00027558175402725963, "loss": 0.4162, "step": 9000 }, { "gate_value": 0.17097678780555725, "icl_sequence_length": 90, "num_contexts": 3, "step": 9000 }, { "grad_norm": 0.22113603353500366, "learning_rate": 0.00027551389253082926, "loss": 0.4092, "step": 9010 }, { "gate_value": 0.1711142361164093, "icl_sequence_length": 72, "num_contexts": 3, "step": 9010 }, { "grad_norm": 0.12217526137828827, "learning_rate": 0.00027544594524683683, "loss": 0.4106, "step": 9020 }, { "gate_value": 0.17178325355052948, "icl_sequence_length": 88, "num_contexts": 3, "step": 9020 }, { "grad_norm": 0.4917551279067993, "learning_rate": 0.0002753779122217237, "loss": 0.4076, "step": 9030 }, { "gate_value": 0.17185628414154053, "icl_sequence_length": 82, "num_contexts": 3, "step": 9030 }, { "grad_norm": 0.12286029756069183, "learning_rate": 0.00027530979350198987, "loss": 0.4132, "step": 9040 }, { "gate_value": 0.17211104929447174, "icl_sequence_length": 70, "num_contexts": 3, "step": 9040 }, { "grad_norm": 0.1475491225719452, "learning_rate": 0.00027524158913419376, "loss": 0.4016, "step": 9050 }, { "gate_value": 0.17211231589317322, "icl_sequence_length": 90, "num_contexts": 3, "step": 9050 }, { "grad_norm": 0.17296111583709717, "learning_rate": 0.0002751732991649524, "loss": 0.4067, "step": 9060 }, { "gate_value": 0.1717059165239334, "icl_sequence_length": 84, "num_contexts": 3, "step": 9060 }, { "grad_norm": 4.388981819152832, "learning_rate": 0.0002751049236409414, "loss": 0.3944, "step": 9070 }, { "gate_value": 0.17243611812591553, "icl_sequence_length": 86, "num_contexts": 3, "step": 9070 }, { "grad_norm": 0.2799641191959381, "learning_rate": 0.0002750364626088947, "loss": 0.4033, "step": 9080 }, { "gate_value": 0.17295986413955688, "icl_sequence_length": 88, "num_contexts": 3, "step": 9080 }, { "grad_norm": 0.15736496448516846, "learning_rate": 0.0002749679161156049, "loss": 0.4135, "step": 9090 }, { "gate_value": 0.17265865206718445, "icl_sequence_length": 86, "num_contexts": 3, "step": 9090 }, { "grad_norm": 0.14757375419139862, "learning_rate": 0.0002748992842079228, "loss": 0.4255, "step": 9100 }, { "gate_value": 0.1720341593027115, "icl_sequence_length": 72, "num_contexts": 3, "step": 9100 }, { "grad_norm": 0.4037497639656067, "learning_rate": 0.0002748305669327577, "loss": 0.3946, "step": 9110 }, { "gate_value": 0.1721326857805252, "icl_sequence_length": 60, "num_contexts": 3, "step": 9110 }, { "grad_norm": 0.11872224509716034, "learning_rate": 0.00027476176433707713, "loss": 0.3961, "step": 9120 }, { "gate_value": 0.17253492772579193, "icl_sequence_length": 88, "num_contexts": 3, "step": 9120 }, { "grad_norm": 0.15447595715522766, "learning_rate": 0.0002746928764679071, "loss": 0.4018, "step": 9130 }, { "gate_value": 0.17255066335201263, "icl_sequence_length": 76, "num_contexts": 3, "step": 9130 }, { "grad_norm": 0.7999837398529053, "learning_rate": 0.0002746239033723318, "loss": 0.4058, "step": 9140 }, { "gate_value": 0.17191246151924133, "icl_sequence_length": 80, "num_contexts": 3, "step": 9140 }, { "grad_norm": 0.20919932425022125, "learning_rate": 0.0002745548450974936, "loss": 0.4074, "step": 9150 }, { "gate_value": 0.17232879996299744, "icl_sequence_length": 92, "num_contexts": 3, "step": 9150 }, { "grad_norm": 0.595839262008667, "learning_rate": 0.0002744857016905933, "loss": 0.3921, "step": 9160 }, { "gate_value": 0.17247727513313293, "icl_sequence_length": 70, "num_contexts": 3, "step": 9160 }, { "grad_norm": 0.5459367036819458, "learning_rate": 0.0002744164731988898, "loss": 0.4138, "step": 9170 }, { "gate_value": 0.17329102754592896, "icl_sequence_length": 76, "num_contexts": 3, "step": 9170 }, { "grad_norm": 0.19087789952754974, "learning_rate": 0.00027434715966969997, "loss": 0.4062, "step": 9180 }, { "gate_value": 0.1738402396440506, "icl_sequence_length": 70, "num_contexts": 3, "step": 9180 }, { "grad_norm": 0.21525757014751434, "learning_rate": 0.000274277761150399, "loss": 0.402, "step": 9190 }, { "gate_value": 0.1737126260995865, "icl_sequence_length": 94, "num_contexts": 3, "step": 9190 }, { "grad_norm": 0.38013356924057007, "learning_rate": 0.00027420827768842023, "loss": 0.4117, "step": 9200 }, { "gate_value": 0.17394515872001648, "icl_sequence_length": 78, "num_contexts": 3, "step": 9200 }, { "grad_norm": 0.17054623365402222, "learning_rate": 0.00027413870933125486, "loss": 0.4116, "step": 9210 }, { "gate_value": 0.173823744058609, "icl_sequence_length": 80, "num_contexts": 3, "step": 9210 }, { "grad_norm": 0.16116704046726227, "learning_rate": 0.00027406905612645217, "loss": 0.4072, "step": 9220 }, { "gate_value": 0.17491909861564636, "icl_sequence_length": 84, "num_contexts": 3, "step": 9220 }, { "grad_norm": 0.20770235359668732, "learning_rate": 0.00027399931812161957, "loss": 0.3936, "step": 9230 }, { "gate_value": 0.1753663271665573, "icl_sequence_length": 80, "num_contexts": 3, "step": 9230 }, { "grad_norm": 0.12470269948244095, "learning_rate": 0.00027392949536442224, "loss": 0.407, "step": 9240 }, { "gate_value": 0.17491556704044342, "icl_sequence_length": 88, "num_contexts": 3, "step": 9240 }, { "grad_norm": 0.29726898670196533, "learning_rate": 0.0002738595879025835, "loss": 0.4221, "step": 9250 }, { "gate_value": 0.1747361123561859, "icl_sequence_length": 80, "num_contexts": 3, "step": 9250 }, { "grad_norm": 0.16315695643424988, "learning_rate": 0.0002737895957838842, "loss": 0.4042, "step": 9260 }, { "gate_value": 0.17513708770275116, "icl_sequence_length": 88, "num_contexts": 3, "step": 9260 }, { "grad_norm": 0.3211088180541992, "learning_rate": 0.00027371951905616357, "loss": 0.4107, "step": 9270 }, { "gate_value": 0.17483621835708618, "icl_sequence_length": 80, "num_contexts": 3, "step": 9270 }, { "grad_norm": 0.12201294302940369, "learning_rate": 0.00027364935776731826, "loss": 0.405, "step": 9280 }, { "gate_value": 0.1746746301651001, "icl_sequence_length": 78, "num_contexts": 3, "step": 9280 }, { "grad_norm": 1.4433128833770752, "learning_rate": 0.00027357911196530284, "loss": 0.3999, "step": 9290 }, { "gate_value": 0.17453843355178833, "icl_sequence_length": 88, "num_contexts": 3, "step": 9290 }, { "grad_norm": 0.13382488489151, "learning_rate": 0.0002735087816981296, "loss": 0.4227, "step": 9300 }, { "gate_value": 0.17486976087093353, "icl_sequence_length": 90, "num_contexts": 3, "step": 9300 }, { "grad_norm": 0.3727971017360687, "learning_rate": 0.00027343836701386877, "loss": 0.4146, "step": 9310 }, { "gate_value": 0.17553885281085968, "icl_sequence_length": 82, "num_contexts": 3, "step": 9310 }, { "grad_norm": 0.5436861515045166, "learning_rate": 0.00027336786796064807, "loss": 0.3989, "step": 9320 }, { "gate_value": 0.1751997321844101, "icl_sequence_length": 70, "num_contexts": 3, "step": 9320 }, { "grad_norm": 0.17482315003871918, "learning_rate": 0.00027329728458665284, "loss": 0.4068, "step": 9330 }, { "gate_value": 0.17559026181697845, "icl_sequence_length": 62, "num_contexts": 3, "step": 9330 }, { "grad_norm": 0.23877385258674622, "learning_rate": 0.0002732266169401262, "loss": 0.4171, "step": 9340 }, { "gate_value": 0.17537659406661987, "icl_sequence_length": 92, "num_contexts": 3, "step": 9340 }, { "grad_norm": 0.09425515681505203, "learning_rate": 0.0002731558650693689, "loss": 0.4215, "step": 9350 }, { "gate_value": 0.175358846783638, "icl_sequence_length": 66, "num_contexts": 3, "step": 9350 }, { "grad_norm": 0.14440549910068512, "learning_rate": 0.00027308502902273913, "loss": 0.418, "step": 9360 }, { "gate_value": 0.17526549100875854, "icl_sequence_length": 86, "num_contexts": 3, "step": 9360 }, { "grad_norm": 0.13115571439266205, "learning_rate": 0.0002730141088486526, "loss": 0.4042, "step": 9370 }, { "gate_value": 0.17517532408237457, "icl_sequence_length": 84, "num_contexts": 3, "step": 9370 }, { "grad_norm": 0.10693828016519547, "learning_rate": 0.0002729431045955826, "loss": 0.3931, "step": 9380 }, { "gate_value": 0.17489777505397797, "icl_sequence_length": 86, "num_contexts": 3, "step": 9380 }, { "grad_norm": 0.3044837713241577, "learning_rate": 0.00027287201631205995, "loss": 0.397, "step": 9390 }, { "gate_value": 0.17526645958423615, "icl_sequence_length": 82, "num_contexts": 3, "step": 9390 }, { "grad_norm": 0.3311840891838074, "learning_rate": 0.00027280084404667274, "loss": 0.4216, "step": 9400 }, { "gate_value": 0.17530137300491333, "icl_sequence_length": 84, "num_contexts": 3, "step": 9400 }, { "grad_norm": 0.1382623314857483, "learning_rate": 0.0002727295878480666, "loss": 0.3987, "step": 9410 }, { "gate_value": 0.1757037341594696, "icl_sequence_length": 72, "num_contexts": 3, "step": 9410 }, { "grad_norm": 0.2891187369823456, "learning_rate": 0.0002726582477649444, "loss": 0.4025, "step": 9420 }, { "gate_value": 0.1756793111562729, "icl_sequence_length": 88, "num_contexts": 3, "step": 9420 }, { "grad_norm": 0.06408237665891647, "learning_rate": 0.00027258682384606646, "loss": 0.4079, "step": 9430 }, { "gate_value": 0.17592254281044006, "icl_sequence_length": 78, "num_contexts": 3, "step": 9430 }, { "grad_norm": 0.18277965486049652, "learning_rate": 0.00027251531614025035, "loss": 0.3936, "step": 9440 }, { "gate_value": 0.1765175312757492, "icl_sequence_length": 80, "num_contexts": 3, "step": 9440 }, { "grad_norm": 0.2279181033372879, "learning_rate": 0.00027244372469637087, "loss": 0.396, "step": 9450 }, { "gate_value": 0.17652124166488647, "icl_sequence_length": 72, "num_contexts": 3, "step": 9450 }, { "grad_norm": 0.46269768476486206, "learning_rate": 0.0002723720495633602, "loss": 0.4162, "step": 9460 }, { "gate_value": 0.17670394480228424, "icl_sequence_length": 78, "num_contexts": 3, "step": 9460 }, { "grad_norm": 0.5305395126342773, "learning_rate": 0.0002723002907902075, "loss": 0.4236, "step": 9470 }, { "gate_value": 0.1759636104106903, "icl_sequence_length": 82, "num_contexts": 3, "step": 9470 }, { "grad_norm": 0.5236008167266846, "learning_rate": 0.0002722284484259593, "loss": 0.4034, "step": 9480 }, { "gate_value": 0.1763700544834137, "icl_sequence_length": 78, "num_contexts": 3, "step": 9480 }, { "grad_norm": 0.11968474835157394, "learning_rate": 0.00027215652251971913, "loss": 0.3932, "step": 9490 }, { "gate_value": 0.1767255663871765, "icl_sequence_length": 82, "num_contexts": 3, "step": 9490 }, { "grad_norm": 0.1513536274433136, "learning_rate": 0.0002720845131206477, "loss": 0.406, "step": 9500 }, { "gate_value": 0.1772974580526352, "icl_sequence_length": 80, "num_contexts": 3, "step": 9500 }, { "grad_norm": 0.2298015058040619, "learning_rate": 0.00027201242027796274, "loss": 0.4074, "step": 9510 }, { "gate_value": 0.17706361413002014, "icl_sequence_length": 82, "num_contexts": 3, "step": 9510 }, { "grad_norm": 0.1239701434969902, "learning_rate": 0.000271940244040939, "loss": 0.409, "step": 9520 }, { "gate_value": 0.17645961046218872, "icl_sequence_length": 88, "num_contexts": 3, "step": 9520 }, { "grad_norm": 0.16889768838882446, "learning_rate": 0.0002718679844589083, "loss": 0.4144, "step": 9530 }, { "gate_value": 0.17701050639152527, "icl_sequence_length": 82, "num_contexts": 3, "step": 9530 }, { "grad_norm": 1.308580994606018, "learning_rate": 0.0002717956415812594, "loss": 0.4063, "step": 9540 }, { "gate_value": 0.17713579535484314, "icl_sequence_length": 74, "num_contexts": 3, "step": 9540 }, { "grad_norm": 0.1434451937675476, "learning_rate": 0.0002717232154574379, "loss": 0.3982, "step": 9550 }, { "gate_value": 0.1771237850189209, "icl_sequence_length": 68, "num_contexts": 3, "step": 9550 }, { "grad_norm": 0.11620178818702698, "learning_rate": 0.0002716507061369464, "loss": 0.4241, "step": 9560 }, { "gate_value": 0.17687828838825226, "icl_sequence_length": 74, "num_contexts": 3, "step": 9560 }, { "grad_norm": 0.29256099462509155, "learning_rate": 0.0002715781136693444, "loss": 0.4099, "step": 9570 }, { "gate_value": 0.17701010406017303, "icl_sequence_length": 90, "num_contexts": 3, "step": 9570 }, { "grad_norm": 0.36338385939598083, "learning_rate": 0.00027150543810424815, "loss": 0.4106, "step": 9580 }, { "gate_value": 0.1774062216281891, "icl_sequence_length": 84, "num_contexts": 3, "step": 9580 }, { "grad_norm": 0.1436304897069931, "learning_rate": 0.0002714326794913306, "loss": 0.4277, "step": 9590 }, { "gate_value": 0.1778738647699356, "icl_sequence_length": 92, "num_contexts": 3, "step": 9590 }, { "grad_norm": 0.1371924877166748, "learning_rate": 0.0002713598378803217, "loss": 0.4125, "step": 9600 }, { "gate_value": 0.178547665476799, "icl_sequence_length": 82, "num_contexts": 3, "step": 9600 }, { "grad_norm": 0.2031945139169693, "learning_rate": 0.000271286913321008, "loss": 0.4155, "step": 9610 }, { "gate_value": 0.17900629341602325, "icl_sequence_length": 80, "num_contexts": 3, "step": 9610 }, { "grad_norm": 0.13193300366401672, "learning_rate": 0.00027121390586323264, "loss": 0.4101, "step": 9620 }, { "gate_value": 0.1796974092721939, "icl_sequence_length": 78, "num_contexts": 3, "step": 9620 }, { "grad_norm": 0.11391983181238174, "learning_rate": 0.0002711408155568956, "loss": 0.4049, "step": 9630 }, { "gate_value": 0.17991317808628082, "icl_sequence_length": 90, "num_contexts": 3, "step": 9630 }, { "grad_norm": 1.1728397607803345, "learning_rate": 0.0002710676424519535, "loss": 0.4106, "step": 9640 }, { "gate_value": 0.18047550320625305, "icl_sequence_length": 82, "num_contexts": 3, "step": 9640 }, { "grad_norm": 0.14085493981838226, "learning_rate": 0.00027099438659841933, "loss": 0.3913, "step": 9650 }, { "gate_value": 0.18084587156772614, "icl_sequence_length": 86, "num_contexts": 3, "step": 9650 }, { "grad_norm": 0.33219820261001587, "learning_rate": 0.0002709210480463628, "loss": 0.4001, "step": 9660 }, { "gate_value": 0.18111124634742737, "icl_sequence_length": 78, "num_contexts": 3, "step": 9660 }, { "grad_norm": 0.1452653855085373, "learning_rate": 0.0002708476268459102, "loss": 0.4168, "step": 9670 }, { "gate_value": 0.1815754622220993, "icl_sequence_length": 86, "num_contexts": 3, "step": 9670 }, { "grad_norm": 0.40567007660865784, "learning_rate": 0.0002707741230472442, "loss": 0.3981, "step": 9680 }, { "gate_value": 0.18233047425746918, "icl_sequence_length": 78, "num_contexts": 3, "step": 9680 }, { "grad_norm": 0.13525651395320892, "learning_rate": 0.00027070053670060385, "loss": 0.4127, "step": 9690 }, { "gate_value": 0.1820317804813385, "icl_sequence_length": 78, "num_contexts": 3, "step": 9690 }, { "grad_norm": 0.1818414181470871, "learning_rate": 0.0002706268678562849, "loss": 0.4117, "step": 9700 }, { "gate_value": 0.1822560578584671, "icl_sequence_length": 82, "num_contexts": 3, "step": 9700 }, { "grad_norm": 0.4028673470020294, "learning_rate": 0.0002705531165646391, "loss": 0.3898, "step": 9710 }, { "gate_value": 0.1818368136882782, "icl_sequence_length": 94, "num_contexts": 3, "step": 9710 }, { "grad_norm": 0.12147530168294907, "learning_rate": 0.00027047928287607495, "loss": 0.4118, "step": 9720 }, { "gate_value": 0.182128444314003, "icl_sequence_length": 82, "num_contexts": 3, "step": 9720 }, { "grad_norm": 0.4396604299545288, "learning_rate": 0.000270405366841057, "loss": 0.4106, "step": 9730 }, { "gate_value": 0.18241579830646515, "icl_sequence_length": 92, "num_contexts": 3, "step": 9730 }, { "grad_norm": 0.09294986724853516, "learning_rate": 0.0002703313685101062, "loss": 0.3941, "step": 9740 }, { "gate_value": 0.18249233067035675, "icl_sequence_length": 84, "num_contexts": 3, "step": 9740 }, { "grad_norm": 0.15700602531433105, "learning_rate": 0.00027025728793379956, "loss": 0.412, "step": 9750 }, { "gate_value": 0.18315435945987701, "icl_sequence_length": 76, "num_contexts": 3, "step": 9750 }, { "grad_norm": 0.7230283617973328, "learning_rate": 0.0002701831251627707, "loss": 0.4121, "step": 9760 }, { "gate_value": 0.1829233020544052, "icl_sequence_length": 92, "num_contexts": 3, "step": 9760 }, { "grad_norm": 0.11449993401765823, "learning_rate": 0.000270108880247709, "loss": 0.397, "step": 9770 }, { "gate_value": 0.1825665831565857, "icl_sequence_length": 74, "num_contexts": 3, "step": 9770 }, { "grad_norm": 0.2630598247051239, "learning_rate": 0.00027003455323936014, "loss": 0.4077, "step": 9780 }, { "gate_value": 0.18319743871688843, "icl_sequence_length": 68, "num_contexts": 3, "step": 9780 }, { "grad_norm": 0.22726242244243622, "learning_rate": 0.00026996014418852616, "loss": 0.4094, "step": 9790 }, { "gate_value": 0.18323422968387604, "icl_sequence_length": 68, "num_contexts": 3, "step": 9790 }, { "grad_norm": 0.13267017900943756, "learning_rate": 0.0002698856531460646, "loss": 0.3891, "step": 9800 }, { "gate_value": 0.1828533113002777, "icl_sequence_length": 78, "num_contexts": 3, "step": 9800 }, { "grad_norm": 0.31722262501716614, "learning_rate": 0.0002698110801628897, "loss": 0.4149, "step": 9810 }, { "gate_value": 0.1827845424413681, "icl_sequence_length": 84, "num_contexts": 3, "step": 9810 }, { "grad_norm": 1.0422130823135376, "learning_rate": 0.0002697364252899713, "loss": 0.4124, "step": 9820 }, { "gate_value": 0.18345683813095093, "icl_sequence_length": 72, "num_contexts": 3, "step": 9820 }, { "grad_norm": 0.30988809466362, "learning_rate": 0.0002696616885783351, "loss": 0.4034, "step": 9830 }, { "gate_value": 0.18385672569274902, "icl_sequence_length": 70, "num_contexts": 3, "step": 9830 }, { "grad_norm": 0.5596709847450256, "learning_rate": 0.0002695868700790632, "loss": 0.4102, "step": 9840 }, { "gate_value": 0.18392261862754822, "icl_sequence_length": 76, "num_contexts": 3, "step": 9840 }, { "grad_norm": 0.3464175760746002, "learning_rate": 0.00026951196984329324, "loss": 0.3861, "step": 9850 }, { "gate_value": 0.18414254486560822, "icl_sequence_length": 92, "num_contexts": 3, "step": 9850 }, { "grad_norm": 0.6886278390884399, "learning_rate": 0.00026943698792221876, "loss": 0.4217, "step": 9860 }, { "gate_value": 0.1840921938419342, "icl_sequence_length": 86, "num_contexts": 3, "step": 9860 }, { "grad_norm": 0.1508173644542694, "learning_rate": 0.00026936192436708935, "loss": 0.4243, "step": 9870 }, { "gate_value": 0.1839398741722107, "icl_sequence_length": 76, "num_contexts": 3, "step": 9870 }, { "grad_norm": 0.7154667377471924, "learning_rate": 0.0002692867792292101, "loss": 0.3945, "step": 9880 }, { "gate_value": 0.18328236043453217, "icl_sequence_length": 74, "num_contexts": 3, "step": 9880 }, { "grad_norm": 0.453127920627594, "learning_rate": 0.000269211552559942, "loss": 0.3922, "step": 9890 }, { "gate_value": 0.1840886026620865, "icl_sequence_length": 72, "num_contexts": 3, "step": 9890 }, { "grad_norm": 0.4540349543094635, "learning_rate": 0.0002691362444107019, "loss": 0.4095, "step": 9900 }, { "gate_value": 0.18400152027606964, "icl_sequence_length": 72, "num_contexts": 3, "step": 9900 }, { "grad_norm": 0.09634116291999817, "learning_rate": 0.0002690608548329621, "loss": 0.3901, "step": 9910 }, { "gate_value": 0.1838119924068451, "icl_sequence_length": 86, "num_contexts": 3, "step": 9910 }, { "grad_norm": 0.20187024772167206, "learning_rate": 0.00026898538387825076, "loss": 0.3852, "step": 9920 }, { "gate_value": 0.18371149897575378, "icl_sequence_length": 74, "num_contexts": 3, "step": 9920 }, { "grad_norm": 0.14723998308181763, "learning_rate": 0.00026890983159815146, "loss": 0.4087, "step": 9930 }, { "gate_value": 0.18399539589881897, "icl_sequence_length": 84, "num_contexts": 3, "step": 9930 }, { "grad_norm": 0.4817863702774048, "learning_rate": 0.00026883419804430347, "loss": 0.4053, "step": 9940 }, { "gate_value": 0.18433444201946259, "icl_sequence_length": 68, "num_contexts": 3, "step": 9940 }, { "grad_norm": 0.10653712600469589, "learning_rate": 0.0002687584832684017, "loss": 0.4021, "step": 9950 }, { "gate_value": 0.18434855341911316, "icl_sequence_length": 68, "num_contexts": 3, "step": 9950 }, { "grad_norm": 0.11284265667200089, "learning_rate": 0.00026868268732219646, "loss": 0.3996, "step": 9960 }, { "gate_value": 0.18480272591114044, "icl_sequence_length": 62, "num_contexts": 3, "step": 9960 }, { "grad_norm": 1.6425282955169678, "learning_rate": 0.0002686068102574935, "loss": 0.4174, "step": 9970 }, { "gate_value": 0.18493328988552094, "icl_sequence_length": 96, "num_contexts": 3, "step": 9970 }, { "grad_norm": 0.20928919315338135, "learning_rate": 0.00026853085212615415, "loss": 0.4149, "step": 9980 }, { "gate_value": 0.18461304903030396, "icl_sequence_length": 80, "num_contexts": 3, "step": 9980 }, { "grad_norm": 0.1534522920846939, "learning_rate": 0.0002684548129800951, "loss": 0.4061, "step": 9990 }, { "gate_value": 0.1850883811712265, "icl_sequence_length": 82, "num_contexts": 3, "step": 9990 }, { "grad_norm": 0.11889275908470154, "learning_rate": 0.0002683786928712883, "loss": 0.4142, "step": 10000 }, { "gate_value": 0.1857037991285324, "icl_sequence_length": 82, "num_contexts": 3, "step": 10000 }, { "grad_norm": 0.10349398106336594, "learning_rate": 0.0002683024918517611, "loss": 0.4084, "step": 10010 }, { "gate_value": 0.18580205738544464, "icl_sequence_length": 78, "num_contexts": 3, "step": 10010 }, { "grad_norm": 0.20732319355010986, "learning_rate": 0.0002682262099735963, "loss": 0.4431, "step": 10020 }, { "gate_value": 0.18496911227703094, "icl_sequence_length": 76, "num_contexts": 3, "step": 10020 }, { "grad_norm": 0.08580266684293747, "learning_rate": 0.0002681498472889318, "loss": 0.4009, "step": 10030 }, { "gate_value": 0.18453511595726013, "icl_sequence_length": 78, "num_contexts": 3, "step": 10030 }, { "grad_norm": 0.14457011222839355, "learning_rate": 0.00026807340384996076, "loss": 0.4056, "step": 10040 }, { "gate_value": 0.18475934863090515, "icl_sequence_length": 76, "num_contexts": 3, "step": 10040 }, { "grad_norm": 0.11152706295251846, "learning_rate": 0.00026799687970893157, "loss": 0.4083, "step": 10050 }, { "gate_value": 0.1854647696018219, "icl_sequence_length": 76, "num_contexts": 3, "step": 10050 }, { "grad_norm": 0.20805339515209198, "learning_rate": 0.0002679202749181477, "loss": 0.4114, "step": 10060 }, { "gate_value": 0.1850779503583908, "icl_sequence_length": 70, "num_contexts": 3, "step": 10060 }, { "grad_norm": 0.12921452522277832, "learning_rate": 0.00026784358952996784, "loss": 0.3928, "step": 10070 }, { "gate_value": 0.18519233167171478, "icl_sequence_length": 82, "num_contexts": 3, "step": 10070 }, { "grad_norm": 0.6804535388946533, "learning_rate": 0.0002677668235968058, "loss": 0.4186, "step": 10080 }, { "gate_value": 0.18516965210437775, "icl_sequence_length": 80, "num_contexts": 3, "step": 10080 }, { "grad_norm": 0.214371919631958, "learning_rate": 0.0002676899771711303, "loss": 0.4086, "step": 10090 }, { "gate_value": 0.18558774888515472, "icl_sequence_length": 74, "num_contexts": 3, "step": 10090 }, { "grad_norm": 0.4606313407421112, "learning_rate": 0.0002676130503054651, "loss": 0.3933, "step": 10100 }, { "gate_value": 0.18566465377807617, "icl_sequence_length": 88, "num_contexts": 3, "step": 10100 }, { "grad_norm": 0.08777086436748505, "learning_rate": 0.00026753604305238904, "loss": 0.4084, "step": 10110 }, { "gate_value": 0.18632765114307404, "icl_sequence_length": 82, "num_contexts": 3, "step": 10110 }, { "grad_norm": 0.12021689116954803, "learning_rate": 0.00026745895546453587, "loss": 0.4083, "step": 10120 }, { "gate_value": 0.18685314059257507, "icl_sequence_length": 64, "num_contexts": 3, "step": 10120 }, { "grad_norm": 0.3062693774700165, "learning_rate": 0.0002673817875945942, "loss": 0.4227, "step": 10130 }, { "gate_value": 0.18733736872673035, "icl_sequence_length": 80, "num_contexts": 3, "step": 10130 }, { "grad_norm": 0.08411114662885666, "learning_rate": 0.0002673045394953076, "loss": 0.3954, "step": 10140 }, { "gate_value": 0.1871633529663086, "icl_sequence_length": 74, "num_contexts": 3, "step": 10140 }, { "grad_norm": 0.221735417842865, "learning_rate": 0.00026722721121947435, "loss": 0.3988, "step": 10150 }, { "gate_value": 0.1876184344291687, "icl_sequence_length": 86, "num_contexts": 3, "step": 10150 }, { "grad_norm": 0.2119043618440628, "learning_rate": 0.00026714980281994756, "loss": 0.4149, "step": 10160 }, { "gate_value": 0.18710008263587952, "icl_sequence_length": 88, "num_contexts": 3, "step": 10160 }, { "grad_norm": 0.2251426726579666, "learning_rate": 0.0002670723143496353, "loss": 0.3877, "step": 10170 }, { "gate_value": 0.18695615231990814, "icl_sequence_length": 90, "num_contexts": 3, "step": 10170 }, { "grad_norm": 0.12051714211702347, "learning_rate": 0.00026699474586150006, "loss": 0.4051, "step": 10180 }, { "gate_value": 0.18728826940059662, "icl_sequence_length": 70, "num_contexts": 3, "step": 10180 }, { "grad_norm": 0.15876959264278412, "learning_rate": 0.0002669170974085592, "loss": 0.3975, "step": 10190 }, { "gate_value": 0.18744824826717377, "icl_sequence_length": 84, "num_contexts": 3, "step": 10190 }, { "grad_norm": 0.06675142794847488, "learning_rate": 0.00026683936904388475, "loss": 0.4138, "step": 10200 }, { "gate_value": 0.18797366321086884, "icl_sequence_length": 80, "num_contexts": 3, "step": 10200 }, { "grad_norm": 0.1978006511926651, "learning_rate": 0.00026676156082060324, "loss": 0.4155, "step": 10210 }, { "gate_value": 0.1884395033121109, "icl_sequence_length": 94, "num_contexts": 3, "step": 10210 }, { "grad_norm": 0.13535474240779877, "learning_rate": 0.00026668367279189596, "loss": 0.4163, "step": 10220 }, { "gate_value": 0.18881042301654816, "icl_sequence_length": 88, "num_contexts": 3, "step": 10220 }, { "grad_norm": 0.3524298071861267, "learning_rate": 0.0002666057050109986, "loss": 0.3984, "step": 10230 }, { "gate_value": 0.1897391676902771, "icl_sequence_length": 76, "num_contexts": 3, "step": 10230 }, { "grad_norm": 0.11394257098436356, "learning_rate": 0.0002665276575312013, "loss": 0.4047, "step": 10240 }, { "gate_value": 0.18992333114147186, "icl_sequence_length": 88, "num_contexts": 3, "step": 10240 }, { "grad_norm": 0.19303439557552338, "learning_rate": 0.000266449530405849, "loss": 0.4021, "step": 10250 }, { "gate_value": 0.1900814026594162, "icl_sequence_length": 72, "num_contexts": 3, "step": 10250 }, { "grad_norm": 0.25334855914115906, "learning_rate": 0.0002663713236883406, "loss": 0.41, "step": 10260 }, { "gate_value": 0.19061408936977386, "icl_sequence_length": 76, "num_contexts": 3, "step": 10260 }, { "grad_norm": 0.13257992267608643, "learning_rate": 0.00026629303743212984, "loss": 0.4089, "step": 10270 }, { "gate_value": 0.19116652011871338, "icl_sequence_length": 82, "num_contexts": 3, "step": 10270 }, { "grad_norm": 0.10446831583976746, "learning_rate": 0.00026621467169072455, "loss": 0.4039, "step": 10280 }, { "gate_value": 0.19149279594421387, "icl_sequence_length": 84, "num_contexts": 3, "step": 10280 }, { "grad_norm": 0.18859893083572388, "learning_rate": 0.00026613622651768703, "loss": 0.3911, "step": 10290 }, { "gate_value": 0.19110308587551117, "icl_sequence_length": 70, "num_contexts": 3, "step": 10290 }, { "grad_norm": 0.1820632517337799, "learning_rate": 0.00026605770196663374, "loss": 0.4042, "step": 10300 }, { "gate_value": 0.19114728271961212, "icl_sequence_length": 82, "num_contexts": 3, "step": 10300 }, { "grad_norm": 0.2477792650461197, "learning_rate": 0.0002659790980912355, "loss": 0.4068, "step": 10310 }, { "gate_value": 0.1913151741027832, "icl_sequence_length": 84, "num_contexts": 3, "step": 10310 }, { "grad_norm": 0.14938776195049286, "learning_rate": 0.0002659004149452174, "loss": 0.4046, "step": 10320 }, { "gate_value": 0.19160404801368713, "icl_sequence_length": 84, "num_contexts": 3, "step": 10320 }, { "grad_norm": 0.09767202287912369, "learning_rate": 0.0002658216525823585, "loss": 0.3943, "step": 10330 }, { "gate_value": 0.1918257772922516, "icl_sequence_length": 82, "num_contexts": 3, "step": 10330 }, { "grad_norm": 0.10734827816486359, "learning_rate": 0.0002657428110564923, "loss": 0.4126, "step": 10340 }, { "gate_value": 0.1912240982055664, "icl_sequence_length": 68, "num_contexts": 3, "step": 10340 }, { "grad_norm": 0.0930911973118782, "learning_rate": 0.00026566389042150597, "loss": 0.408, "step": 10350 }, { "gate_value": 0.19152508676052094, "icl_sequence_length": 80, "num_contexts": 3, "step": 10350 }, { "grad_norm": 0.2743377089500427, "learning_rate": 0.0002655848907313413, "loss": 0.3837, "step": 10360 }, { "gate_value": 0.1920650601387024, "icl_sequence_length": 76, "num_contexts": 3, "step": 10360 }, { "grad_norm": 0.34478214383125305, "learning_rate": 0.00026550581203999365, "loss": 0.396, "step": 10370 }, { "gate_value": 0.19311079382896423, "icl_sequence_length": 70, "num_contexts": 3, "step": 10370 }, { "grad_norm": 0.8458656072616577, "learning_rate": 0.00026542665440151266, "loss": 0.4128, "step": 10380 }, { "gate_value": 0.19356413185596466, "icl_sequence_length": 82, "num_contexts": 3, "step": 10380 }, { "grad_norm": 0.08593804389238358, "learning_rate": 0.00026534741787000176, "loss": 0.4051, "step": 10390 }, { "gate_value": 0.1938723474740982, "icl_sequence_length": 80, "num_contexts": 3, "step": 10390 }, { "grad_norm": 0.11517023295164108, "learning_rate": 0.0002652681024996185, "loss": 0.4116, "step": 10400 }, { "gate_value": 0.19354356825351715, "icl_sequence_length": 64, "num_contexts": 3, "step": 10400 }, { "grad_norm": 0.07598927617073059, "learning_rate": 0.000265188708344574, "loss": 0.3978, "step": 10410 }, { "gate_value": 0.19409018754959106, "icl_sequence_length": 86, "num_contexts": 3, "step": 10410 }, { "grad_norm": 0.2558054029941559, "learning_rate": 0.00026510923545913355, "loss": 0.3929, "step": 10420 }, { "gate_value": 0.19420674443244934, "icl_sequence_length": 92, "num_contexts": 3, "step": 10420 }, { "grad_norm": 0.09162849932909012, "learning_rate": 0.0002650296838976161, "loss": 0.4059, "step": 10430 }, { "gate_value": 0.19449542462825775, "icl_sequence_length": 88, "num_contexts": 3, "step": 10430 }, { "grad_norm": 0.15429988503456116, "learning_rate": 0.00026495005371439433, "loss": 0.3918, "step": 10440 }, { "gate_value": 0.19442659616470337, "icl_sequence_length": 88, "num_contexts": 3, "step": 10440 }, { "grad_norm": 0.2850008010864258, "learning_rate": 0.00026487034496389475, "loss": 0.4045, "step": 10450 }, { "gate_value": 0.19495940208435059, "icl_sequence_length": 84, "num_contexts": 3, "step": 10450 }, { "grad_norm": 0.060812897980213165, "learning_rate": 0.00026479055770059755, "loss": 0.4023, "step": 10460 }, { "gate_value": 0.19487933814525604, "icl_sequence_length": 84, "num_contexts": 3, "step": 10460 }, { "grad_norm": 0.14067623019218445, "learning_rate": 0.0002647106919790366, "loss": 0.4084, "step": 10470 }, { "gate_value": 0.19565919041633606, "icl_sequence_length": 70, "num_contexts": 3, "step": 10470 }, { "grad_norm": 0.5360892415046692, "learning_rate": 0.00026463074785379936, "loss": 0.412, "step": 10480 }, { "gate_value": 0.19578729569911957, "icl_sequence_length": 90, "num_contexts": 3, "step": 10480 }, { "grad_norm": 0.1606275886297226, "learning_rate": 0.00026455072537952685, "loss": 0.4003, "step": 10490 }, { "gate_value": 0.19515106081962585, "icl_sequence_length": 70, "num_contexts": 3, "step": 10490 }, { "grad_norm": 0.1275758445262909, "learning_rate": 0.00026447062461091366, "loss": 0.4102, "step": 10500 }, { "gate_value": 0.19452591240406036, "icl_sequence_length": 90, "num_contexts": 3, "step": 10500 }, { "grad_norm": 0.757901668548584, "learning_rate": 0.000264390445602708, "loss": 0.4105, "step": 10510 }, { "gate_value": 0.19385652244091034, "icl_sequence_length": 70, "num_contexts": 3, "step": 10510 }, { "grad_norm": 0.23169085383415222, "learning_rate": 0.00026431018840971136, "loss": 0.393, "step": 10520 }, { "gate_value": 0.19386009871959686, "icl_sequence_length": 88, "num_contexts": 3, "step": 10520 }, { "grad_norm": 0.09247046709060669, "learning_rate": 0.0002642298530867788, "loss": 0.3884, "step": 10530 }, { "gate_value": 0.1936485320329666, "icl_sequence_length": 78, "num_contexts": 3, "step": 10530 }, { "grad_norm": 0.1239648163318634, "learning_rate": 0.0002641494396888188, "loss": 0.3954, "step": 10540 }, { "gate_value": 0.193940669298172, "icl_sequence_length": 72, "num_contexts": 3, "step": 10540 }, { "grad_norm": 0.08281738311052322, "learning_rate": 0.00026406894827079317, "loss": 0.3819, "step": 10550 }, { "gate_value": 0.19448955357074738, "icl_sequence_length": 74, "num_contexts": 3, "step": 10550 }, { "grad_norm": 0.2577625811100006, "learning_rate": 0.000263988378887717, "loss": 0.4143, "step": 10560 }, { "gate_value": 0.19554971158504486, "icl_sequence_length": 66, "num_contexts": 3, "step": 10560 }, { "grad_norm": 0.1077817901968956, "learning_rate": 0.0002639077315946587, "loss": 0.4029, "step": 10570 }, { "gate_value": 0.19615165889263153, "icl_sequence_length": 82, "num_contexts": 3, "step": 10570 }, { "grad_norm": 0.24295492470264435, "learning_rate": 0.0002638270064467399, "loss": 0.4179, "step": 10580 }, { "gate_value": 0.19590169191360474, "icl_sequence_length": 78, "num_contexts": 3, "step": 10580 }, { "grad_norm": 0.1465950757265091, "learning_rate": 0.00026374620349913554, "loss": 0.3956, "step": 10590 }, { "gate_value": 0.196251779794693, "icl_sequence_length": 90, "num_contexts": 3, "step": 10590 }, { "grad_norm": 0.1468675583600998, "learning_rate": 0.00026366532280707366, "loss": 0.3949, "step": 10600 }, { "gate_value": 0.19655568897724152, "icl_sequence_length": 76, "num_contexts": 3, "step": 10600 }, { "grad_norm": 0.25693845748901367, "learning_rate": 0.00026358436442583546, "loss": 0.4107, "step": 10610 }, { "gate_value": 0.19645854830741882, "icl_sequence_length": 92, "num_contexts": 3, "step": 10610 }, { "grad_norm": 0.6391911506652832, "learning_rate": 0.0002635033284107552, "loss": 0.3942, "step": 10620 }, { "gate_value": 0.195563405752182, "icl_sequence_length": 84, "num_contexts": 3, "step": 10620 }, { "grad_norm": 0.08769239485263824, "learning_rate": 0.00026342221481722025, "loss": 0.4048, "step": 10630 }, { "gate_value": 0.1954774558544159, "icl_sequence_length": 74, "num_contexts": 3, "step": 10630 }, { "grad_norm": 0.8492870926856995, "learning_rate": 0.00026334102370067093, "loss": 0.4027, "step": 10640 }, { "gate_value": 0.19497144222259521, "icl_sequence_length": 86, "num_contexts": 3, "step": 10640 }, { "grad_norm": 0.1712755709886551, "learning_rate": 0.00026325975511660066, "loss": 0.4122, "step": 10650 }, { "gate_value": 0.19459974765777588, "icl_sequence_length": 86, "num_contexts": 3, "step": 10650 }, { "grad_norm": 0.0769922062754631, "learning_rate": 0.00026317840912055577, "loss": 0.3907, "step": 10660 }, { "gate_value": 0.19528329372406006, "icl_sequence_length": 80, "num_contexts": 3, "step": 10660 }, { "grad_norm": 0.09351008385419846, "learning_rate": 0.00026309698576813546, "loss": 0.3861, "step": 10670 }, { "gate_value": 0.19526593387126923, "icl_sequence_length": 74, "num_contexts": 3, "step": 10670 }, { "grad_norm": 0.10859563201665878, "learning_rate": 0.00026301548511499187, "loss": 0.3891, "step": 10680 }, { "gate_value": 0.1956755518913269, "icl_sequence_length": 86, "num_contexts": 3, "step": 10680 }, { "grad_norm": 0.09231425821781158, "learning_rate": 0.0002629339072168298, "loss": 0.4035, "step": 10690 }, { "gate_value": 0.19672146439552307, "icl_sequence_length": 86, "num_contexts": 3, "step": 10690 }, { "grad_norm": 0.1549149751663208, "learning_rate": 0.00026285225212940703, "loss": 0.3961, "step": 10700 }, { "gate_value": 0.19741348922252655, "icl_sequence_length": 86, "num_contexts": 3, "step": 10700 }, { "grad_norm": 0.13633489608764648, "learning_rate": 0.0002627705199085341, "loss": 0.3813, "step": 10710 }, { "gate_value": 0.1971900761127472, "icl_sequence_length": 74, "num_contexts": 3, "step": 10710 }, { "grad_norm": 0.22589802742004395, "learning_rate": 0.0002626887106100742, "loss": 0.4179, "step": 10720 }, { "gate_value": 0.19635334610939026, "icl_sequence_length": 92, "num_contexts": 3, "step": 10720 }, { "grad_norm": 0.14190012216567993, "learning_rate": 0.0002626068242899432, "loss": 0.418, "step": 10730 }, { "gate_value": 0.1962626874446869, "icl_sequence_length": 66, "num_contexts": 3, "step": 10730 }, { "grad_norm": 0.19081388413906097, "learning_rate": 0.0002625248610041095, "loss": 0.3844, "step": 10740 }, { "gate_value": 0.19717535376548767, "icl_sequence_length": 86, "num_contexts": 3, "step": 10740 }, { "grad_norm": 0.1003599539399147, "learning_rate": 0.0002624428208085945, "loss": 0.4141, "step": 10750 }, { "gate_value": 0.19717150926589966, "icl_sequence_length": 74, "num_contexts": 3, "step": 10750 }, { "grad_norm": 0.9871111512184143, "learning_rate": 0.0002623607037594717, "loss": 0.4207, "step": 10760 }, { "gate_value": 0.19743303954601288, "icl_sequence_length": 76, "num_contexts": 3, "step": 10760 }, { "grad_norm": 0.10571835190057755, "learning_rate": 0.0002622785099128673, "loss": 0.392, "step": 10770 }, { "gate_value": 0.197885200381279, "icl_sequence_length": 88, "num_contexts": 3, "step": 10770 }, { "grad_norm": 1.096756100654602, "learning_rate": 0.0002621962393249602, "loss": 0.4089, "step": 10780 }, { "gate_value": 0.19861187040805817, "icl_sequence_length": 74, "num_contexts": 3, "step": 10780 }, { "grad_norm": 4.666723251342773, "learning_rate": 0.0002621138920519814, "loss": 0.3928, "step": 10790 }, { "gate_value": 0.1985689103603363, "icl_sequence_length": 80, "num_contexts": 3, "step": 10790 }, { "grad_norm": 0.28258785605430603, "learning_rate": 0.0002620314681502146, "loss": 0.4085, "step": 10800 }, { "gate_value": 0.19868691265583038, "icl_sequence_length": 78, "num_contexts": 3, "step": 10800 }, { "grad_norm": 0.14337381720542908, "learning_rate": 0.00026194896767599567, "loss": 0.4042, "step": 10810 }, { "gate_value": 0.19859598577022552, "icl_sequence_length": 78, "num_contexts": 3, "step": 10810 }, { "grad_norm": 0.19911415874958038, "learning_rate": 0.000261866390685713, "loss": 0.4034, "step": 10820 }, { "gate_value": 0.19771799445152283, "icl_sequence_length": 62, "num_contexts": 3, "step": 10820 }, { "grad_norm": 0.13793812692165375, "learning_rate": 0.0002617837372358071, "loss": 0.3974, "step": 10830 }, { "gate_value": 0.19785933196544647, "icl_sequence_length": 94, "num_contexts": 3, "step": 10830 }, { "grad_norm": 0.2408376932144165, "learning_rate": 0.00026170100738277086, "loss": 0.4094, "step": 10840 }, { "gate_value": 0.19815245270729065, "icl_sequence_length": 78, "num_contexts": 3, "step": 10840 }, { "grad_norm": 5.759314060211182, "learning_rate": 0.0002616182011831493, "loss": 0.3989, "step": 10850 }, { "gate_value": 0.1983094960451126, "icl_sequence_length": 88, "num_contexts": 3, "step": 10850 }, { "grad_norm": 0.5932163596153259, "learning_rate": 0.00026153531869353984, "loss": 0.3882, "step": 10860 }, { "gate_value": 0.19824165105819702, "icl_sequence_length": 68, "num_contexts": 3, "step": 10860 }, { "grad_norm": 0.16285298764705658, "learning_rate": 0.0002614523599705917, "loss": 0.3957, "step": 10870 }, { "gate_value": 0.19801777601242065, "icl_sequence_length": 82, "num_contexts": 3, "step": 10870 }, { "grad_norm": 0.10833615809679031, "learning_rate": 0.0002613693250710065, "loss": 0.4188, "step": 10880 }, { "gate_value": 0.1979728490114212, "icl_sequence_length": 88, "num_contexts": 3, "step": 10880 }, { "grad_norm": 0.27149438858032227, "learning_rate": 0.00026128621405153773, "loss": 0.3914, "step": 10890 }, { "gate_value": 0.19862493872642517, "icl_sequence_length": 78, "num_contexts": 3, "step": 10890 }, { "grad_norm": 0.18332615494728088, "learning_rate": 0.000261203026968991, "loss": 0.4009, "step": 10900 }, { "gate_value": 0.1991543471813202, "icl_sequence_length": 84, "num_contexts": 3, "step": 10900 }, { "grad_norm": 0.2317703515291214, "learning_rate": 0.0002611197638802239, "loss": 0.3942, "step": 10910 }, { "gate_value": 0.19955024123191833, "icl_sequence_length": 84, "num_contexts": 3, "step": 10910 }, { "grad_norm": 0.31707388162612915, "learning_rate": 0.000261036424842146, "loss": 0.39, "step": 10920 }, { "gate_value": 0.19923262298107147, "icl_sequence_length": 76, "num_contexts": 3, "step": 10920 }, { "grad_norm": 0.15591078996658325, "learning_rate": 0.0002609530099117188, "loss": 0.3914, "step": 10930 }, { "gate_value": 0.1992308497428894, "icl_sequence_length": 82, "num_contexts": 3, "step": 10930 }, { "grad_norm": 0.17535750567913055, "learning_rate": 0.0002608695191459555, "loss": 0.4069, "step": 10940 }, { "gate_value": 0.19915905594825745, "icl_sequence_length": 72, "num_contexts": 3, "step": 10940 }, { "grad_norm": 0.18063399195671082, "learning_rate": 0.00026078595260192137, "loss": 0.3959, "step": 10950 }, { "gate_value": 0.19885124266147614, "icl_sequence_length": 76, "num_contexts": 3, "step": 10950 }, { "grad_norm": 0.39659494161605835, "learning_rate": 0.00026070231033673317, "loss": 0.4133, "step": 10960 }, { "gate_value": 0.198845773935318, "icl_sequence_length": 94, "num_contexts": 3, "step": 10960 }, { "grad_norm": 0.3130630552768707, "learning_rate": 0.00026061859240755975, "loss": 0.4056, "step": 10970 }, { "gate_value": 0.1990204006433487, "icl_sequence_length": 76, "num_contexts": 3, "step": 10970 }, { "grad_norm": 0.2690387964248657, "learning_rate": 0.00026053479887162156, "loss": 0.3996, "step": 10980 }, { "gate_value": 0.19938315451145172, "icl_sequence_length": 56, "num_contexts": 3, "step": 10980 }, { "grad_norm": 0.519874632358551, "learning_rate": 0.0002604509297861905, "loss": 0.3991, "step": 10990 }, { "gate_value": 0.19984617829322815, "icl_sequence_length": 88, "num_contexts": 3, "step": 10990 }, { "grad_norm": 0.3831429183483124, "learning_rate": 0.00026036698520859054, "loss": 0.4068, "step": 11000 }, { "gate_value": 0.1997131109237671, "icl_sequence_length": 90, "num_contexts": 3, "step": 11000 }, { "grad_norm": 0.3186333477497101, "learning_rate": 0.0002602829651961968, "loss": 0.4008, "step": 11010 }, { "gate_value": 0.20006214082241058, "icl_sequence_length": 70, "num_contexts": 3, "step": 11010 }, { "grad_norm": 0.6247459650039673, "learning_rate": 0.0002601988698064363, "loss": 0.3934, "step": 11020 }, { "gate_value": 0.2011529803276062, "icl_sequence_length": 94, "num_contexts": 3, "step": 11020 }, { "grad_norm": 0.5456902384757996, "learning_rate": 0.0002601146990967874, "loss": 0.3996, "step": 11030 }, { "gate_value": 0.2017337530851364, "icl_sequence_length": 74, "num_contexts": 3, "step": 11030 }, { "grad_norm": 0.6686020493507385, "learning_rate": 0.00026003045312477996, "loss": 0.41, "step": 11040 }, { "gate_value": 0.20241455733776093, "icl_sequence_length": 76, "num_contexts": 3, "step": 11040 }, { "grad_norm": 0.33470675349235535, "learning_rate": 0.0002599461319479954, "loss": 0.4025, "step": 11050 }, { "gate_value": 0.20253930985927582, "icl_sequence_length": 90, "num_contexts": 3, "step": 11050 }, { "grad_norm": 0.2852339446544647, "learning_rate": 0.0002598617356240663, "loss": 0.4074, "step": 11060 }, { "gate_value": 0.20202483236789703, "icl_sequence_length": 90, "num_contexts": 3, "step": 11060 }, { "grad_norm": 3.269089460372925, "learning_rate": 0.00025977726421067687, "loss": 0.4201, "step": 11070 }, { "gate_value": 0.20169176161289215, "icl_sequence_length": 72, "num_contexts": 3, "step": 11070 }, { "grad_norm": 0.577283501625061, "learning_rate": 0.0002596927177655625, "loss": 0.3903, "step": 11080 }, { "gate_value": 0.20160920917987823, "icl_sequence_length": 92, "num_contexts": 3, "step": 11080 }, { "grad_norm": 0.5205775499343872, "learning_rate": 0.0002596080963465099, "loss": 0.4123, "step": 11090 }, { "gate_value": 0.2019902616739273, "icl_sequence_length": 88, "num_contexts": 3, "step": 11090 }, { "grad_norm": 0.5158397555351257, "learning_rate": 0.00025952340001135694, "loss": 0.3929, "step": 11100 }, { "gate_value": 0.2014622986316681, "icl_sequence_length": 86, "num_contexts": 3, "step": 11100 }, { "grad_norm": 0.3267214000225067, "learning_rate": 0.00025943862881799287, "loss": 0.3998, "step": 11110 }, { "gate_value": 0.201474130153656, "icl_sequence_length": 74, "num_contexts": 3, "step": 11110 }, { "grad_norm": 0.4279942214488983, "learning_rate": 0.0002593537828243579, "loss": 0.3862, "step": 11120 }, { "gate_value": 0.20170217752456665, "icl_sequence_length": 82, "num_contexts": 3, "step": 11120 }, { "grad_norm": 0.2724282741546631, "learning_rate": 0.0002592688620884435, "loss": 0.3939, "step": 11130 }, { "gate_value": 0.20227088034152985, "icl_sequence_length": 68, "num_contexts": 3, "step": 11130 }, { "grad_norm": 0.24624592065811157, "learning_rate": 0.0002591838666682922, "loss": 0.3959, "step": 11140 }, { "gate_value": 0.20264950394630432, "icl_sequence_length": 72, "num_contexts": 3, "step": 11140 }, { "grad_norm": 0.17288321256637573, "learning_rate": 0.0002590987966219976, "loss": 0.3931, "step": 11150 }, { "gate_value": 0.2034718543291092, "icl_sequence_length": 74, "num_contexts": 3, "step": 11150 }, { "grad_norm": 0.12512151896953583, "learning_rate": 0.00025901365200770433, "loss": 0.3943, "step": 11160 }, { "gate_value": 0.2034679651260376, "icl_sequence_length": 78, "num_contexts": 3, "step": 11160 }, { "grad_norm": 0.22475214302539825, "learning_rate": 0.00025892843288360777, "loss": 0.424, "step": 11170 }, { "gate_value": 0.2031925916671753, "icl_sequence_length": 82, "num_contexts": 3, "step": 11170 }, { "grad_norm": 0.11940725892782211, "learning_rate": 0.0002588431393079544, "loss": 0.4023, "step": 11180 }, { "gate_value": 0.20355471968650818, "icl_sequence_length": 76, "num_contexts": 3, "step": 11180 }, { "grad_norm": 0.16159369051456451, "learning_rate": 0.00025875777133904177, "loss": 0.4014, "step": 11190 }, { "gate_value": 0.20374581217765808, "icl_sequence_length": 86, "num_contexts": 3, "step": 11190 }, { "grad_norm": 1.926444172859192, "learning_rate": 0.0002586723290352179, "loss": 0.3897, "step": 11200 }, { "gate_value": 0.2042069137096405, "icl_sequence_length": 78, "num_contexts": 3, "step": 11200 }, { "grad_norm": 0.15389111638069153, "learning_rate": 0.0002585868124548819, "loss": 0.3937, "step": 11210 }, { "gate_value": 0.2046923041343689, "icl_sequence_length": 78, "num_contexts": 3, "step": 11210 }, { "grad_norm": 0.4765077233314514, "learning_rate": 0.0002585012216564834, "loss": 0.4035, "step": 11220 }, { "gate_value": 0.2043992131948471, "icl_sequence_length": 62, "num_contexts": 3, "step": 11220 }, { "grad_norm": 0.20746025443077087, "learning_rate": 0.00025841555669852307, "loss": 0.4032, "step": 11230 }, { "gate_value": 0.20465603470802307, "icl_sequence_length": 66, "num_contexts": 3, "step": 11230 }, { "grad_norm": 2.6083579063415527, "learning_rate": 0.00025832981763955205, "loss": 0.391, "step": 11240 }, { "gate_value": 0.2050364762544632, "icl_sequence_length": 80, "num_contexts": 3, "step": 11240 }, { "grad_norm": 0.14017367362976074, "learning_rate": 0.0002582440045381721, "loss": 0.3997, "step": 11250 }, { "gate_value": 0.20489144325256348, "icl_sequence_length": 82, "num_contexts": 3, "step": 11250 }, { "grad_norm": 0.21358314156532288, "learning_rate": 0.0002581581174530357, "loss": 0.407, "step": 11260 }, { "gate_value": 0.20463384687900543, "icl_sequence_length": 70, "num_contexts": 3, "step": 11260 }, { "grad_norm": 0.5433079600334167, "learning_rate": 0.000258072156442846, "loss": 0.398, "step": 11270 }, { "gate_value": 0.20472584664821625, "icl_sequence_length": 76, "num_contexts": 3, "step": 11270 }, { "grad_norm": 0.2550680339336395, "learning_rate": 0.0002579861215663564, "loss": 0.4106, "step": 11280 }, { "gate_value": 0.2049986571073532, "icl_sequence_length": 64, "num_contexts": 3, "step": 11280 }, { "grad_norm": 5.18765926361084, "learning_rate": 0.00025790001288237093, "loss": 0.4125, "step": 11290 }, { "gate_value": 0.20544874668121338, "icl_sequence_length": 70, "num_contexts": 3, "step": 11290 }, { "grad_norm": 0.8696367740631104, "learning_rate": 0.00025781383044974415, "loss": 0.4079, "step": 11300 }, { "gate_value": 0.20583544671535492, "icl_sequence_length": 70, "num_contexts": 3, "step": 11300 }, { "grad_norm": 0.540113091468811, "learning_rate": 0.0002577275743273808, "loss": 0.4053, "step": 11310 }, { "gate_value": 0.20618411898612976, "icl_sequence_length": 92, "num_contexts": 3, "step": 11310 }, { "grad_norm": 0.2631917893886566, "learning_rate": 0.00025764124457423627, "loss": 0.4173, "step": 11320 }, { "gate_value": 0.2062470018863678, "icl_sequence_length": 78, "num_contexts": 3, "step": 11320 }, { "grad_norm": 0.4374353885650635, "learning_rate": 0.00025755484124931604, "loss": 0.4042, "step": 11330 }, { "gate_value": 0.20691519975662231, "icl_sequence_length": 74, "num_contexts": 3, "step": 11330 }, { "grad_norm": 0.11059547960758209, "learning_rate": 0.000257468364411676, "loss": 0.3971, "step": 11340 }, { "gate_value": 0.20722566545009613, "icl_sequence_length": 70, "num_contexts": 3, "step": 11340 }, { "grad_norm": 1.345912218093872, "learning_rate": 0.0002573818141204222, "loss": 0.3874, "step": 11350 }, { "gate_value": 0.20745126903057098, "icl_sequence_length": 88, "num_contexts": 3, "step": 11350 }, { "grad_norm": 0.13886769115924835, "learning_rate": 0.0002572951904347111, "loss": 0.4211, "step": 11360 }, { "gate_value": 0.20737117528915405, "icl_sequence_length": 86, "num_contexts": 3, "step": 11360 }, { "grad_norm": 0.12669552862644196, "learning_rate": 0.000257208493413749, "loss": 0.4001, "step": 11370 }, { "gate_value": 0.2072693556547165, "icl_sequence_length": 86, "num_contexts": 3, "step": 11370 }, { "grad_norm": 0.19020166993141174, "learning_rate": 0.00025712172311679254, "loss": 0.4082, "step": 11380 }, { "gate_value": 0.20699158310890198, "icl_sequence_length": 64, "num_contexts": 3, "step": 11380 }, { "grad_norm": 1.0254766941070557, "learning_rate": 0.0002570348796031485, "loss": 0.4116, "step": 11390 }, { "gate_value": 0.2072145640850067, "icl_sequence_length": 86, "num_contexts": 3, "step": 11390 }, { "grad_norm": 0.566098690032959, "learning_rate": 0.0002569479629321735, "loss": 0.3974, "step": 11400 }, { "gate_value": 0.2072124481201172, "icl_sequence_length": 90, "num_contexts": 3, "step": 11400 }, { "grad_norm": 0.15408563613891602, "learning_rate": 0.0002568609731632743, "loss": 0.4004, "step": 11410 }, { "gate_value": 0.20725180208683014, "icl_sequence_length": 76, "num_contexts": 3, "step": 11410 }, { "grad_norm": 0.1329866200685501, "learning_rate": 0.00025677391035590764, "loss": 0.4095, "step": 11420 }, { "gate_value": 0.20727354288101196, "icl_sequence_length": 80, "num_contexts": 3, "step": 11420 }, { "grad_norm": 0.20672903954982758, "learning_rate": 0.00025668677456957997, "loss": 0.4015, "step": 11430 }, { "gate_value": 0.20741140842437744, "icl_sequence_length": 76, "num_contexts": 3, "step": 11430 }, { "grad_norm": 2.841158390045166, "learning_rate": 0.00025659956586384795, "loss": 0.4014, "step": 11440 }, { "gate_value": 0.20806585252285004, "icl_sequence_length": 78, "num_contexts": 3, "step": 11440 }, { "grad_norm": 0.2784873843193054, "learning_rate": 0.00025651228429831777, "loss": 0.3921, "step": 11450 }, { "gate_value": 0.2081623524427414, "icl_sequence_length": 80, "num_contexts": 3, "step": 11450 }, { "grad_norm": 0.27490857243537903, "learning_rate": 0.00025642492993264564, "loss": 0.3905, "step": 11460 }, { "gate_value": 0.20837196707725525, "icl_sequence_length": 94, "num_contexts": 3, "step": 11460 }, { "grad_norm": 0.17722657322883606, "learning_rate": 0.00025633750282653744, "loss": 0.4118, "step": 11470 }, { "gate_value": 0.20847874879837036, "icl_sequence_length": 80, "num_contexts": 3, "step": 11470 }, { "grad_norm": 0.22108136117458344, "learning_rate": 0.0002562500030397488, "loss": 0.4031, "step": 11480 }, { "gate_value": 0.20883841812610626, "icl_sequence_length": 88, "num_contexts": 3, "step": 11480 }, { "grad_norm": 7.156761646270752, "learning_rate": 0.0002561624306320849, "loss": 0.3949, "step": 11490 }, { "gate_value": 0.2090214192867279, "icl_sequence_length": 88, "num_contexts": 3, "step": 11490 }, { "grad_norm": 1.761451005935669, "learning_rate": 0.0002560747856634007, "loss": 0.3936, "step": 11500 }, { "gate_value": 0.20910920202732086, "icl_sequence_length": 90, "num_contexts": 3, "step": 11500 }, { "grad_norm": 0.25213727355003357, "learning_rate": 0.00025598706819360083, "loss": 0.379, "step": 11510 }, { "gate_value": 0.20905554294586182, "icl_sequence_length": 88, "num_contexts": 3, "step": 11510 }, { "grad_norm": 0.11987407505512238, "learning_rate": 0.00025589927828263914, "loss": 0.4081, "step": 11520 }, { "gate_value": 0.20902156829833984, "icl_sequence_length": 70, "num_contexts": 3, "step": 11520 }, { "grad_norm": 0.22566959261894226, "learning_rate": 0.00025581141599051937, "loss": 0.4216, "step": 11530 }, { "gate_value": 0.20962709188461304, "icl_sequence_length": 70, "num_contexts": 3, "step": 11530 }, { "grad_norm": 0.4890899658203125, "learning_rate": 0.0002557234813772945, "loss": 0.4087, "step": 11540 }, { "gate_value": 0.2097298502922058, "icl_sequence_length": 82, "num_contexts": 3, "step": 11540 }, { "grad_norm": 0.2041608691215515, "learning_rate": 0.00025563547450306703, "loss": 0.408, "step": 11550 }, { "gate_value": 0.20969891548156738, "icl_sequence_length": 92, "num_contexts": 3, "step": 11550 }, { "grad_norm": 1.2819507122039795, "learning_rate": 0.0002555473954279888, "loss": 0.4097, "step": 11560 }, { "gate_value": 0.20980440080165863, "icl_sequence_length": 76, "num_contexts": 3, "step": 11560 }, { "grad_norm": 0.22319957613945007, "learning_rate": 0.00025545924421226107, "loss": 0.4189, "step": 11570 }, { "gate_value": 0.20961406826972961, "icl_sequence_length": 70, "num_contexts": 3, "step": 11570 }, { "grad_norm": 0.2318154275417328, "learning_rate": 0.00025537102091613434, "loss": 0.4078, "step": 11580 }, { "gate_value": 0.20991379022598267, "icl_sequence_length": 76, "num_contexts": 3, "step": 11580 }, { "grad_norm": 0.2543465495109558, "learning_rate": 0.0002552827255999084, "loss": 0.3914, "step": 11590 }, { "gate_value": 0.21022337675094604, "icl_sequence_length": 74, "num_contexts": 3, "step": 11590 }, { "grad_norm": 0.8110744953155518, "learning_rate": 0.00025519435832393225, "loss": 0.3871, "step": 11600 }, { "gate_value": 0.2104700356721878, "icl_sequence_length": 80, "num_contexts": 3, "step": 11600 }, { "grad_norm": 0.1309548169374466, "learning_rate": 0.0002551059191486041, "loss": 0.4099, "step": 11610 }, { "gate_value": 0.21059533953666687, "icl_sequence_length": 70, "num_contexts": 3, "step": 11610 }, { "grad_norm": 0.6616964936256409, "learning_rate": 0.00025501740813437137, "loss": 0.389, "step": 11620 }, { "gate_value": 0.21070751547813416, "icl_sequence_length": 70, "num_contexts": 3, "step": 11620 }, { "grad_norm": 0.5464457869529724, "learning_rate": 0.00025492882534173037, "loss": 0.3947, "step": 11630 }, { "gate_value": 0.21091219782829285, "icl_sequence_length": 84, "num_contexts": 3, "step": 11630 }, { "grad_norm": 8.245672225952148, "learning_rate": 0.0002548401708312267, "loss": 0.3988, "step": 11640 }, { "gate_value": 0.21120022237300873, "icl_sequence_length": 78, "num_contexts": 3, "step": 11640 }, { "grad_norm": 0.2549119293689728, "learning_rate": 0.0002547514446634548, "loss": 0.3989, "step": 11650 }, { "gate_value": 0.2115693986415863, "icl_sequence_length": 84, "num_contexts": 3, "step": 11650 }, { "grad_norm": 1.0499767065048218, "learning_rate": 0.00025466264689905826, "loss": 0.3905, "step": 11660 }, { "gate_value": 0.2116161584854126, "icl_sequence_length": 84, "num_contexts": 3, "step": 11660 }, { "grad_norm": 0.4546162188053131, "learning_rate": 0.00025457377759872946, "loss": 0.4116, "step": 11670 }, { "gate_value": 0.2118997722864151, "icl_sequence_length": 82, "num_contexts": 3, "step": 11670 }, { "grad_norm": 0.4323030710220337, "learning_rate": 0.00025448483682320976, "loss": 0.401, "step": 11680 }, { "gate_value": 0.21170073747634888, "icl_sequence_length": 78, "num_contexts": 3, "step": 11680 }, { "grad_norm": 2.9688122272491455, "learning_rate": 0.00025439582463328937, "loss": 0.4011, "step": 11690 }, { "gate_value": 0.21142630279064178, "icl_sequence_length": 92, "num_contexts": 3, "step": 11690 }, { "grad_norm": 0.3110814094543457, "learning_rate": 0.00025430674108980713, "loss": 0.3959, "step": 11700 }, { "gate_value": 0.21202224493026733, "icl_sequence_length": 84, "num_contexts": 3, "step": 11700 }, { "grad_norm": 0.1604657620191574, "learning_rate": 0.000254217586253651, "loss": 0.3987, "step": 11710 }, { "gate_value": 0.21249915659427643, "icl_sequence_length": 74, "num_contexts": 3, "step": 11710 }, { "grad_norm": 0.7040252089500427, "learning_rate": 0.0002541283601857573, "loss": 0.3869, "step": 11720 }, { "gate_value": 0.21285098791122437, "icl_sequence_length": 72, "num_contexts": 3, "step": 11720 }, { "grad_norm": 0.8333344459533691, "learning_rate": 0.00025403906294711135, "loss": 0.3844, "step": 11730 }, { "gate_value": 0.21329067647457123, "icl_sequence_length": 88, "num_contexts": 3, "step": 11730 }, { "grad_norm": 0.3842971622943878, "learning_rate": 0.0002539496945987469, "loss": 0.3995, "step": 11740 }, { "gate_value": 0.21374772489070892, "icl_sequence_length": 80, "num_contexts": 3, "step": 11740 }, { "grad_norm": 0.16032423079013824, "learning_rate": 0.00025386025520174636, "loss": 0.3798, "step": 11750 }, { "gate_value": 0.21357718110084534, "icl_sequence_length": 76, "num_contexts": 3, "step": 11750 }, { "grad_norm": 12.013026237487793, "learning_rate": 0.0002537707448172407, "loss": 0.4131, "step": 11760 }, { "gate_value": 0.21378053724765778, "icl_sequence_length": 84, "num_contexts": 3, "step": 11760 }, { "grad_norm": 0.17190168797969818, "learning_rate": 0.0002536811635064095, "loss": 0.3908, "step": 11770 }, { "gate_value": 0.21405042707920074, "icl_sequence_length": 76, "num_contexts": 3, "step": 11770 }, { "grad_norm": 0.15969933569431305, "learning_rate": 0.00025359151133048073, "loss": 0.3995, "step": 11780 }, { "gate_value": 0.2140841782093048, "icl_sequence_length": 86, "num_contexts": 3, "step": 11780 }, { "grad_norm": 0.2060869038105011, "learning_rate": 0.0002535017883507307, "loss": 0.4008, "step": 11790 }, { "gate_value": 0.21388085186481476, "icl_sequence_length": 76, "num_contexts": 3, "step": 11790 }, { "grad_norm": 0.09752976894378662, "learning_rate": 0.0002534119946284844, "loss": 0.3981, "step": 11800 }, { "gate_value": 0.21341773867607117, "icl_sequence_length": 76, "num_contexts": 3, "step": 11800 }, { "grad_norm": 0.21220721304416656, "learning_rate": 0.00025332213022511476, "loss": 0.4095, "step": 11810 }, { "gate_value": 0.2137778401374817, "icl_sequence_length": 74, "num_contexts": 3, "step": 11810 }, { "grad_norm": 9.720185279846191, "learning_rate": 0.00025323219520204343, "loss": 0.3894, "step": 11820 }, { "gate_value": 0.21382419764995575, "icl_sequence_length": 74, "num_contexts": 3, "step": 11820 }, { "grad_norm": 0.1416504681110382, "learning_rate": 0.00025314218962074015, "loss": 0.3947, "step": 11830 }, { "gate_value": 0.21372593939304352, "icl_sequence_length": 82, "num_contexts": 3, "step": 11830 }, { "grad_norm": 0.2012588381767273, "learning_rate": 0.0002530521135427228, "loss": 0.4004, "step": 11840 }, { "gate_value": 0.21327029168605804, "icl_sequence_length": 72, "num_contexts": 3, "step": 11840 }, { "grad_norm": 0.40130773186683655, "learning_rate": 0.0002529619670295575, "loss": 0.4058, "step": 11850 }, { "gate_value": 0.21350201964378357, "icl_sequence_length": 66, "num_contexts": 3, "step": 11850 }, { "grad_norm": 3.3934764862060547, "learning_rate": 0.0002528717501428587, "loss": 0.4031, "step": 11860 }, { "gate_value": 0.21378423273563385, "icl_sequence_length": 82, "num_contexts": 3, "step": 11860 }, { "grad_norm": 0.17320500314235687, "learning_rate": 0.0002527814629442887, "loss": 0.389, "step": 11870 }, { "gate_value": 0.2139461636543274, "icl_sequence_length": 88, "num_contexts": 3, "step": 11870 }, { "grad_norm": 0.15953584015369415, "learning_rate": 0.0002526911054955579, "loss": 0.3925, "step": 11880 }, { "gate_value": 0.21486397087574005, "icl_sequence_length": 86, "num_contexts": 3, "step": 11880 }, { "grad_norm": 0.1063394844532013, "learning_rate": 0.00025260067785842484, "loss": 0.4014, "step": 11890 }, { "gate_value": 0.21554067730903625, "icl_sequence_length": 68, "num_contexts": 3, "step": 11890 }, { "grad_norm": 0.2693503499031067, "learning_rate": 0.00025251018009469594, "loss": 0.4011, "step": 11900 }, { "gate_value": 0.21639026701450348, "icl_sequence_length": 76, "num_contexts": 3, "step": 11900 }, { "grad_norm": 0.45159634947776794, "learning_rate": 0.00025241961226622555, "loss": 0.3985, "step": 11910 }, { "gate_value": 0.21661345660686493, "icl_sequence_length": 90, "num_contexts": 3, "step": 11910 }, { "grad_norm": 0.23357713222503662, "learning_rate": 0.00025232897443491596, "loss": 0.396, "step": 11920 }, { "gate_value": 0.21685311198234558, "icl_sequence_length": 84, "num_contexts": 3, "step": 11920 }, { "grad_norm": 0.15322349965572357, "learning_rate": 0.0002522382666627172, "loss": 0.4046, "step": 11930 }, { "gate_value": 0.21700112521648407, "icl_sequence_length": 72, "num_contexts": 3, "step": 11930 }, { "grad_norm": 0.12242075055837631, "learning_rate": 0.00025214748901162724, "loss": 0.3988, "step": 11940 }, { "gate_value": 0.21745328605175018, "icl_sequence_length": 86, "num_contexts": 3, "step": 11940 }, { "grad_norm": 0.18854005634784698, "learning_rate": 0.0002520566415436917, "loss": 0.3967, "step": 11950 }, { "gate_value": 0.21726283431053162, "icl_sequence_length": 74, "num_contexts": 3, "step": 11950 }, { "grad_norm": 0.16896556317806244, "learning_rate": 0.00025196572432100404, "loss": 0.4054, "step": 11960 }, { "gate_value": 0.2172841727733612, "icl_sequence_length": 66, "num_contexts": 3, "step": 11960 }, { "grad_norm": 0.15144844353199005, "learning_rate": 0.0002518747374057053, "loss": 0.415, "step": 11970 }, { "gate_value": 0.21710263192653656, "icl_sequence_length": 88, "num_contexts": 3, "step": 11970 }, { "grad_norm": 0.29553645849227905, "learning_rate": 0.00025178368085998417, "loss": 0.4001, "step": 11980 }, { "gate_value": 0.21663376688957214, "icl_sequence_length": 84, "num_contexts": 3, "step": 11980 }, { "grad_norm": 0.24101394414901733, "learning_rate": 0.0002516925547460769, "loss": 0.3992, "step": 11990 }, { "gate_value": 0.21583116054534912, "icl_sequence_length": 88, "num_contexts": 3, "step": 11990 }, { "grad_norm": 3.6199612617492676, "learning_rate": 0.00025160135912626736, "loss": 0.4078, "step": 12000 }, { "gate_value": 0.2161846160888672, "icl_sequence_length": 80, "num_contexts": 3, "step": 12000 }, { "grad_norm": 0.21872039139270782, "learning_rate": 0.0002515100940628869, "loss": 0.4022, "step": 12010 }, { "gate_value": 0.2160039097070694, "icl_sequence_length": 88, "num_contexts": 3, "step": 12010 }, { "grad_norm": 0.23018062114715576, "learning_rate": 0.0002514187596183144, "loss": 0.4129, "step": 12020 }, { "gate_value": 0.21576650440692902, "icl_sequence_length": 88, "num_contexts": 3, "step": 12020 }, { "grad_norm": 0.10655272006988525, "learning_rate": 0.00025132735585497594, "loss": 0.4108, "step": 12030 }, { "gate_value": 0.2153758853673935, "icl_sequence_length": 88, "num_contexts": 3, "step": 12030 }, { "grad_norm": 0.13647131621837616, "learning_rate": 0.00025123588283534524, "loss": 0.4082, "step": 12040 }, { "gate_value": 0.21577800810337067, "icl_sequence_length": 82, "num_contexts": 3, "step": 12040 }, { "grad_norm": 0.14735141396522522, "learning_rate": 0.0002511443406219432, "loss": 0.4017, "step": 12050 }, { "gate_value": 0.2160462588071823, "icl_sequence_length": 84, "num_contexts": 3, "step": 12050 }, { "grad_norm": 0.10175333172082901, "learning_rate": 0.00025105272927733815, "loss": 0.4152, "step": 12060 }, { "gate_value": 0.21647045016288757, "icl_sequence_length": 84, "num_contexts": 3, "step": 12060 }, { "grad_norm": 0.15681752562522888, "learning_rate": 0.00025096104886414543, "loss": 0.4076, "step": 12070 }, { "gate_value": 0.21629151701927185, "icl_sequence_length": 82, "num_contexts": 3, "step": 12070 }, { "grad_norm": 0.22178079187870026, "learning_rate": 0.0002508692994450279, "loss": 0.4194, "step": 12080 }, { "gate_value": 0.2159765064716339, "icl_sequence_length": 68, "num_contexts": 3, "step": 12080 }, { "grad_norm": 0.3628661334514618, "learning_rate": 0.00025077748108269526, "loss": 0.3933, "step": 12090 }, { "gate_value": 0.21603453159332275, "icl_sequence_length": 72, "num_contexts": 3, "step": 12090 }, { "grad_norm": 2.652317523956299, "learning_rate": 0.0002506855938399046, "loss": 0.4035, "step": 12100 }, { "gate_value": 0.21626484394073486, "icl_sequence_length": 94, "num_contexts": 3, "step": 12100 }, { "grad_norm": 0.2516089379787445, "learning_rate": 0.00025059363777946, "loss": 0.3986, "step": 12110 }, { "gate_value": 0.2169078141450882, "icl_sequence_length": 86, "num_contexts": 3, "step": 12110 }, { "grad_norm": 0.11790217459201813, "learning_rate": 0.0002505016129642125, "loss": 0.4076, "step": 12120 }, { "gate_value": 0.21709206700325012, "icl_sequence_length": 76, "num_contexts": 3, "step": 12120 }, { "grad_norm": 0.11641325056552887, "learning_rate": 0.00025040951945706015, "loss": 0.4049, "step": 12130 }, { "gate_value": 0.21792539954185486, "icl_sequence_length": 84, "num_contexts": 3, "step": 12130 }, { "grad_norm": 1.3086296319961548, "learning_rate": 0.0002503173573209481, "loss": 0.4068, "step": 12140 }, { "gate_value": 0.21753643453121185, "icl_sequence_length": 76, "num_contexts": 3, "step": 12140 }, { "grad_norm": 0.12861375510692596, "learning_rate": 0.0002502251266188683, "loss": 0.383, "step": 12150 }, { "gate_value": 0.21757358312606812, "icl_sequence_length": 94, "num_contexts": 3, "step": 12150 }, { "grad_norm": 0.15611277520656586, "learning_rate": 0.00025013282741385946, "loss": 0.4115, "step": 12160 }, { "gate_value": 0.21742816269397736, "icl_sequence_length": 86, "num_contexts": 3, "step": 12160 }, { "grad_norm": 6.722140789031982, "learning_rate": 0.0002500404597690073, "loss": 0.4, "step": 12170 }, { "gate_value": 0.2178168147802353, "icl_sequence_length": 62, "num_contexts": 3, "step": 12170 }, { "grad_norm": 0.666631281375885, "learning_rate": 0.00024994802374744417, "loss": 0.3926, "step": 12180 }, { "gate_value": 0.21850833296775818, "icl_sequence_length": 72, "num_contexts": 3, "step": 12180 }, { "grad_norm": 0.7619335651397705, "learning_rate": 0.00024985551941234934, "loss": 0.4156, "step": 12190 }, { "gate_value": 0.21924303472042084, "icl_sequence_length": 90, "num_contexts": 3, "step": 12190 }, { "grad_norm": 0.20830854773521423, "learning_rate": 0.00024976294682694855, "loss": 0.3696, "step": 12200 }, { "gate_value": 0.21952006220817566, "icl_sequence_length": 90, "num_contexts": 3, "step": 12200 }, { "grad_norm": 0.15673328936100006, "learning_rate": 0.00024967030605451426, "loss": 0.4022, "step": 12210 }, { "gate_value": 0.2195618897676468, "icl_sequence_length": 66, "num_contexts": 3, "step": 12210 }, { "grad_norm": 0.19766902923583984, "learning_rate": 0.0002495775971583657, "loss": 0.3935, "step": 12220 }, { "gate_value": 0.2193445861339569, "icl_sequence_length": 92, "num_contexts": 3, "step": 12220 }, { "grad_norm": 0.25193437933921814, "learning_rate": 0.0002494848202018684, "loss": 0.3898, "step": 12230 }, { "gate_value": 0.21949507296085358, "icl_sequence_length": 64, "num_contexts": 3, "step": 12230 }, { "grad_norm": 0.16779395937919617, "learning_rate": 0.0002493919752484346, "loss": 0.3802, "step": 12240 }, { "gate_value": 0.2198145091533661, "icl_sequence_length": 76, "num_contexts": 3, "step": 12240 }, { "grad_norm": 0.22382983565330505, "learning_rate": 0.0002492990623615229, "loss": 0.382, "step": 12250 }, { "gate_value": 0.22057458758354187, "icl_sequence_length": 80, "num_contexts": 3, "step": 12250 }, { "grad_norm": 0.19160112738609314, "learning_rate": 0.0002492060816046384, "loss": 0.4112, "step": 12260 }, { "gate_value": 0.22058996558189392, "icl_sequence_length": 66, "num_contexts": 3, "step": 12260 }, { "grad_norm": 0.20744547247886658, "learning_rate": 0.00024911303304133255, "loss": 0.4119, "step": 12270 }, { "gate_value": 0.2202199250459671, "icl_sequence_length": 86, "num_contexts": 3, "step": 12270 }, { "grad_norm": 1.4961268901824951, "learning_rate": 0.0002490199167352033, "loss": 0.4039, "step": 12280 }, { "gate_value": 0.2205304652452469, "icl_sequence_length": 76, "num_contexts": 3, "step": 12280 }, { "grad_norm": 0.15413504838943481, "learning_rate": 0.0002489267327498946, "loss": 0.4038, "step": 12290 }, { "gate_value": 0.2205338329076767, "icl_sequence_length": 70, "num_contexts": 3, "step": 12290 }, { "grad_norm": 0.41113772988319397, "learning_rate": 0.00024883348114909686, "loss": 0.402, "step": 12300 }, { "gate_value": 0.22082765400409698, "icl_sequence_length": 70, "num_contexts": 3, "step": 12300 }, { "grad_norm": 0.1423916518688202, "learning_rate": 0.0002487401619965467, "loss": 0.4016, "step": 12310 }, { "gate_value": 0.2202834039926529, "icl_sequence_length": 76, "num_contexts": 3, "step": 12310 }, { "grad_norm": 0.292427122592926, "learning_rate": 0.000248646775356027, "loss": 0.4115, "step": 12320 }, { "gate_value": 0.21988192200660706, "icl_sequence_length": 68, "num_contexts": 3, "step": 12320 }, { "grad_norm": 0.33503174781799316, "learning_rate": 0.0002485533212913664, "loss": 0.3795, "step": 12330 }, { "gate_value": 0.2196183055639267, "icl_sequence_length": 74, "num_contexts": 3, "step": 12330 }, { "grad_norm": 0.14144979417324066, "learning_rate": 0.0002484597998664401, "loss": 0.3932, "step": 12340 }, { "gate_value": 0.2193087488412857, "icl_sequence_length": 90, "num_contexts": 3, "step": 12340 }, { "grad_norm": 0.42851975560188293, "learning_rate": 0.00024836621114516887, "loss": 0.3965, "step": 12350 }, { "gate_value": 0.21940088272094727, "icl_sequence_length": 82, "num_contexts": 3, "step": 12350 }, { "grad_norm": 1.2762000560760498, "learning_rate": 0.00024827255519152, "loss": 0.3975, "step": 12360 }, { "gate_value": 0.2194403111934662, "icl_sequence_length": 80, "num_contexts": 3, "step": 12360 }, { "grad_norm": 0.374787837266922, "learning_rate": 0.0002481788320695062, "loss": 0.3782, "step": 12370 }, { "gate_value": 0.21983832120895386, "icl_sequence_length": 86, "num_contexts": 3, "step": 12370 }, { "grad_norm": 0.21959860622882843, "learning_rate": 0.0002480850418431865, "loss": 0.3915, "step": 12380 }, { "gate_value": 0.22001340985298157, "icl_sequence_length": 96, "num_contexts": 3, "step": 12380 }, { "grad_norm": 0.1599218249320984, "learning_rate": 0.0002479911845766656, "loss": 0.3897, "step": 12390 }, { "gate_value": 0.22048789262771606, "icl_sequence_length": 78, "num_contexts": 3, "step": 12390 }, { "grad_norm": 0.7569872736930847, "learning_rate": 0.00024789726033409403, "loss": 0.3922, "step": 12400 }, { "gate_value": 0.22119253873825073, "icl_sequence_length": 66, "num_contexts": 3, "step": 12400 }, { "grad_norm": 1.3088319301605225, "learning_rate": 0.0002478032691796682, "loss": 0.3877, "step": 12410 }, { "gate_value": 0.222129225730896, "icl_sequence_length": 86, "num_contexts": 3, "step": 12410 }, { "grad_norm": 0.1527448296546936, "learning_rate": 0.00024770921117763, "loss": 0.3947, "step": 12420 }, { "gate_value": 0.22193755209445953, "icl_sequence_length": 66, "num_contexts": 3, "step": 12420 }, { "grad_norm": 0.16166727244853973, "learning_rate": 0.0002476150863922674, "loss": 0.4036, "step": 12430 }, { "gate_value": 0.22141483426094055, "icl_sequence_length": 76, "num_contexts": 3, "step": 12430 }, { "grad_norm": 0.2394699603319168, "learning_rate": 0.00024752089488791365, "loss": 0.3781, "step": 12440 }, { "gate_value": 0.22163152694702148, "icl_sequence_length": 66, "num_contexts": 3, "step": 12440 }, { "grad_norm": 0.7809823155403137, "learning_rate": 0.00024742663672894786, "loss": 0.4088, "step": 12450 }, { "gate_value": 0.2219049334526062, "icl_sequence_length": 86, "num_contexts": 3, "step": 12450 }, { "grad_norm": 0.5379728674888611, "learning_rate": 0.00024733231197979444, "loss": 0.3875, "step": 12460 }, { "gate_value": 0.22181984782218933, "icl_sequence_length": 80, "num_contexts": 3, "step": 12460 }, { "grad_norm": 0.2976929247379303, "learning_rate": 0.0002472379207049237, "loss": 0.3876, "step": 12470 }, { "gate_value": 0.22250066697597504, "icl_sequence_length": 76, "num_contexts": 3, "step": 12470 }, { "grad_norm": 0.16642062366008759, "learning_rate": 0.000247143462968851, "loss": 0.4015, "step": 12480 }, { "gate_value": 0.2227054238319397, "icl_sequence_length": 94, "num_contexts": 3, "step": 12480 }, { "grad_norm": 0.15899008512496948, "learning_rate": 0.00024704893883613734, "loss": 0.3878, "step": 12490 }, { "gate_value": 0.22255997359752655, "icl_sequence_length": 90, "num_contexts": 3, "step": 12490 }, { "grad_norm": 0.8335736989974976, "learning_rate": 0.0002469543483713891, "loss": 0.3893, "step": 12500 }, { "gate_value": 0.22294364869594574, "icl_sequence_length": 92, "num_contexts": 3, "step": 12500 }, { "grad_norm": 0.48398110270500183, "learning_rate": 0.000246859691639258, "loss": 0.4259, "step": 12510 }, { "gate_value": 0.2238253355026245, "icl_sequence_length": 70, "num_contexts": 3, "step": 12510 }, { "grad_norm": 0.5804698467254639, "learning_rate": 0.00024676496870444105, "loss": 0.408, "step": 12520 }, { "gate_value": 0.22372859716415405, "icl_sequence_length": 68, "num_contexts": 3, "step": 12520 }, { "grad_norm": 0.1858420968055725, "learning_rate": 0.0002466701796316804, "loss": 0.3909, "step": 12530 }, { "gate_value": 0.22408469021320343, "icl_sequence_length": 88, "num_contexts": 3, "step": 12530 }, { "grad_norm": 0.28055644035339355, "learning_rate": 0.00024657532448576347, "loss": 0.3836, "step": 12540 }, { "gate_value": 0.22420431673526764, "icl_sequence_length": 72, "num_contexts": 3, "step": 12540 }, { "grad_norm": 0.36310645937919617, "learning_rate": 0.00024648040333152295, "loss": 0.3864, "step": 12550 }, { "gate_value": 0.2246132344007492, "icl_sequence_length": 86, "num_contexts": 3, "step": 12550 }, { "grad_norm": 0.22087214887142181, "learning_rate": 0.00024638541623383647, "loss": 0.3993, "step": 12560 }, { "gate_value": 0.22429001331329346, "icl_sequence_length": 88, "num_contexts": 3, "step": 12560 }, { "grad_norm": 0.13360650837421417, "learning_rate": 0.00024629036325762686, "loss": 0.3914, "step": 12570 }, { "gate_value": 0.223907932639122, "icl_sequence_length": 78, "num_contexts": 3, "step": 12570 }, { "grad_norm": 0.27683377265930176, "learning_rate": 0.00024619524446786197, "loss": 0.3823, "step": 12580 }, { "gate_value": 0.22451086342334747, "icl_sequence_length": 80, "num_contexts": 3, "step": 12580 }, { "grad_norm": 0.11729135364294052, "learning_rate": 0.0002461000599295545, "loss": 0.3952, "step": 12590 }, { "gate_value": 0.2250521332025528, "icl_sequence_length": 90, "num_contexts": 3, "step": 12590 }, { "grad_norm": 0.3065733015537262, "learning_rate": 0.00024600480970776224, "loss": 0.3835, "step": 12600 }, { "gate_value": 0.22507460415363312, "icl_sequence_length": 90, "num_contexts": 3, "step": 12600 }, { "grad_norm": 0.2557946741580963, "learning_rate": 0.0002459094938675879, "loss": 0.4028, "step": 12610 }, { "gate_value": 0.2248997837305069, "icl_sequence_length": 78, "num_contexts": 3, "step": 12610 }, { "grad_norm": 0.1608905792236328, "learning_rate": 0.0002458141124741788, "loss": 0.3926, "step": 12620 }, { "gate_value": 0.2250816822052002, "icl_sequence_length": 80, "num_contexts": 3, "step": 12620 }, { "grad_norm": 0.5290404558181763, "learning_rate": 0.00024571866559272733, "loss": 0.3991, "step": 12630 }, { "gate_value": 0.22497162222862244, "icl_sequence_length": 72, "num_contexts": 3, "step": 12630 }, { "grad_norm": 0.7313876748085022, "learning_rate": 0.00024562315328847045, "loss": 0.4006, "step": 12640 }, { "gate_value": 0.22498567402362823, "icl_sequence_length": 84, "num_contexts": 3, "step": 12640 }, { "grad_norm": 0.6579257845878601, "learning_rate": 0.00024552757562669, "loss": 0.4221, "step": 12650 }, { "gate_value": 0.22542236745357513, "icl_sequence_length": 94, "num_contexts": 3, "step": 12650 }, { "grad_norm": 0.2773542106151581, "learning_rate": 0.0002454319326727124, "loss": 0.4127, "step": 12660 }, { "gate_value": 0.22569654881954193, "icl_sequence_length": 80, "num_contexts": 3, "step": 12660 }, { "grad_norm": 0.6629268527030945, "learning_rate": 0.00024533622449190865, "loss": 0.3972, "step": 12670 }, { "gate_value": 0.22591601312160492, "icl_sequence_length": 76, "num_contexts": 3, "step": 12670 }, { "grad_norm": 0.5978025197982788, "learning_rate": 0.00024524045114969446, "loss": 0.4055, "step": 12680 }, { "gate_value": 0.225976824760437, "icl_sequence_length": 74, "num_contexts": 3, "step": 12680 }, { "grad_norm": 0.41180774569511414, "learning_rate": 0.00024514461271153, "loss": 0.3886, "step": 12690 }, { "gate_value": 0.22621405124664307, "icl_sequence_length": 76, "num_contexts": 3, "step": 12690 }, { "grad_norm": 0.18459030985832214, "learning_rate": 0.0002450487092429198, "loss": 0.3793, "step": 12700 }, { "gate_value": 0.22648906707763672, "icl_sequence_length": 90, "num_contexts": 3, "step": 12700 }, { "grad_norm": 0.16683566570281982, "learning_rate": 0.0002449527408094132, "loss": 0.391, "step": 12710 }, { "gate_value": 0.22698599100112915, "icl_sequence_length": 90, "num_contexts": 3, "step": 12710 }, { "grad_norm": 0.18806658685207367, "learning_rate": 0.0002448567074766035, "loss": 0.41, "step": 12720 }, { "gate_value": 0.2272697538137436, "icl_sequence_length": 82, "num_contexts": 3, "step": 12720 }, { "grad_norm": 0.189344123005867, "learning_rate": 0.00024476060931012884, "loss": 0.3958, "step": 12730 }, { "gate_value": 0.22789913415908813, "icl_sequence_length": 74, "num_contexts": 3, "step": 12730 }, { "grad_norm": 0.14973287284374237, "learning_rate": 0.00024466444637567114, "loss": 0.3955, "step": 12740 }, { "gate_value": 0.22827696800231934, "icl_sequence_length": 72, "num_contexts": 3, "step": 12740 }, { "grad_norm": 0.16878141462802887, "learning_rate": 0.000244568218738957, "loss": 0.3812, "step": 12750 }, { "gate_value": 0.228811576962471, "icl_sequence_length": 86, "num_contexts": 3, "step": 12750 }, { "grad_norm": 0.1252906322479248, "learning_rate": 0.0002444719264657571, "loss": 0.3787, "step": 12760 }, { "gate_value": 0.22924955189228058, "icl_sequence_length": 92, "num_contexts": 3, "step": 12760 }, { "grad_norm": 0.16972161829471588, "learning_rate": 0.0002443755696218862, "loss": 0.4027, "step": 12770 }, { "gate_value": 0.22910422086715698, "icl_sequence_length": 84, "num_contexts": 3, "step": 12770 }, { "grad_norm": 0.4904925227165222, "learning_rate": 0.0002442791482732034, "loss": 0.3838, "step": 12780 }, { "gate_value": 0.2290533185005188, "icl_sequence_length": 82, "num_contexts": 3, "step": 12780 }, { "grad_norm": 0.6321537494659424, "learning_rate": 0.0002441826624856118, "loss": 0.3995, "step": 12790 }, { "gate_value": 0.22894862294197083, "icl_sequence_length": 84, "num_contexts": 3, "step": 12790 }, { "grad_norm": 3.330693006515503, "learning_rate": 0.0002440861123250585, "loss": 0.4076, "step": 12800 }, { "gate_value": 0.2295045107603073, "icl_sequence_length": 80, "num_contexts": 3, "step": 12800 }, { "grad_norm": 0.434231698513031, "learning_rate": 0.00024398949785753453, "loss": 0.4006, "step": 12810 }, { "gate_value": 0.2292521893978119, "icl_sequence_length": 80, "num_contexts": 3, "step": 12810 }, { "grad_norm": 0.20848998427391052, "learning_rate": 0.00024389281914907507, "loss": 0.3867, "step": 12820 }, { "gate_value": 0.22858992218971252, "icl_sequence_length": 74, "num_contexts": 3, "step": 12820 }, { "grad_norm": 0.12240742146968842, "learning_rate": 0.00024379607626575912, "loss": 0.3894, "step": 12830 }, { "gate_value": 0.22902952134609222, "icl_sequence_length": 80, "num_contexts": 3, "step": 12830 }, { "grad_norm": 0.32847192883491516, "learning_rate": 0.00024369926927370945, "loss": 0.3937, "step": 12840 }, { "gate_value": 0.22911740839481354, "icl_sequence_length": 88, "num_contexts": 3, "step": 12840 }, { "grad_norm": 1.2735588550567627, "learning_rate": 0.0002436023982390928, "loss": 0.4043, "step": 12850 }, { "gate_value": 0.22957158088684082, "icl_sequence_length": 68, "num_contexts": 3, "step": 12850 }, { "grad_norm": 0.12431875616312027, "learning_rate": 0.0002435054632281195, "loss": 0.3846, "step": 12860 }, { "gate_value": 0.2296081781387329, "icl_sequence_length": 64, "num_contexts": 3, "step": 12860 }, { "grad_norm": 0.23901695013046265, "learning_rate": 0.00024340846430704382, "loss": 0.3835, "step": 12870 }, { "gate_value": 0.2295922040939331, "icl_sequence_length": 76, "num_contexts": 3, "step": 12870 }, { "grad_norm": 0.24101102352142334, "learning_rate": 0.00024331140154216358, "loss": 0.3951, "step": 12880 }, { "gate_value": 0.22967174649238586, "icl_sequence_length": 90, "num_contexts": 3, "step": 12880 }, { "grad_norm": 0.16511069238185883, "learning_rate": 0.00024321427499982026, "loss": 0.4028, "step": 12890 }, { "gate_value": 0.2295798808336258, "icl_sequence_length": 80, "num_contexts": 3, "step": 12890 }, { "grad_norm": 0.40296611189842224, "learning_rate": 0.00024311708474639891, "loss": 0.4025, "step": 12900 }, { "gate_value": 0.22963640093803406, "icl_sequence_length": 80, "num_contexts": 3, "step": 12900 }, { "grad_norm": 0.7211963534355164, "learning_rate": 0.00024301983084832826, "loss": 0.3905, "step": 12910 }, { "gate_value": 0.2297767698764801, "icl_sequence_length": 70, "num_contexts": 3, "step": 12910 }, { "grad_norm": 0.2979947030544281, "learning_rate": 0.00024292251337208027, "loss": 0.4067, "step": 12920 }, { "gate_value": 0.22896960377693176, "icl_sequence_length": 80, "num_contexts": 3, "step": 12920 }, { "grad_norm": 0.4544394016265869, "learning_rate": 0.0002428251323841706, "loss": 0.4026, "step": 12930 }, { "gate_value": 0.22909384965896606, "icl_sequence_length": 82, "num_contexts": 3, "step": 12930 }, { "grad_norm": 0.4741581380367279, "learning_rate": 0.0002427276879511583, "loss": 0.4034, "step": 12940 }, { "gate_value": 0.22910211980342865, "icl_sequence_length": 88, "num_contexts": 3, "step": 12940 }, { "grad_norm": 0.6591212749481201, "learning_rate": 0.00024263018013964558, "loss": 0.3913, "step": 12950 }, { "gate_value": 0.2297748625278473, "icl_sequence_length": 72, "num_contexts": 3, "step": 12950 }, { "grad_norm": 1.3386341333389282, "learning_rate": 0.0002425326090162782, "loss": 0.3971, "step": 12960 }, { "gate_value": 0.22955504059791565, "icl_sequence_length": 82, "num_contexts": 3, "step": 12960 }, { "grad_norm": 0.5584853887557983, "learning_rate": 0.00024243497464774514, "loss": 0.3836, "step": 12970 }, { "gate_value": 0.22991710901260376, "icl_sequence_length": 78, "num_contexts": 3, "step": 12970 }, { "grad_norm": 1.5411170721054077, "learning_rate": 0.00024233727710077843, "loss": 0.4148, "step": 12980 }, { "gate_value": 0.23056305944919586, "icl_sequence_length": 86, "num_contexts": 3, "step": 12980 }, { "grad_norm": 1.385258436203003, "learning_rate": 0.00024223951644215358, "loss": 0.3851, "step": 12990 }, { "gate_value": 0.23073600232601166, "icl_sequence_length": 94, "num_contexts": 3, "step": 12990 }, { "grad_norm": 4.856663227081299, "learning_rate": 0.000242141692738689, "loss": 0.3942, "step": 13000 }, { "gate_value": 0.2309921234846115, "icl_sequence_length": 78, "num_contexts": 3, "step": 13000 }, { "grad_norm": 1.7304902076721191, "learning_rate": 0.00024204380605724626, "loss": 0.397, "step": 13010 }, { "gate_value": 0.23110632598400116, "icl_sequence_length": 88, "num_contexts": 3, "step": 13010 }, { "grad_norm": 4.753711223602295, "learning_rate": 0.00024194585646473, "loss": 0.394, "step": 13020 }, { "gate_value": 0.2315414547920227, "icl_sequence_length": 82, "num_contexts": 3, "step": 13020 }, { "grad_norm": 3.1984050273895264, "learning_rate": 0.00024184784402808785, "loss": 0.3844, "step": 13030 }, { "gate_value": 0.2317209541797638, "icl_sequence_length": 78, "num_contexts": 3, "step": 13030 }, { "grad_norm": 4.3004302978515625, "learning_rate": 0.0002417497688143104, "loss": 0.3802, "step": 13040 }, { "gate_value": 0.23195067048072815, "icl_sequence_length": 86, "num_contexts": 3, "step": 13040 }, { "grad_norm": 9.027650833129883, "learning_rate": 0.0002416516308904311, "loss": 0.403, "step": 13050 }, { "gate_value": 0.2321426421403885, "icl_sequence_length": 94, "num_contexts": 3, "step": 13050 }, { "grad_norm": 3.0218982696533203, "learning_rate": 0.00024155343032352628, "loss": 0.3972, "step": 13060 }, { "gate_value": 0.23222358524799347, "icl_sequence_length": 86, "num_contexts": 3, "step": 13060 }, { "grad_norm": 2.7752718925476074, "learning_rate": 0.00024145516718071517, "loss": 0.3899, "step": 13070 }, { "gate_value": 0.2324615716934204, "icl_sequence_length": 82, "num_contexts": 3, "step": 13070 }, { "grad_norm": 2.805215358734131, "learning_rate": 0.00024135684152915964, "loss": 0.3856, "step": 13080 }, { "gate_value": 0.2327684760093689, "icl_sequence_length": 84, "num_contexts": 3, "step": 13080 }, { "grad_norm": 5.619499683380127, "learning_rate": 0.0002412584534360644, "loss": 0.3887, "step": 13090 }, { "gate_value": 0.23298239707946777, "icl_sequence_length": 78, "num_contexts": 3, "step": 13090 }, { "grad_norm": 8.072099685668945, "learning_rate": 0.0002411600029686767, "loss": 0.3859, "step": 13100 }, { "gate_value": 0.23314546048641205, "icl_sequence_length": 60, "num_contexts": 3, "step": 13100 }, { "grad_norm": 6.481606960296631, "learning_rate": 0.00024106149019428657, "loss": 0.4038, "step": 13110 }, { "gate_value": 0.23341412842273712, "icl_sequence_length": 68, "num_contexts": 3, "step": 13110 }, { "grad_norm": 77.76628875732422, "learning_rate": 0.0002409629151802266, "loss": 0.393, "step": 13120 }, { "gate_value": 0.23348568379878998, "icl_sequence_length": 58, "num_contexts": 3, "step": 13120 }, { "grad_norm": 2.652857780456543, "learning_rate": 0.00024086427799387182, "loss": 0.3835, "step": 13130 }, { "gate_value": 0.23361773788928986, "icl_sequence_length": 84, "num_contexts": 3, "step": 13130 }, { "grad_norm": 4.54749059677124, "learning_rate": 0.0002407655787026398, "loss": 0.4017, "step": 13140 }, { "gate_value": 0.23375937342643738, "icl_sequence_length": 58, "num_contexts": 3, "step": 13140 }, { "grad_norm": 6.540721893310547, "learning_rate": 0.00024066681737399062, "loss": 0.387, "step": 13150 }, { "gate_value": 0.23380054533481598, "icl_sequence_length": 78, "num_contexts": 3, "step": 13150 }, { "grad_norm": 5.220513820648193, "learning_rate": 0.00024056799407542667, "loss": 0.4035, "step": 13160 }, { "gate_value": 0.233912393450737, "icl_sequence_length": 76, "num_contexts": 3, "step": 13160 }, { "grad_norm": 2.220935583114624, "learning_rate": 0.00024046910887449283, "loss": 0.3829, "step": 13170 }, { "gate_value": 0.23434002697467804, "icl_sequence_length": 78, "num_contexts": 3, "step": 13170 }, { "grad_norm": 10.84092903137207, "learning_rate": 0.00024037016183877614, "loss": 0.3715, "step": 13180 }, { "gate_value": 0.23447051644325256, "icl_sequence_length": 82, "num_contexts": 3, "step": 13180 }, { "grad_norm": 2.110276699066162, "learning_rate": 0.0002402711530359059, "loss": 0.422, "step": 13190 }, { "gate_value": 0.23508982360363007, "icl_sequence_length": 90, "num_contexts": 3, "step": 13190 }, { "grad_norm": 2.608215093612671, "learning_rate": 0.00024017208253355383, "loss": 0.3955, "step": 13200 }, { "gate_value": 0.2356402426958084, "icl_sequence_length": 92, "num_contexts": 3, "step": 13200 }, { "grad_norm": 3.4447200298309326, "learning_rate": 0.0002400729503994336, "loss": 0.3674, "step": 13210 }, { "gate_value": 0.23609599471092224, "icl_sequence_length": 76, "num_contexts": 3, "step": 13210 }, { "grad_norm": 31.154781341552734, "learning_rate": 0.00023997375670130116, "loss": 0.376, "step": 13220 }, { "gate_value": 0.23642614483833313, "icl_sequence_length": 78, "num_contexts": 3, "step": 13220 }, { "grad_norm": 11.202011108398438, "learning_rate": 0.00023987450150695437, "loss": 0.396, "step": 13230 }, { "gate_value": 0.2366163730621338, "icl_sequence_length": 74, "num_contexts": 3, "step": 13230 }, { "grad_norm": 8.70836067199707, "learning_rate": 0.00023977518488423324, "loss": 0.3887, "step": 13240 }, { "gate_value": 0.23657521605491638, "icl_sequence_length": 70, "num_contexts": 3, "step": 13240 }, { "grad_norm": 4.990417957305908, "learning_rate": 0.0002396758069010198, "loss": 0.3666, "step": 13250 }, { "gate_value": 0.23652224242687225, "icl_sequence_length": 86, "num_contexts": 3, "step": 13250 }, { "grad_norm": 8.101316452026367, "learning_rate": 0.00023957636762523792, "loss": 0.3931, "step": 13260 }, { "gate_value": 0.23652319610118866, "icl_sequence_length": 96, "num_contexts": 3, "step": 13260 }, { "grad_norm": 11.491870880126953, "learning_rate": 0.00023947686712485347, "loss": 0.3816, "step": 13270 }, { "gate_value": 0.23662154376506805, "icl_sequence_length": 70, "num_contexts": 3, "step": 13270 }, { "grad_norm": 9.589191436767578, "learning_rate": 0.00023937730546787404, "loss": 0.3723, "step": 13280 }, { "gate_value": 0.23666512966156006, "icl_sequence_length": 64, "num_contexts": 3, "step": 13280 }, { "grad_norm": 9.990079879760742, "learning_rate": 0.00023927768272234907, "loss": 0.3981, "step": 13290 }, { "gate_value": 0.23673947155475616, "icl_sequence_length": 90, "num_contexts": 3, "step": 13290 }, { "grad_norm": 9.692089080810547, "learning_rate": 0.00023917799895636983, "loss": 0.396, "step": 13300 }, { "gate_value": 0.23682591319084167, "icl_sequence_length": 82, "num_contexts": 3, "step": 13300 }, { "grad_norm": 12.743815422058105, "learning_rate": 0.00023907825423806915, "loss": 0.3835, "step": 13310 }, { "gate_value": 0.23698779940605164, "icl_sequence_length": 74, "num_contexts": 3, "step": 13310 }, { "grad_norm": 18.46089744567871, "learning_rate": 0.00023897844863562175, "loss": 0.3947, "step": 13320 }, { "gate_value": 0.23710179328918457, "icl_sequence_length": 72, "num_contexts": 3, "step": 13320 }, { "grad_norm": 4.994933605194092, "learning_rate": 0.00023887858221724364, "loss": 0.3818, "step": 13330 }, { "gate_value": 0.23720552027225494, "icl_sequence_length": 92, "num_contexts": 3, "step": 13330 }, { "grad_norm": 33.670082092285156, "learning_rate": 0.00023877865505119266, "loss": 0.392, "step": 13340 }, { "gate_value": 0.23719537258148193, "icl_sequence_length": 66, "num_contexts": 3, "step": 13340 }, { "grad_norm": 15.782670974731445, "learning_rate": 0.00023867866720576813, "loss": 0.3922, "step": 13350 }, { "gate_value": 0.2372649610042572, "icl_sequence_length": 82, "num_contexts": 3, "step": 13350 }, { "grad_norm": 6.85189151763916, "learning_rate": 0.00023857861874931074, "loss": 0.3822, "step": 13360 }, { "gate_value": 0.2373196929693222, "icl_sequence_length": 82, "num_contexts": 3, "step": 13360 }, { "grad_norm": 14.476446151733398, "learning_rate": 0.00023847850975020266, "loss": 0.3954, "step": 13370 }, { "gate_value": 0.2373858541250229, "icl_sequence_length": 84, "num_contexts": 3, "step": 13370 }, { "grad_norm": 12.489557266235352, "learning_rate": 0.0002383783402768675, "loss": 0.3907, "step": 13380 }, { "gate_value": 0.2375316321849823, "icl_sequence_length": 78, "num_contexts": 3, "step": 13380 }, { "grad_norm": 6.788305759429932, "learning_rate": 0.0002382781103977701, "loss": 0.3866, "step": 13390 }, { "gate_value": 0.23768095672130585, "icl_sequence_length": 82, "num_contexts": 3, "step": 13390 }, { "grad_norm": 9.206138610839844, "learning_rate": 0.00023817782018141666, "loss": 0.4043, "step": 13400 }, { "gate_value": 0.23779208958148956, "icl_sequence_length": 88, "num_contexts": 3, "step": 13400 }, { "grad_norm": 20.340181350708008, "learning_rate": 0.0002380774696963546, "loss": 0.4027, "step": 13410 }, { "gate_value": 0.23787511885166168, "icl_sequence_length": 78, "num_contexts": 3, "step": 13410 }, { "grad_norm": 14.296485900878906, "learning_rate": 0.00023797705901117252, "loss": 0.3959, "step": 13420 }, { "gate_value": 0.23800590634346008, "icl_sequence_length": 76, "num_contexts": 3, "step": 13420 }, { "grad_norm": 9.20689868927002, "learning_rate": 0.00023787658819450017, "loss": 0.382, "step": 13430 }, { "gate_value": 0.2381312996149063, "icl_sequence_length": 82, "num_contexts": 3, "step": 13430 }, { "grad_norm": 11.133676528930664, "learning_rate": 0.0002377760573150084, "loss": 0.3873, "step": 13440 }, { "gate_value": 0.2381925731897354, "icl_sequence_length": 80, "num_contexts": 3, "step": 13440 }, { "grad_norm": 6.757209777832031, "learning_rate": 0.00023767546644140917, "loss": 0.3809, "step": 13450 }, { "gate_value": 0.2382330298423767, "icl_sequence_length": 80, "num_contexts": 3, "step": 13450 }, { "grad_norm": 9.891308784484863, "learning_rate": 0.00023757481564245535, "loss": 0.394, "step": 13460 }, { "gate_value": 0.23824086785316467, "icl_sequence_length": 76, "num_contexts": 3, "step": 13460 }, { "grad_norm": 9.862556457519531, "learning_rate": 0.0002374741049869408, "loss": 0.3918, "step": 13470 }, { "gate_value": 0.23823952674865723, "icl_sequence_length": 88, "num_contexts": 3, "step": 13470 }, { "grad_norm": 17.30132293701172, "learning_rate": 0.00023737333454370034, "loss": 0.4038, "step": 13480 }, { "gate_value": 0.2382458597421646, "icl_sequence_length": 84, "num_contexts": 3, "step": 13480 }, { "grad_norm": 14.025443077087402, "learning_rate": 0.00023727250438160957, "loss": 0.3877, "step": 13490 }, { "gate_value": 0.23827463388442993, "icl_sequence_length": 84, "num_contexts": 3, "step": 13490 }, { "grad_norm": 7.736385345458984, "learning_rate": 0.00023717161456958508, "loss": 0.3922, "step": 13500 }, { "gate_value": 0.23830415308475494, "icl_sequence_length": 76, "num_contexts": 3, "step": 13500 }, { "grad_norm": 10.48551082611084, "learning_rate": 0.00023707066517658393, "loss": 0.3671, "step": 13510 }, { "gate_value": 0.23844420909881592, "icl_sequence_length": 88, "num_contexts": 3, "step": 13510 }, { "grad_norm": 9.336437225341797, "learning_rate": 0.00023696965627160416, "loss": 0.3838, "step": 13520 }, { "gate_value": 0.23857417702674866, "icl_sequence_length": 78, "num_contexts": 3, "step": 13520 }, { "grad_norm": 10.722733497619629, "learning_rate": 0.0002368685879236844, "loss": 0.3806, "step": 13530 }, { "gate_value": 0.23865066468715668, "icl_sequence_length": 88, "num_contexts": 3, "step": 13530 }, { "grad_norm": 8.75007438659668, "learning_rate": 0.0002367674602019039, "loss": 0.4108, "step": 13540 }, { "gate_value": 0.23874539136886597, "icl_sequence_length": 86, "num_contexts": 3, "step": 13540 }, { "grad_norm": 8.046998023986816, "learning_rate": 0.00023666627317538258, "loss": 0.3837, "step": 13550 }, { "gate_value": 0.23880630731582642, "icl_sequence_length": 88, "num_contexts": 3, "step": 13550 }, { "grad_norm": 6.331174373626709, "learning_rate": 0.00023656502691328074, "loss": 0.3923, "step": 13560 }, { "gate_value": 0.23895002901554108, "icl_sequence_length": 88, "num_contexts": 3, "step": 13560 }, { "grad_norm": 10.85879898071289, "learning_rate": 0.00023646372148479925, "loss": 0.3662, "step": 13570 }, { "gate_value": 0.23905928432941437, "icl_sequence_length": 84, "num_contexts": 3, "step": 13570 }, { "grad_norm": 10.636667251586914, "learning_rate": 0.00023636235695917942, "loss": 0.4045, "step": 13580 }, { "gate_value": 0.23911528289318085, "icl_sequence_length": 76, "num_contexts": 3, "step": 13580 }, { "grad_norm": 14.647674560546875, "learning_rate": 0.00023626093340570298, "loss": 0.3841, "step": 13590 }, { "gate_value": 0.23917175829410553, "icl_sequence_length": 92, "num_contexts": 3, "step": 13590 }, { "grad_norm": 10.018962860107422, "learning_rate": 0.00023615945089369193, "loss": 0.3831, "step": 13600 }, { "gate_value": 0.2392829954624176, "icl_sequence_length": 78, "num_contexts": 3, "step": 13600 }, { "grad_norm": 11.435403823852539, "learning_rate": 0.00023605790949250864, "loss": 0.4096, "step": 13610 }, { "gate_value": 0.2394438236951828, "icl_sequence_length": 76, "num_contexts": 3, "step": 13610 }, { "grad_norm": 41.82572937011719, "learning_rate": 0.00023595630927155571, "loss": 0.3759, "step": 13620 }, { "gate_value": 0.23965805768966675, "icl_sequence_length": 66, "num_contexts": 3, "step": 13620 }, { "grad_norm": 6.350934028625488, "learning_rate": 0.00023585465030027586, "loss": 0.3888, "step": 13630 }, { "gate_value": 0.23977549374103546, "icl_sequence_length": 80, "num_contexts": 3, "step": 13630 }, { "grad_norm": 9.555365562438965, "learning_rate": 0.00023575293264815214, "loss": 0.3868, "step": 13640 }, { "gate_value": 0.239946648478508, "icl_sequence_length": 92, "num_contexts": 3, "step": 13640 }, { "grad_norm": 15.217024803161621, "learning_rate": 0.00023565115638470754, "loss": 0.3924, "step": 13650 }, { "gate_value": 0.2401033490896225, "icl_sequence_length": 60, "num_contexts": 3, "step": 13650 }, { "grad_norm": 8.90522289276123, "learning_rate": 0.00023554932157950518, "loss": 0.3688, "step": 13660 }, { "gate_value": 0.24017292261123657, "icl_sequence_length": 90, "num_contexts": 3, "step": 13660 }, { "grad_norm": 11.551718711853027, "learning_rate": 0.00023544742830214823, "loss": 0.3912, "step": 13670 }, { "gate_value": 0.24020229279994965, "icl_sequence_length": 78, "num_contexts": 3, "step": 13670 }, { "grad_norm": 8.776484489440918, "learning_rate": 0.0002353454766222797, "loss": 0.3941, "step": 13680 }, { "gate_value": 0.24032677710056305, "icl_sequence_length": 70, "num_contexts": 3, "step": 13680 }, { "grad_norm": 3.0297889709472656, "learning_rate": 0.00023524346660958273, "loss": 0.3864, "step": 13690 }, { "gate_value": 0.2403283268213272, "icl_sequence_length": 92, "num_contexts": 3, "step": 13690 }, { "grad_norm": 2.8323323726654053, "learning_rate": 0.0002351413983337801, "loss": 0.3771, "step": 13700 }, { "gate_value": 0.2404557168483734, "icl_sequence_length": 76, "num_contexts": 3, "step": 13700 }, { "grad_norm": 4.921107292175293, "learning_rate": 0.00023503927186463455, "loss": 0.4035, "step": 13710 }, { "gate_value": 0.24044989049434662, "icl_sequence_length": 86, "num_contexts": 3, "step": 13710 }, { "grad_norm": 4.018657207489014, "learning_rate": 0.00023493708727194854, "loss": 0.3895, "step": 13720 }, { "gate_value": 0.2406502068042755, "icl_sequence_length": 88, "num_contexts": 3, "step": 13720 }, { "grad_norm": 3.5907695293426514, "learning_rate": 0.00023483484462556427, "loss": 0.3844, "step": 13730 }, { "gate_value": 0.2406596541404724, "icl_sequence_length": 80, "num_contexts": 3, "step": 13730 }, { "grad_norm": 4.6112847328186035, "learning_rate": 0.00023473254399536368, "loss": 0.3948, "step": 13740 }, { "gate_value": 0.24075298011302948, "icl_sequence_length": 76, "num_contexts": 3, "step": 13740 }, { "grad_norm": 7.318800449371338, "learning_rate": 0.00023463018545126827, "loss": 0.3905, "step": 13750 }, { "gate_value": 0.2410333752632141, "icl_sequence_length": 86, "num_contexts": 3, "step": 13750 }, { "grad_norm": 4.03450345993042, "learning_rate": 0.00023452776906323906, "loss": 0.3951, "step": 13760 }, { "gate_value": 0.24121364951133728, "icl_sequence_length": 86, "num_contexts": 3, "step": 13760 }, { "grad_norm": 4.004604339599609, "learning_rate": 0.00023442529490127678, "loss": 0.3899, "step": 13770 }, { "gate_value": 0.241162970662117, "icl_sequence_length": 86, "num_contexts": 3, "step": 13770 }, { "grad_norm": 3.408446788787842, "learning_rate": 0.00023432276303542152, "loss": 0.3875, "step": 13780 }, { "gate_value": 0.24171173572540283, "icl_sequence_length": 76, "num_contexts": 3, "step": 13780 }, { "grad_norm": 0.6727409362792969, "learning_rate": 0.0002342201735357528, "loss": 0.3736, "step": 13790 }, { "gate_value": 0.24206897616386414, "icl_sequence_length": 72, "num_contexts": 3, "step": 13790 }, { "grad_norm": 0.4120021462440491, "learning_rate": 0.00023411752647238963, "loss": 0.379, "step": 13800 }, { "gate_value": 0.242006316781044, "icl_sequence_length": 68, "num_contexts": 3, "step": 13800 }, { "grad_norm": 0.5434548854827881, "learning_rate": 0.00023401482191549034, "loss": 0.4013, "step": 13810 }, { "gate_value": 0.24193528294563293, "icl_sequence_length": 88, "num_contexts": 3, "step": 13810 }, { "grad_norm": 0.6041681170463562, "learning_rate": 0.00023391205993525245, "loss": 0.4044, "step": 13820 }, { "gate_value": 0.2423665076494217, "icl_sequence_length": 82, "num_contexts": 3, "step": 13820 }, { "grad_norm": 0.38581350445747375, "learning_rate": 0.00023380924060191287, "loss": 0.4103, "step": 13830 }, { "gate_value": 0.24196967482566833, "icl_sequence_length": 78, "num_contexts": 3, "step": 13830 }, { "grad_norm": 0.5824718475341797, "learning_rate": 0.00023370636398574758, "loss": 0.3836, "step": 13840 }, { "gate_value": 0.24174365401268005, "icl_sequence_length": 74, "num_contexts": 3, "step": 13840 }, { "grad_norm": 0.7611846923828125, "learning_rate": 0.0002336034301570718, "loss": 0.3934, "step": 13850 }, { "gate_value": 0.24081198871135712, "icl_sequence_length": 88, "num_contexts": 3, "step": 13850 }, { "grad_norm": 0.7086225748062134, "learning_rate": 0.00023350043918623982, "loss": 0.4015, "step": 13860 }, { "gate_value": 0.24044911563396454, "icl_sequence_length": 84, "num_contexts": 3, "step": 13860 }, { "grad_norm": 1.1706241369247437, "learning_rate": 0.00023339739114364508, "loss": 0.3975, "step": 13870 }, { "gate_value": 0.2400846630334854, "icl_sequence_length": 90, "num_contexts": 3, "step": 13870 }, { "grad_norm": 9.380261421203613, "learning_rate": 0.00023329428609971986, "loss": 0.3988, "step": 13880 }, { "gate_value": 0.23952551186084747, "icl_sequence_length": 84, "num_contexts": 3, "step": 13880 }, { "grad_norm": 4.3653388023376465, "learning_rate": 0.00023319112412493553, "loss": 0.3991, "step": 13890 }, { "gate_value": 0.23941710591316223, "icl_sequence_length": 72, "num_contexts": 3, "step": 13890 }, { "grad_norm": 2.4812557697296143, "learning_rate": 0.00023308790528980226, "loss": 0.4141, "step": 13900 }, { "gate_value": 0.23994140326976776, "icl_sequence_length": 80, "num_contexts": 3, "step": 13900 }, { "grad_norm": 3.448402166366577, "learning_rate": 0.00023298462966486923, "loss": 0.41, "step": 13910 }, { "gate_value": 0.2402760088443756, "icl_sequence_length": 90, "num_contexts": 3, "step": 13910 }, { "grad_norm": 1.7383501529693604, "learning_rate": 0.00023288129732072432, "loss": 0.3945, "step": 13920 }, { "gate_value": 0.24017523229122162, "icl_sequence_length": 90, "num_contexts": 3, "step": 13920 }, { "grad_norm": 0.8303579688072205, "learning_rate": 0.00023277790832799418, "loss": 0.4153, "step": 13930 }, { "gate_value": 0.2399144023656845, "icl_sequence_length": 94, "num_contexts": 3, "step": 13930 }, { "grad_norm": 1.4897278547286987, "learning_rate": 0.00023267446275734431, "loss": 0.389, "step": 13940 }, { "gate_value": 0.23930053412914276, "icl_sequence_length": 74, "num_contexts": 3, "step": 13940 }, { "grad_norm": 1.5729742050170898, "learning_rate": 0.00023257096067947868, "loss": 0.3811, "step": 13950 }, { "gate_value": 0.23936836421489716, "icl_sequence_length": 84, "num_contexts": 3, "step": 13950 }, { "grad_norm": 0.9085593223571777, "learning_rate": 0.00023246740216513998, "loss": 0.3947, "step": 13960 }, { "gate_value": 0.2395535558462143, "icl_sequence_length": 82, "num_contexts": 3, "step": 13960 }, { "grad_norm": 1.2284908294677734, "learning_rate": 0.00023236378728510963, "loss": 0.3951, "step": 13970 }, { "gate_value": 0.23942606151103973, "icl_sequence_length": 70, "num_contexts": 3, "step": 13970 }, { "grad_norm": 0.8122304677963257, "learning_rate": 0.00023226011611020723, "loss": 0.3864, "step": 13980 }, { "gate_value": 0.23953235149383545, "icl_sequence_length": 78, "num_contexts": 3, "step": 13980 }, { "grad_norm": 0.6110461354255676, "learning_rate": 0.00023215638871129115, "loss": 0.3924, "step": 13990 }, { "gate_value": 0.23886938393115997, "icl_sequence_length": 76, "num_contexts": 3, "step": 13990 }, { "grad_norm": 0.8406757116317749, "learning_rate": 0.00023205260515925808, "loss": 0.4018, "step": 14000 }, { "gate_value": 0.2380830943584442, "icl_sequence_length": 74, "num_contexts": 3, "step": 14000 }, { "grad_norm": 0.534526526927948, "learning_rate": 0.0002319487655250431, "loss": 0.4073, "step": 14010 }, { "gate_value": 0.23780196905136108, "icl_sequence_length": 90, "num_contexts": 3, "step": 14010 }, { "grad_norm": 14.408823013305664, "learning_rate": 0.00023184486987961963, "loss": 0.3853, "step": 14020 }, { "gate_value": 0.23794253170490265, "icl_sequence_length": 68, "num_contexts": 3, "step": 14020 }, { "grad_norm": 6.213502883911133, "learning_rate": 0.0002317409182939993, "loss": 0.3991, "step": 14030 }, { "gate_value": 0.23781372606754303, "icl_sequence_length": 90, "num_contexts": 3, "step": 14030 }, { "grad_norm": 1.1910241842269897, "learning_rate": 0.00023163691083923212, "loss": 0.3973, "step": 14040 }, { "gate_value": 0.2376827746629715, "icl_sequence_length": 82, "num_contexts": 3, "step": 14040 }, { "grad_norm": 2.642822265625, "learning_rate": 0.00023153284758640618, "loss": 0.3936, "step": 14050 }, { "gate_value": 0.23790773749351501, "icl_sequence_length": 80, "num_contexts": 3, "step": 14050 }, { "grad_norm": 1.310566782951355, "learning_rate": 0.00023142872860664775, "loss": 0.4136, "step": 14060 }, { "gate_value": 0.23784318566322327, "icl_sequence_length": 70, "num_contexts": 3, "step": 14060 }, { "grad_norm": 4.528188705444336, "learning_rate": 0.00023132455397112107, "loss": 0.3759, "step": 14070 }, { "gate_value": 0.23753774166107178, "icl_sequence_length": 86, "num_contexts": 3, "step": 14070 }, { "grad_norm": 2.330179214477539, "learning_rate": 0.00023122032375102862, "loss": 0.4058, "step": 14080 }, { "gate_value": 0.23672199249267578, "icl_sequence_length": 88, "num_contexts": 3, "step": 14080 }, { "grad_norm": 3.0592496395111084, "learning_rate": 0.00023111603801761075, "loss": 0.3816, "step": 14090 }, { "gate_value": 0.23660390079021454, "icl_sequence_length": 50, "num_contexts": 3, "step": 14090 }, { "grad_norm": 1.2959048748016357, "learning_rate": 0.00023101169684214577, "loss": 0.3854, "step": 14100 }, { "gate_value": 0.2372078150510788, "icl_sequence_length": 88, "num_contexts": 3, "step": 14100 }, { "grad_norm": 2.996217727661133, "learning_rate": 0.00023090730029594995, "loss": 0.3806, "step": 14110 }, { "gate_value": 0.237883061170578, "icl_sequence_length": 72, "num_contexts": 3, "step": 14110 }, { "grad_norm": 29.50691032409668, "learning_rate": 0.0002308028484503772, "loss": 0.3709, "step": 14120 }, { "gate_value": 0.23824338614940643, "icl_sequence_length": 80, "num_contexts": 3, "step": 14120 }, { "grad_norm": 6.587845325469971, "learning_rate": 0.00023069834137681952, "loss": 0.3881, "step": 14130 }, { "gate_value": 0.23884567618370056, "icl_sequence_length": 92, "num_contexts": 3, "step": 14130 }, { "grad_norm": 0.7628917694091797, "learning_rate": 0.0002305937791467064, "loss": 0.3745, "step": 14140 }, { "gate_value": 0.2393723726272583, "icl_sequence_length": 76, "num_contexts": 3, "step": 14140 }, { "grad_norm": 1.8146260976791382, "learning_rate": 0.00023048916183150524, "loss": 0.4034, "step": 14150 }, { "gate_value": 0.23927997052669525, "icl_sequence_length": 86, "num_contexts": 3, "step": 14150 }, { "grad_norm": 0.6181547045707703, "learning_rate": 0.0002303844895027209, "loss": 0.3794, "step": 14160 }, { "gate_value": 0.23931756615638733, "icl_sequence_length": 74, "num_contexts": 3, "step": 14160 }, { "grad_norm": 1.4100581407546997, "learning_rate": 0.000230279762231896, "loss": 0.409, "step": 14170 }, { "gate_value": 0.23971882462501526, "icl_sequence_length": 90, "num_contexts": 3, "step": 14170 }, { "grad_norm": 4.569934844970703, "learning_rate": 0.00023017498009061057, "loss": 0.3979, "step": 14180 }, { "gate_value": 0.23882213234901428, "icl_sequence_length": 92, "num_contexts": 3, "step": 14180 }, { "grad_norm": 0.9258241057395935, "learning_rate": 0.0002300701431504823, "loss": 0.4028, "step": 14190 }, { "gate_value": 0.2383560687303543, "icl_sequence_length": 86, "num_contexts": 3, "step": 14190 }, { "grad_norm": 0.9763109683990479, "learning_rate": 0.00022996525148316616, "loss": 0.3873, "step": 14200 }, { "gate_value": 0.23882944881916046, "icl_sequence_length": 72, "num_contexts": 3, "step": 14200 }, { "grad_norm": 0.7269139289855957, "learning_rate": 0.0002298603051603547, "loss": 0.4011, "step": 14210 }, { "gate_value": 0.23907381296157837, "icl_sequence_length": 84, "num_contexts": 3, "step": 14210 }, { "grad_norm": 1.4872263669967651, "learning_rate": 0.00022975530425377763, "loss": 0.4069, "step": 14220 }, { "gate_value": 0.2386438399553299, "icl_sequence_length": 86, "num_contexts": 3, "step": 14220 }, { "grad_norm": 1.6720694303512573, "learning_rate": 0.00022965024883520217, "loss": 0.4069, "step": 14230 }, { "gate_value": 0.23835961520671844, "icl_sequence_length": 82, "num_contexts": 3, "step": 14230 }, { "grad_norm": 14.838658332824707, "learning_rate": 0.00022954513897643274, "loss": 0.3907, "step": 14240 }, { "gate_value": 0.238117977976799, "icl_sequence_length": 70, "num_contexts": 3, "step": 14240 }, { "grad_norm": 1.1093040704727173, "learning_rate": 0.00022943997474931087, "loss": 0.4086, "step": 14250 }, { "gate_value": 0.23805883526802063, "icl_sequence_length": 88, "num_contexts": 3, "step": 14250 }, { "grad_norm": 1.4715149402618408, "learning_rate": 0.0002293347562257153, "loss": 0.3885, "step": 14260 }, { "gate_value": 0.23897786438465118, "icl_sequence_length": 88, "num_contexts": 3, "step": 14260 }, { "grad_norm": 1.3662744760513306, "learning_rate": 0.00022922948347756195, "loss": 0.3885, "step": 14270 }, { "gate_value": 0.23961474001407623, "icl_sequence_length": 82, "num_contexts": 3, "step": 14270 }, { "grad_norm": 1.8294868469238281, "learning_rate": 0.00022912415657680375, "loss": 0.3966, "step": 14280 }, { "gate_value": 0.23970316350460052, "icl_sequence_length": 84, "num_contexts": 3, "step": 14280 }, { "grad_norm": 1.3246426582336426, "learning_rate": 0.00022901877559543057, "loss": 0.3829, "step": 14290 }, { "gate_value": 0.24013105034828186, "icl_sequence_length": 92, "num_contexts": 3, "step": 14290 }, { "grad_norm": 0.7515550851821899, "learning_rate": 0.00022891334060546947, "loss": 0.3869, "step": 14300 }, { "gate_value": 0.24048610031604767, "icl_sequence_length": 74, "num_contexts": 3, "step": 14300 }, { "grad_norm": 1.6856255531311035, "learning_rate": 0.00022880785167898407, "loss": 0.4136, "step": 14310 }, { "gate_value": 0.24040848016738892, "icl_sequence_length": 74, "num_contexts": 3, "step": 14310 }, { "grad_norm": 2.9111411571502686, "learning_rate": 0.0002287023088880752, "loss": 0.404, "step": 14320 }, { "gate_value": 0.2407020479440689, "icl_sequence_length": 74, "num_contexts": 3, "step": 14320 }, { "grad_norm": 0.9571368098258972, "learning_rate": 0.00022859671230488033, "loss": 0.4, "step": 14330 }, { "gate_value": 0.24106580018997192, "icl_sequence_length": 84, "num_contexts": 3, "step": 14330 }, { "grad_norm": 0.6420993208885193, "learning_rate": 0.00022849106200157373, "loss": 0.3871, "step": 14340 }, { "gate_value": 0.24104709923267365, "icl_sequence_length": 78, "num_contexts": 3, "step": 14340 }, { "grad_norm": 1.0583370923995972, "learning_rate": 0.0002283853580503664, "loss": 0.4126, "step": 14350 }, { "gate_value": 0.24134980142116547, "icl_sequence_length": 84, "num_contexts": 3, "step": 14350 }, { "grad_norm": 0.8332363367080688, "learning_rate": 0.00022827960052350594, "loss": 0.3982, "step": 14360 }, { "gate_value": 0.24138779938220978, "icl_sequence_length": 64, "num_contexts": 3, "step": 14360 }, { "grad_norm": 28.765117645263672, "learning_rate": 0.0002281737894932766, "loss": 0.3802, "step": 14370 }, { "gate_value": 0.24165429174900055, "icl_sequence_length": 78, "num_contexts": 3, "step": 14370 }, { "grad_norm": 2.636730670928955, "learning_rate": 0.00022806792503199936, "loss": 0.3944, "step": 14380 }, { "gate_value": 0.24217766523361206, "icl_sequence_length": 74, "num_contexts": 3, "step": 14380 }, { "grad_norm": 2.2959983348846436, "learning_rate": 0.0002279620072120315, "loss": 0.3728, "step": 14390 }, { "gate_value": 0.24180297553539276, "icl_sequence_length": 82, "num_contexts": 3, "step": 14390 }, { "grad_norm": 1.3253600597381592, "learning_rate": 0.0002278560361057668, "loss": 0.3904, "step": 14400 }, { "gate_value": 0.24161909520626068, "icl_sequence_length": 82, "num_contexts": 3, "step": 14400 }, { "grad_norm": 0.8932808637619019, "learning_rate": 0.00022775001178563557, "loss": 0.3762, "step": 14410 }, { "gate_value": 0.2418455183506012, "icl_sequence_length": 74, "num_contexts": 3, "step": 14410 }, { "grad_norm": 13.018226623535156, "learning_rate": 0.00022764393432410442, "loss": 0.3948, "step": 14420 }, { "gate_value": 0.24179722368717194, "icl_sequence_length": 90, "num_contexts": 3, "step": 14420 }, { "grad_norm": 0.9938327074050903, "learning_rate": 0.00022753780379367633, "loss": 0.3786, "step": 14430 }, { "gate_value": 0.24247105419635773, "icl_sequence_length": 82, "num_contexts": 3, "step": 14430 }, { "grad_norm": 1.7252559661865234, "learning_rate": 0.00022743162026689047, "loss": 0.3905, "step": 14440 }, { "gate_value": 0.24302411079406738, "icl_sequence_length": 62, "num_contexts": 3, "step": 14440 }, { "grad_norm": 1.7536218166351318, "learning_rate": 0.0002273253838163223, "loss": 0.3863, "step": 14450 }, { "gate_value": 0.24294431507587433, "icl_sequence_length": 88, "num_contexts": 3, "step": 14450 }, { "grad_norm": 1.1929810047149658, "learning_rate": 0.0002272190945145834, "loss": 0.3985, "step": 14460 }, { "gate_value": 0.24252630770206451, "icl_sequence_length": 94, "num_contexts": 3, "step": 14460 }, { "grad_norm": 1.075748085975647, "learning_rate": 0.00022711275243432154, "loss": 0.39, "step": 14470 }, { "gate_value": 0.24282178282737732, "icl_sequence_length": 88, "num_contexts": 3, "step": 14470 }, { "grad_norm": 4.662461757659912, "learning_rate": 0.00022700635764822058, "loss": 0.3893, "step": 14480 }, { "gate_value": 0.24319680035114288, "icl_sequence_length": 84, "num_contexts": 3, "step": 14480 }, { "grad_norm": 1.0762276649475098, "learning_rate": 0.00022689991022900022, "loss": 0.3821, "step": 14490 }, { "gate_value": 0.2434968203306198, "icl_sequence_length": 90, "num_contexts": 3, "step": 14490 }, { "grad_norm": 1.5702511072158813, "learning_rate": 0.00022679341024941632, "loss": 0.3778, "step": 14500 }, { "gate_value": 0.24319276213645935, "icl_sequence_length": 66, "num_contexts": 3, "step": 14500 }, { "grad_norm": 0.9678138494491577, "learning_rate": 0.00022668685778226073, "loss": 0.3871, "step": 14510 }, { "gate_value": 0.24318233132362366, "icl_sequence_length": 84, "num_contexts": 3, "step": 14510 }, { "grad_norm": 1.7996389865875244, "learning_rate": 0.00022658025290036085, "loss": 0.3944, "step": 14520 }, { "gate_value": 0.2435857653617859, "icl_sequence_length": 86, "num_contexts": 3, "step": 14520 }, { "grad_norm": 0.8995399475097656, "learning_rate": 0.00022647359567658034, "loss": 0.4001, "step": 14530 }, { "gate_value": 0.24373860657215118, "icl_sequence_length": 74, "num_contexts": 3, "step": 14530 }, { "grad_norm": 1.5024384260177612, "learning_rate": 0.0002263668861838182, "loss": 0.3874, "step": 14540 }, { "gate_value": 0.24341940879821777, "icl_sequence_length": 74, "num_contexts": 3, "step": 14540 }, { "grad_norm": 0.7691788077354431, "learning_rate": 0.00022626012449500945, "loss": 0.3854, "step": 14550 }, { "gate_value": 0.24297001957893372, "icl_sequence_length": 78, "num_contexts": 3, "step": 14550 }, { "grad_norm": 1.1324009895324707, "learning_rate": 0.00022615331068312472, "loss": 0.3833, "step": 14560 }, { "gate_value": 0.24257968366146088, "icl_sequence_length": 74, "num_contexts": 3, "step": 14560 }, { "grad_norm": 1.6810340881347656, "learning_rate": 0.00022604644482117028, "loss": 0.4013, "step": 14570 }, { "gate_value": 0.24228624999523163, "icl_sequence_length": 80, "num_contexts": 3, "step": 14570 }, { "grad_norm": 0.5641115307807922, "learning_rate": 0.00022593952698218782, "loss": 0.3679, "step": 14580 }, { "gate_value": 0.24178647994995117, "icl_sequence_length": 82, "num_contexts": 3, "step": 14580 }, { "grad_norm": 0.9644331932067871, "learning_rate": 0.00022583255723925471, "loss": 0.3939, "step": 14590 }, { "gate_value": 0.24127645790576935, "icl_sequence_length": 66, "num_contexts": 3, "step": 14590 }, { "grad_norm": 1.2938753366470337, "learning_rate": 0.00022572553566548378, "loss": 0.3957, "step": 14600 }, { "gate_value": 0.24074934422969818, "icl_sequence_length": 64, "num_contexts": 3, "step": 14600 }, { "grad_norm": 1.338955044746399, "learning_rate": 0.00022561846233402333, "loss": 0.4033, "step": 14610 }, { "gate_value": 0.24045029282569885, "icl_sequence_length": 64, "num_contexts": 3, "step": 14610 }, { "grad_norm": 1.5236607789993286, "learning_rate": 0.00022551133731805689, "loss": 0.3936, "step": 14620 }, { "gate_value": 0.24116292595863342, "icl_sequence_length": 78, "num_contexts": 3, "step": 14620 }, { "grad_norm": 2.066084146499634, "learning_rate": 0.00022540416069080342, "loss": 0.3808, "step": 14630 }, { "gate_value": 0.24113567173480988, "icl_sequence_length": 86, "num_contexts": 3, "step": 14630 }, { "grad_norm": 1.0546592473983765, "learning_rate": 0.00022529693252551714, "loss": 0.4019, "step": 14640 }, { "gate_value": 0.24161110818386078, "icl_sequence_length": 90, "num_contexts": 3, "step": 14640 }, { "grad_norm": 1.3510944843292236, "learning_rate": 0.0002251896528954875, "loss": 0.3956, "step": 14650 }, { "gate_value": 0.24217374622821808, "icl_sequence_length": 80, "num_contexts": 3, "step": 14650 }, { "grad_norm": 262.3678894042969, "learning_rate": 0.00022508232187403907, "loss": 0.4029, "step": 14660 }, { "gate_value": 0.24230490624904633, "icl_sequence_length": 78, "num_contexts": 3, "step": 14660 }, { "grad_norm": 10.194805145263672, "learning_rate": 0.00022497493953453165, "loss": 0.3908, "step": 14670 }, { "gate_value": 0.2426171898841858, "icl_sequence_length": 84, "num_contexts": 3, "step": 14670 }, { "grad_norm": 0.7869942784309387, "learning_rate": 0.00022486750595036005, "loss": 0.3942, "step": 14680 }, { "gate_value": 0.24261200428009033, "icl_sequence_length": 82, "num_contexts": 3, "step": 14680 }, { "grad_norm": 0.7607384920120239, "learning_rate": 0.00022476002119495403, "loss": 0.3809, "step": 14690 }, { "gate_value": 0.24294787645339966, "icl_sequence_length": 80, "num_contexts": 3, "step": 14690 }, { "grad_norm": 1.007214069366455, "learning_rate": 0.00022465248534177848, "loss": 0.4001, "step": 14700 }, { "gate_value": 0.24369680881500244, "icl_sequence_length": 94, "num_contexts": 3, "step": 14700 }, { "grad_norm": 0.6694496273994446, "learning_rate": 0.0002245448984643332, "loss": 0.3956, "step": 14710 }, { "gate_value": 0.24375005066394806, "icl_sequence_length": 88, "num_contexts": 3, "step": 14710 }, { "grad_norm": 8.913844108581543, "learning_rate": 0.00022443726063615265, "loss": 0.401, "step": 14720 }, { "gate_value": 0.24351546168327332, "icl_sequence_length": 80, "num_contexts": 3, "step": 14720 }, { "grad_norm": 12.980989456176758, "learning_rate": 0.00022432957193080643, "loss": 0.3934, "step": 14730 }, { "gate_value": 0.2434246689081192, "icl_sequence_length": 66, "num_contexts": 3, "step": 14730 }, { "grad_norm": 7.73109769821167, "learning_rate": 0.00022422183242189862, "loss": 0.3741, "step": 14740 }, { "gate_value": 0.24368853867053986, "icl_sequence_length": 92, "num_contexts": 3, "step": 14740 }, { "grad_norm": 1.094784140586853, "learning_rate": 0.0002241140421830682, "loss": 0.3947, "step": 14750 }, { "gate_value": 0.24333494901657104, "icl_sequence_length": 86, "num_contexts": 3, "step": 14750 }, { "grad_norm": 56.325748443603516, "learning_rate": 0.00022400620128798892, "loss": 0.3731, "step": 14760 }, { "gate_value": 0.24329960346221924, "icl_sequence_length": 82, "num_contexts": 3, "step": 14760 }, { "grad_norm": 1.4292678833007812, "learning_rate": 0.00022389830981036878, "loss": 0.3852, "step": 14770 }, { "gate_value": 0.24320080876350403, "icl_sequence_length": 82, "num_contexts": 3, "step": 14770 }, { "grad_norm": 11.566863059997559, "learning_rate": 0.00022379036782395074, "loss": 0.3927, "step": 14780 }, { "gate_value": 0.24328622221946716, "icl_sequence_length": 94, "num_contexts": 3, "step": 14780 }, { "grad_norm": 1.4415967464447021, "learning_rate": 0.00022368237540251209, "loss": 0.3929, "step": 14790 }, { "gate_value": 0.2440831959247589, "icl_sequence_length": 90, "num_contexts": 3, "step": 14790 }, { "grad_norm": 2.255908489227295, "learning_rate": 0.0002235743326198646, "loss": 0.3876, "step": 14800 }, { "gate_value": 0.24463777244091034, "icl_sequence_length": 78, "num_contexts": 3, "step": 14800 }, { "grad_norm": 1.6364030838012695, "learning_rate": 0.00022346623954985463, "loss": 0.3858, "step": 14810 }, { "gate_value": 0.24514533579349518, "icl_sequence_length": 84, "num_contexts": 3, "step": 14810 }, { "grad_norm": 2.745298147201538, "learning_rate": 0.00022335809626636264, "loss": 0.3967, "step": 14820 }, { "gate_value": 0.2454182356595993, "icl_sequence_length": 68, "num_contexts": 3, "step": 14820 }, { "grad_norm": 0.6744495630264282, "learning_rate": 0.00022324990284330355, "loss": 0.3865, "step": 14830 }, { "gate_value": 0.2454521507024765, "icl_sequence_length": 92, "num_contexts": 3, "step": 14830 }, { "grad_norm": 2.1479763984680176, "learning_rate": 0.00022314165935462656, "loss": 0.3919, "step": 14840 }, { "gate_value": 0.2457962930202484, "icl_sequence_length": 80, "num_contexts": 3, "step": 14840 }, { "grad_norm": 1.0453277826309204, "learning_rate": 0.0002230333658743151, "loss": 0.3881, "step": 14850 }, { "gate_value": 0.24663123488426208, "icl_sequence_length": 72, "num_contexts": 3, "step": 14850 }, { "grad_norm": 13.712295532226562, "learning_rate": 0.00022292502247638673, "loss": 0.3828, "step": 14860 }, { "gate_value": 0.24684672057628632, "icl_sequence_length": 92, "num_contexts": 3, "step": 14860 }, { "grad_norm": 0.5274834036827087, "learning_rate": 0.00022281662923489312, "loss": 0.3867, "step": 14870 }, { "gate_value": 0.24730166792869568, "icl_sequence_length": 92, "num_contexts": 3, "step": 14870 }, { "grad_norm": 0.4449861943721771, "learning_rate": 0.0002227081862239201, "loss": 0.4015, "step": 14880 }, { "gate_value": 0.24659566581249237, "icl_sequence_length": 94, "num_contexts": 3, "step": 14880 }, { "grad_norm": 0.8005648255348206, "learning_rate": 0.00022259969351758733, "loss": 0.3941, "step": 14890 }, { "gate_value": 0.2455216944217682, "icl_sequence_length": 74, "num_contexts": 3, "step": 14890 }, { "grad_norm": 0.32042214274406433, "learning_rate": 0.00022249115119004863, "loss": 0.398, "step": 14900 }, { "gate_value": 0.24477018415927887, "icl_sequence_length": 76, "num_contexts": 3, "step": 14900 }, { "grad_norm": 0.7770922780036926, "learning_rate": 0.00022238255931549168, "loss": 0.4056, "step": 14910 }, { "gate_value": 0.24532420933246613, "icl_sequence_length": 86, "num_contexts": 3, "step": 14910 }, { "grad_norm": 0.9371281266212463, "learning_rate": 0.00022227391796813794, "loss": 0.3916, "step": 14920 }, { "gate_value": 0.24590061604976654, "icl_sequence_length": 84, "num_contexts": 3, "step": 14920 }, { "grad_norm": 3.6227617263793945, "learning_rate": 0.00022216522722224278, "loss": 0.3933, "step": 14930 }, { "gate_value": 0.24611833691596985, "icl_sequence_length": 76, "num_contexts": 3, "step": 14930 }, { "grad_norm": 59.824222564697266, "learning_rate": 0.00022205648715209526, "loss": 0.3817, "step": 14940 }, { "gate_value": 0.24605101346969604, "icl_sequence_length": 78, "num_contexts": 3, "step": 14940 }, { "grad_norm": 4.043715000152588, "learning_rate": 0.00022194769783201828, "loss": 0.3987, "step": 14950 }, { "gate_value": 0.2458629310131073, "icl_sequence_length": 82, "num_contexts": 3, "step": 14950 }, { "grad_norm": 1.7102625370025635, "learning_rate": 0.0002218388593363682, "loss": 0.3887, "step": 14960 }, { "gate_value": 0.24561689794063568, "icl_sequence_length": 90, "num_contexts": 3, "step": 14960 }, { "grad_norm": 61.252071380615234, "learning_rate": 0.00022172997173953518, "loss": 0.3995, "step": 14970 }, { "gate_value": 0.24532824754714966, "icl_sequence_length": 84, "num_contexts": 3, "step": 14970 }, { "grad_norm": 0.8813376426696777, "learning_rate": 0.0002216210351159429, "loss": 0.3859, "step": 14980 }, { "gate_value": 0.24525928497314453, "icl_sequence_length": 82, "num_contexts": 3, "step": 14980 }, { "grad_norm": 1.0708917379379272, "learning_rate": 0.0002215120495400484, "loss": 0.4059, "step": 14990 }, { "gate_value": 0.2450612485408783, "icl_sequence_length": 86, "num_contexts": 3, "step": 14990 }, { "grad_norm": 2.3246612548828125, "learning_rate": 0.00022140301508634237, "loss": 0.3872, "step": 15000 }, { "gate_value": 0.24598005414009094, "icl_sequence_length": 84, "num_contexts": 3, "step": 15000 }, { "grad_norm": 1.221426010131836, "learning_rate": 0.00022129393182934883, "loss": 0.4096, "step": 15010 }, { "gate_value": 0.2468685358762741, "icl_sequence_length": 74, "num_contexts": 3, "step": 15010 }, { "grad_norm": 0.46905654668807983, "learning_rate": 0.00022118479984362512, "loss": 0.39, "step": 15020 }, { "gate_value": 0.24734513461589813, "icl_sequence_length": 80, "num_contexts": 3, "step": 15020 }, { "grad_norm": 204.0078582763672, "learning_rate": 0.00022107561920376202, "loss": 0.4064, "step": 15030 }, { "gate_value": 0.24712851643562317, "icl_sequence_length": 70, "num_contexts": 3, "step": 15030 }, { "grad_norm": 0.45268166065216064, "learning_rate": 0.00022096638998438334, "loss": 0.3833, "step": 15040 }, { "gate_value": 0.24720291793346405, "icl_sequence_length": 92, "num_contexts": 3, "step": 15040 }, { "grad_norm": 0.8164011240005493, "learning_rate": 0.00022085711226014625, "loss": 0.3996, "step": 15050 }, { "gate_value": 0.24735890328884125, "icl_sequence_length": 82, "num_contexts": 3, "step": 15050 }, { "grad_norm": 1.2292507886886597, "learning_rate": 0.00022074778610574114, "loss": 0.3666, "step": 15060 }, { "gate_value": 0.24747858941555023, "icl_sequence_length": 82, "num_contexts": 3, "step": 15060 }, { "grad_norm": 1.1117497682571411, "learning_rate": 0.0002206384115958913, "loss": 0.3836, "step": 15070 }, { "gate_value": 0.24780391156673431, "icl_sequence_length": 86, "num_contexts": 3, "step": 15070 }, { "grad_norm": 1.9108260869979858, "learning_rate": 0.00022052898880535324, "loss": 0.3846, "step": 15080 }, { "gate_value": 0.2478286623954773, "icl_sequence_length": 88, "num_contexts": 3, "step": 15080 }, { "grad_norm": 2.420485496520996, "learning_rate": 0.00022041951780891637, "loss": 0.3976, "step": 15090 }, { "gate_value": 0.24756808578968048, "icl_sequence_length": 72, "num_contexts": 3, "step": 15090 }, { "grad_norm": 1.190596342086792, "learning_rate": 0.00022030999868140306, "loss": 0.3996, "step": 15100 }, { "gate_value": 0.24758541584014893, "icl_sequence_length": 86, "num_contexts": 3, "step": 15100 }, { "grad_norm": 1.1695483922958374, "learning_rate": 0.00022020043149766872, "loss": 0.3699, "step": 15110 }, { "gate_value": 0.2479713410139084, "icl_sequence_length": 68, "num_contexts": 3, "step": 15110 }, { "grad_norm": 2.384326219558716, "learning_rate": 0.0002200908163326013, "loss": 0.3778, "step": 15120 }, { "gate_value": 0.2486722320318222, "icl_sequence_length": 84, "num_contexts": 3, "step": 15120 }, { "grad_norm": 6.0441484451293945, "learning_rate": 0.0002199811532611219, "loss": 0.3925, "step": 15130 }, { "gate_value": 0.24903367459774017, "icl_sequence_length": 82, "num_contexts": 3, "step": 15130 }, { "grad_norm": 1.6270265579223633, "learning_rate": 0.0002198714423581841, "loss": 0.4015, "step": 15140 }, { "gate_value": 0.2497212141752243, "icl_sequence_length": 62, "num_contexts": 3, "step": 15140 }, { "grad_norm": 0.7187557220458984, "learning_rate": 0.00021976168369877428, "loss": 0.3847, "step": 15150 }, { "gate_value": 0.24981430172920227, "icl_sequence_length": 74, "num_contexts": 3, "step": 15150 }, { "grad_norm": 1.3020397424697876, "learning_rate": 0.00021965187735791154, "loss": 0.393, "step": 15160 }, { "gate_value": 0.24998848140239716, "icl_sequence_length": 82, "num_contexts": 3, "step": 15160 }, { "grad_norm": 0.6040678024291992, "learning_rate": 0.00021954202341064731, "loss": 0.3774, "step": 15170 }, { "gate_value": 0.25018632411956787, "icl_sequence_length": 78, "num_contexts": 3, "step": 15170 }, { "grad_norm": 0.56707763671875, "learning_rate": 0.00021943212193206588, "loss": 0.3768, "step": 15180 }, { "gate_value": 0.2501216530799866, "icl_sequence_length": 86, "num_contexts": 3, "step": 15180 }, { "grad_norm": 0.6855562925338745, "learning_rate": 0.00021932217299728383, "loss": 0.4046, "step": 15190 }, { "gate_value": 0.2500430643558502, "icl_sequence_length": 70, "num_contexts": 3, "step": 15190 }, { "grad_norm": 0.48342952132225037, "learning_rate": 0.00021921217668145014, "loss": 0.3892, "step": 15200 }, { "gate_value": 0.24985510110855103, "icl_sequence_length": 70, "num_contexts": 3, "step": 15200 }, { "grad_norm": 0.8365819454193115, "learning_rate": 0.00021910213305974637, "loss": 0.3907, "step": 15210 }, { "gate_value": 0.2495216578245163, "icl_sequence_length": 82, "num_contexts": 3, "step": 15210 }, { "grad_norm": 7.897146224975586, "learning_rate": 0.0002189920422073862, "loss": 0.3848, "step": 15220 }, { "gate_value": 0.25015580654144287, "icl_sequence_length": 84, "num_contexts": 3, "step": 15220 }, { "grad_norm": 3.6526150703430176, "learning_rate": 0.00021888190419961582, "loss": 0.4047, "step": 15230 }, { "gate_value": 0.2501993775367737, "icl_sequence_length": 70, "num_contexts": 3, "step": 15230 }, { "grad_norm": 0.5569013357162476, "learning_rate": 0.00021877171911171338, "loss": 0.3841, "step": 15240 }, { "gate_value": 0.25032347440719604, "icl_sequence_length": 94, "num_contexts": 3, "step": 15240 }, { "grad_norm": 1.0121521949768066, "learning_rate": 0.00021866148701898939, "loss": 0.4137, "step": 15250 }, { "gate_value": 0.25046205520629883, "icl_sequence_length": 94, "num_contexts": 3, "step": 15250 }, { "grad_norm": 80.40043640136719, "learning_rate": 0.0002185512079967865, "loss": 0.3993, "step": 15260 }, { "gate_value": 0.25006845593452454, "icl_sequence_length": 80, "num_contexts": 3, "step": 15260 }, { "grad_norm": 1.2732210159301758, "learning_rate": 0.00021844088212047934, "loss": 0.3928, "step": 15270 }, { "gate_value": 0.24979396164417267, "icl_sequence_length": 82, "num_contexts": 3, "step": 15270 }, { "grad_norm": 0.9375239610671997, "learning_rate": 0.0002183305094654746, "loss": 0.3938, "step": 15280 }, { "gate_value": 0.250060111284256, "icl_sequence_length": 92, "num_contexts": 3, "step": 15280 }, { "grad_norm": 0.3681797981262207, "learning_rate": 0.00021822009010721095, "loss": 0.3745, "step": 15290 }, { "gate_value": 0.2503737509250641, "icl_sequence_length": 68, "num_contexts": 3, "step": 15290 }, { "grad_norm": 0.4525548219680786, "learning_rate": 0.000218109624121159, "loss": 0.3909, "step": 15300 }, { "gate_value": 0.25037622451782227, "icl_sequence_length": 86, "num_contexts": 3, "step": 15300 }, { "grad_norm": 29.581342697143555, "learning_rate": 0.0002179991115828212, "loss": 0.3881, "step": 15310 }, { "gate_value": 0.25151196122169495, "icl_sequence_length": 78, "num_contexts": 3, "step": 15310 }, { "grad_norm": 25.658153533935547, "learning_rate": 0.00021788855256773182, "loss": 0.4066, "step": 15320 }, { "gate_value": 0.25173476338386536, "icl_sequence_length": 76, "num_contexts": 3, "step": 15320 }, { "grad_norm": 0.3691151738166809, "learning_rate": 0.0002177779471514569, "loss": 0.4099, "step": 15330 }, { "gate_value": 0.25139957666397095, "icl_sequence_length": 78, "num_contexts": 3, "step": 15330 }, { "grad_norm": 0.46884021162986755, "learning_rate": 0.00021766729540959422, "loss": 0.4095, "step": 15340 }, { "gate_value": 0.2511599361896515, "icl_sequence_length": 92, "num_contexts": 3, "step": 15340 }, { "grad_norm": 2.712085723876953, "learning_rate": 0.00021755659741777317, "loss": 0.3866, "step": 15350 }, { "gate_value": 0.25105735659599304, "icl_sequence_length": 72, "num_contexts": 3, "step": 15350 }, { "grad_norm": 0.5205731987953186, "learning_rate": 0.00021744585325165485, "loss": 0.3945, "step": 15360 }, { "gate_value": 0.25111421942710876, "icl_sequence_length": 88, "num_contexts": 3, "step": 15360 }, { "grad_norm": 0.41241544485092163, "learning_rate": 0.00021733506298693178, "loss": 0.4053, "step": 15370 }, { "gate_value": 0.25125807523727417, "icl_sequence_length": 84, "num_contexts": 3, "step": 15370 }, { "grad_norm": 0.6068854331970215, "learning_rate": 0.0002172242266993281, "loss": 0.3962, "step": 15380 }, { "gate_value": 0.25145697593688965, "icl_sequence_length": 80, "num_contexts": 3, "step": 15380 }, { "grad_norm": 0.8114979267120361, "learning_rate": 0.00021711334446459937, "loss": 0.3839, "step": 15390 }, { "gate_value": 0.25126177072525024, "icl_sequence_length": 86, "num_contexts": 3, "step": 15390 }, { "grad_norm": 0.5185739398002625, "learning_rate": 0.0002170024163585325, "loss": 0.3781, "step": 15400 }, { "gate_value": 0.2513514757156372, "icl_sequence_length": 80, "num_contexts": 3, "step": 15400 }, { "grad_norm": 8.948464393615723, "learning_rate": 0.0002168914424569459, "loss": 0.3842, "step": 15410 }, { "gate_value": 0.251672625541687, "icl_sequence_length": 90, "num_contexts": 3, "step": 15410 }, { "grad_norm": 0.726407527923584, "learning_rate": 0.0002167804228356891, "loss": 0.3908, "step": 15420 }, { "gate_value": 0.2515237033367157, "icl_sequence_length": 70, "num_contexts": 3, "step": 15420 }, { "grad_norm": 0.4296571910381317, "learning_rate": 0.00021666935757064294, "loss": 0.3717, "step": 15430 }, { "gate_value": 0.2517387866973877, "icl_sequence_length": 68, "num_contexts": 3, "step": 15430 }, { "grad_norm": 0.547008216381073, "learning_rate": 0.00021655824673771963, "loss": 0.3755, "step": 15440 }, { "gate_value": 0.2511221468448639, "icl_sequence_length": 84, "num_contexts": 3, "step": 15440 }, { "grad_norm": 4.1485819816589355, "learning_rate": 0.0002164470904128622, "loss": 0.3919, "step": 15450 }, { "gate_value": 0.2507605254650116, "icl_sequence_length": 76, "num_contexts": 3, "step": 15450 }, { "grad_norm": 1.476075291633606, "learning_rate": 0.00021633588867204509, "loss": 0.3869, "step": 15460 }, { "gate_value": 0.2507340610027313, "icl_sequence_length": 68, "num_contexts": 3, "step": 15460 }, { "grad_norm": 0.7980257272720337, "learning_rate": 0.0002162246415912736, "loss": 0.4104, "step": 15470 }, { "gate_value": 0.2506262958049774, "icl_sequence_length": 86, "num_contexts": 3, "step": 15470 }, { "grad_norm": 0.5964367389678955, "learning_rate": 0.00021611334924658397, "loss": 0.39, "step": 15480 }, { "gate_value": 0.25122764706611633, "icl_sequence_length": 86, "num_contexts": 3, "step": 15480 }, { "grad_norm": 0.5874263048171997, "learning_rate": 0.00021600201171404358, "loss": 0.369, "step": 15490 }, { "gate_value": 0.251434862613678, "icl_sequence_length": 90, "num_contexts": 3, "step": 15490 }, { "grad_norm": 0.9638623595237732, "learning_rate": 0.00021589062906975055, "loss": 0.3913, "step": 15500 }, { "gate_value": 0.2517266571521759, "icl_sequence_length": 76, "num_contexts": 3, "step": 15500 }, { "grad_norm": 0.5481185913085938, "learning_rate": 0.00021577920138983383, "loss": 0.3991, "step": 15510 }, { "gate_value": 0.2520568072795868, "icl_sequence_length": 64, "num_contexts": 3, "step": 15510 }, { "grad_norm": 2.838137626647949, "learning_rate": 0.00021566772875045327, "loss": 0.4051, "step": 15520 }, { "gate_value": 0.2513696253299713, "icl_sequence_length": 94, "num_contexts": 3, "step": 15520 }, { "grad_norm": 6.476634502410889, "learning_rate": 0.00021555621122779927, "loss": 0.3831, "step": 15530 }, { "gate_value": 0.2510931193828583, "icl_sequence_length": 90, "num_contexts": 3, "step": 15530 }, { "grad_norm": 0.6045856475830078, "learning_rate": 0.00021544464889809307, "loss": 0.3815, "step": 15540 }, { "gate_value": 0.25079429149627686, "icl_sequence_length": 66, "num_contexts": 3, "step": 15540 }, { "grad_norm": 9.019379615783691, "learning_rate": 0.0002153330418375865, "loss": 0.4041, "step": 15550 }, { "gate_value": 0.2513893246650696, "icl_sequence_length": 74, "num_contexts": 3, "step": 15550 }, { "grad_norm": 0.8856226205825806, "learning_rate": 0.0002152213901225618, "loss": 0.3808, "step": 15560 }, { "gate_value": 0.251968115568161, "icl_sequence_length": 82, "num_contexts": 3, "step": 15560 }, { "grad_norm": 0.6020224690437317, "learning_rate": 0.00021510969382933204, "loss": 0.3901, "step": 15570 }, { "gate_value": 0.25232499837875366, "icl_sequence_length": 76, "num_contexts": 3, "step": 15570 }, { "grad_norm": 1.5310420989990234, "learning_rate": 0.00021499795303424045, "loss": 0.4009, "step": 15580 }, { "gate_value": 0.2531309723854065, "icl_sequence_length": 74, "num_contexts": 3, "step": 15580 }, { "grad_norm": 0.6574874520301819, "learning_rate": 0.00021488616781366088, "loss": 0.3933, "step": 15590 }, { "gate_value": 0.25358569622039795, "icl_sequence_length": 76, "num_contexts": 3, "step": 15590 }, { "grad_norm": 1.205522894859314, "learning_rate": 0.00021477433824399741, "loss": 0.3957, "step": 15600 }, { "gate_value": 0.25376975536346436, "icl_sequence_length": 74, "num_contexts": 3, "step": 15600 }, { "grad_norm": 2.3468384742736816, "learning_rate": 0.00021466246440168457, "loss": 0.3819, "step": 15610 }, { "gate_value": 0.2539609670639038, "icl_sequence_length": 74, "num_contexts": 3, "step": 15610 }, { "grad_norm": 0.36947518587112427, "learning_rate": 0.00021455054636318702, "loss": 0.3828, "step": 15620 }, { "gate_value": 0.2545165717601776, "icl_sequence_length": 76, "num_contexts": 3, "step": 15620 }, { "grad_norm": 0.6467231512069702, "learning_rate": 0.0002144385842049997, "loss": 0.3631, "step": 15630 }, { "gate_value": 0.2550152540206909, "icl_sequence_length": 82, "num_contexts": 3, "step": 15630 }, { "grad_norm": 0.6928891539573669, "learning_rate": 0.00021432657800364775, "loss": 0.4008, "step": 15640 }, { "gate_value": 0.254954069852829, "icl_sequence_length": 84, "num_contexts": 3, "step": 15640 }, { "grad_norm": 1.5424365997314453, "learning_rate": 0.00021421452783568624, "loss": 0.3866, "step": 15650 }, { "gate_value": 0.25487643480300903, "icl_sequence_length": 82, "num_contexts": 3, "step": 15650 }, { "grad_norm": 1.0675183534622192, "learning_rate": 0.00021410243377770048, "loss": 0.3751, "step": 15660 }, { "gate_value": 0.2548693120479584, "icl_sequence_length": 82, "num_contexts": 3, "step": 15660 }, { "grad_norm": 2.203230381011963, "learning_rate": 0.00021399029590630567, "loss": 0.3958, "step": 15670 }, { "gate_value": 0.2552475035190582, "icl_sequence_length": 74, "num_contexts": 3, "step": 15670 }, { "grad_norm": 3.342008590698242, "learning_rate": 0.000213878114298147, "loss": 0.3727, "step": 15680 }, { "gate_value": 0.255521297454834, "icl_sequence_length": 66, "num_contexts": 3, "step": 15680 }, { "grad_norm": 20.107084274291992, "learning_rate": 0.00021376588902989962, "loss": 0.3893, "step": 15690 }, { "gate_value": 0.25595200061798096, "icl_sequence_length": 76, "num_contexts": 3, "step": 15690 }, { "grad_norm": 0.9243622422218323, "learning_rate": 0.00021365362017826826, "loss": 0.3972, "step": 15700 }, { "gate_value": 0.25568291544914246, "icl_sequence_length": 94, "num_contexts": 3, "step": 15700 }, { "grad_norm": 0.8758662343025208, "learning_rate": 0.00021354130781998774, "loss": 0.4018, "step": 15710 }, { "gate_value": 0.25568267703056335, "icl_sequence_length": 74, "num_contexts": 3, "step": 15710 }, { "grad_norm": 13.960478782653809, "learning_rate": 0.00021342895203182256, "loss": 0.3883, "step": 15720 }, { "gate_value": 0.25582268834114075, "icl_sequence_length": 86, "num_contexts": 3, "step": 15720 }, { "grad_norm": 0.7308502793312073, "learning_rate": 0.00021331655289056668, "loss": 0.3832, "step": 15730 }, { "gate_value": 0.2554386258125305, "icl_sequence_length": 76, "num_contexts": 3, "step": 15730 }, { "grad_norm": 0.8558282256126404, "learning_rate": 0.00021320411047304398, "loss": 0.3748, "step": 15740 }, { "gate_value": 0.25477343797683716, "icl_sequence_length": 64, "num_contexts": 3, "step": 15740 }, { "grad_norm": 0.4697778820991516, "learning_rate": 0.00021309162485610774, "loss": 0.3853, "step": 15750 }, { "gate_value": 0.2549998462200165, "icl_sequence_length": 86, "num_contexts": 3, "step": 15750 }, { "grad_norm": 19.340211868286133, "learning_rate": 0.00021297909611664085, "loss": 0.3793, "step": 15760 }, { "gate_value": 0.2552267909049988, "icl_sequence_length": 76, "num_contexts": 3, "step": 15760 }, { "grad_norm": 0.7915621995925903, "learning_rate": 0.0002128665243315556, "loss": 0.4153, "step": 15770 }, { "gate_value": 0.2552363872528076, "icl_sequence_length": 62, "num_contexts": 3, "step": 15770 }, { "grad_norm": 1.852100133895874, "learning_rate": 0.00021275390957779377, "loss": 0.3861, "step": 15780 }, { "gate_value": 0.2549734115600586, "icl_sequence_length": 70, "num_contexts": 3, "step": 15780 }, { "grad_norm": 1.6746331453323364, "learning_rate": 0.0002126412519323265, "loss": 0.3867, "step": 15790 }, { "gate_value": 0.2554560899734497, "icl_sequence_length": 70, "num_contexts": 3, "step": 15790 }, { "grad_norm": 0.7904180288314819, "learning_rate": 0.00021252855147215415, "loss": 0.396, "step": 15800 }, { "gate_value": 0.25506383180618286, "icl_sequence_length": 76, "num_contexts": 3, "step": 15800 }, { "grad_norm": 11.795361518859863, "learning_rate": 0.0002124158082743065, "loss": 0.4056, "step": 15810 }, { "gate_value": 0.2555553913116455, "icl_sequence_length": 88, "num_contexts": 3, "step": 15810 }, { "grad_norm": 0.7169960737228394, "learning_rate": 0.0002123030224158425, "loss": 0.3954, "step": 15820 }, { "gate_value": 0.2563459873199463, "icl_sequence_length": 68, "num_contexts": 3, "step": 15820 }, { "grad_norm": 0.35485365986824036, "learning_rate": 0.0002121901939738501, "loss": 0.3916, "step": 15830 }, { "gate_value": 0.2569548785686493, "icl_sequence_length": 68, "num_contexts": 3, "step": 15830 }, { "grad_norm": 0.3839747905731201, "learning_rate": 0.00021207732302544656, "loss": 0.3911, "step": 15840 }, { "gate_value": 0.257243812084198, "icl_sequence_length": 68, "num_contexts": 3, "step": 15840 }, { "grad_norm": 0.42446404695510864, "learning_rate": 0.00021196440964777808, "loss": 0.3931, "step": 15850 }, { "gate_value": 0.25690987706184387, "icl_sequence_length": 90, "num_contexts": 3, "step": 15850 }, { "grad_norm": 0.5473038554191589, "learning_rate": 0.00021185145391801989, "loss": 0.3968, "step": 15860 }, { "gate_value": 0.25695231556892395, "icl_sequence_length": 94, "num_contexts": 3, "step": 15860 }, { "grad_norm": 2.4800989627838135, "learning_rate": 0.00021173845591337614, "loss": 0.3932, "step": 15870 }, { "gate_value": 0.2575494349002838, "icl_sequence_length": 80, "num_contexts": 3, "step": 15870 }, { "grad_norm": 0.5532228946685791, "learning_rate": 0.00021162541571108, "loss": 0.3849, "step": 15880 }, { "gate_value": 0.25717470049858093, "icl_sequence_length": 88, "num_contexts": 3, "step": 15880 }, { "grad_norm": 0.6373085975646973, "learning_rate": 0.00021151233338839324, "loss": 0.3814, "step": 15890 }, { "gate_value": 0.25678393244743347, "icl_sequence_length": 78, "num_contexts": 3, "step": 15890 }, { "grad_norm": 0.9044815897941589, "learning_rate": 0.0002113992090226067, "loss": 0.3792, "step": 15900 }, { "gate_value": 0.2567276358604431, "icl_sequence_length": 76, "num_contexts": 3, "step": 15900 }, { "grad_norm": 1.4532538652420044, "learning_rate": 0.0002112860426910397, "loss": 0.407, "step": 15910 }, { "gate_value": 0.2564510107040405, "icl_sequence_length": 70, "num_contexts": 3, "step": 15910 }, { "grad_norm": 0.8240043520927429, "learning_rate": 0.00021117283447104045, "loss": 0.3954, "step": 15920 }, { "gate_value": 0.25678786635398865, "icl_sequence_length": 78, "num_contexts": 3, "step": 15920 }, { "grad_norm": 1.1029771566390991, "learning_rate": 0.00021105958443998568, "loss": 0.3692, "step": 15930 }, { "gate_value": 0.2571948766708374, "icl_sequence_length": 64, "num_contexts": 3, "step": 15930 }, { "grad_norm": 1.8991724252700806, "learning_rate": 0.00021094629267528065, "loss": 0.3921, "step": 15940 }, { "gate_value": 0.25753188133239746, "icl_sequence_length": 72, "num_contexts": 3, "step": 15940 }, { "grad_norm": 0.6801439523696899, "learning_rate": 0.0002108329592543593, "loss": 0.392, "step": 15950 }, { "gate_value": 0.2578553855419159, "icl_sequence_length": 78, "num_contexts": 3, "step": 15950 }, { "grad_norm": 1.262444257736206, "learning_rate": 0.0002107195842546839, "loss": 0.3761, "step": 15960 }, { "gate_value": 0.2574891149997711, "icl_sequence_length": 68, "num_contexts": 3, "step": 15960 }, { "grad_norm": 2.524458885192871, "learning_rate": 0.0002106061677537453, "loss": 0.3865, "step": 15970 }, { "gate_value": 0.2576063573360443, "icl_sequence_length": 84, "num_contexts": 3, "step": 15970 }, { "grad_norm": 1.6312613487243652, "learning_rate": 0.00021049270982906242, "loss": 0.3722, "step": 15980 }, { "gate_value": 0.2574409544467926, "icl_sequence_length": 80, "num_contexts": 3, "step": 15980 }, { "grad_norm": 3.748892068862915, "learning_rate": 0.0002103792105581828, "loss": 0.3816, "step": 15990 }, { "gate_value": 0.2578420639038086, "icl_sequence_length": 70, "num_contexts": 3, "step": 15990 }, { "grad_norm": 1.2964622974395752, "learning_rate": 0.00021026567001868212, "loss": 0.391, "step": 16000 }, { "gate_value": 0.25792190432548523, "icl_sequence_length": 86, "num_contexts": 3, "step": 16000 }, { "grad_norm": 3.316319704055786, "learning_rate": 0.00021015208828816423, "loss": 0.3748, "step": 16010 }, { "gate_value": 0.25808802247047424, "icl_sequence_length": 86, "num_contexts": 3, "step": 16010 }, { "grad_norm": 1.7387820482254028, "learning_rate": 0.0002100384654442613, "loss": 0.3812, "step": 16020 }, { "gate_value": 0.25833994150161743, "icl_sequence_length": 86, "num_contexts": 3, "step": 16020 }, { "grad_norm": 3.10507869720459, "learning_rate": 0.00020992480156463325, "loss": 0.3989, "step": 16030 }, { "gate_value": 0.2587062418460846, "icl_sequence_length": 74, "num_contexts": 3, "step": 16030 }, { "grad_norm": 2.5109314918518066, "learning_rate": 0.0002098110967269684, "loss": 0.3885, "step": 16040 }, { "gate_value": 0.25896432995796204, "icl_sequence_length": 78, "num_contexts": 3, "step": 16040 }, { "grad_norm": 9.999404907226562, "learning_rate": 0.00020969735100898296, "loss": 0.3975, "step": 16050 }, { "gate_value": 0.25936567783355713, "icl_sequence_length": 86, "num_contexts": 3, "step": 16050 }, { "grad_norm": 2.2664084434509277, "learning_rate": 0.00020958356448842096, "loss": 0.3859, "step": 16060 }, { "gate_value": 0.2597810924053192, "icl_sequence_length": 80, "num_contexts": 3, "step": 16060 }, { "grad_norm": 2.365952730178833, "learning_rate": 0.00020946973724305455, "loss": 0.3775, "step": 16070 }, { "gate_value": 0.2600749731063843, "icl_sequence_length": 68, "num_contexts": 3, "step": 16070 }, { "grad_norm": 3.8386356830596924, "learning_rate": 0.00020935586935068347, "loss": 0.3663, "step": 16080 }, { "gate_value": 0.2603296935558319, "icl_sequence_length": 78, "num_contexts": 3, "step": 16080 }, { "grad_norm": 1.1269052028656006, "learning_rate": 0.00020924196088913536, "loss": 0.3807, "step": 16090 }, { "gate_value": 0.26086583733558655, "icl_sequence_length": 74, "num_contexts": 3, "step": 16090 }, { "grad_norm": 9.94027328491211, "learning_rate": 0.00020912801193626564, "loss": 0.3623, "step": 16100 }, { "gate_value": 0.2611926794052124, "icl_sequence_length": 72, "num_contexts": 3, "step": 16100 }, { "grad_norm": 2.996548652648926, "learning_rate": 0.00020901402256995728, "loss": 0.3801, "step": 16110 }, { "gate_value": 0.26136496663093567, "icl_sequence_length": 70, "num_contexts": 3, "step": 16110 }, { "grad_norm": 1.541462779045105, "learning_rate": 0.000208899992868121, "loss": 0.3786, "step": 16120 }, { "gate_value": 0.26164090633392334, "icl_sequence_length": 86, "num_contexts": 3, "step": 16120 }, { "grad_norm": 2.2632198333740234, "learning_rate": 0.00020878592290869493, "loss": 0.3909, "step": 16130 }, { "gate_value": 0.26203489303588867, "icl_sequence_length": 94, "num_contexts": 3, "step": 16130 }, { "grad_norm": 1.2951090335845947, "learning_rate": 0.00020867181276964486, "loss": 0.374, "step": 16140 }, { "gate_value": 0.2623451054096222, "icl_sequence_length": 60, "num_contexts": 3, "step": 16140 }, { "grad_norm": 1.0222433805465698, "learning_rate": 0.00020855766252896407, "loss": 0.3705, "step": 16150 }, { "gate_value": 0.2623714804649353, "icl_sequence_length": 70, "num_contexts": 3, "step": 16150 }, { "grad_norm": 7.520640850067139, "learning_rate": 0.00020844347226467306, "loss": 0.3886, "step": 16160 }, { "gate_value": 0.26221963763237, "icl_sequence_length": 72, "num_contexts": 3, "step": 16160 }, { "grad_norm": 34.34793472290039, "learning_rate": 0.00020832924205481986, "loss": 0.3846, "step": 16170 }, { "gate_value": 0.26230815052986145, "icl_sequence_length": 62, "num_contexts": 3, "step": 16170 }, { "grad_norm": 0.3719327449798584, "learning_rate": 0.00020821497197747973, "loss": 0.3975, "step": 16180 }, { "gate_value": 0.26282429695129395, "icl_sequence_length": 78, "num_contexts": 3, "step": 16180 }, { "grad_norm": 0.6486546397209167, "learning_rate": 0.00020810066211075516, "loss": 0.3876, "step": 16190 }, { "gate_value": 0.2626320421695709, "icl_sequence_length": 70, "num_contexts": 3, "step": 16190 }, { "grad_norm": 82.20526885986328, "learning_rate": 0.00020798631253277598, "loss": 0.385, "step": 16200 }, { "gate_value": 0.2627756595611572, "icl_sequence_length": 74, "num_contexts": 3, "step": 16200 }, { "grad_norm": 0.4642679691314697, "learning_rate": 0.00020787192332169887, "loss": 0.3929, "step": 16210 }, { "gate_value": 0.2625521421432495, "icl_sequence_length": 80, "num_contexts": 3, "step": 16210 }, { "grad_norm": 41.34733581542969, "learning_rate": 0.00020775749455570792, "loss": 0.3805, "step": 16220 }, { "gate_value": 0.2622221112251282, "icl_sequence_length": 76, "num_contexts": 3, "step": 16220 }, { "grad_norm": 1.4750176668167114, "learning_rate": 0.00020764302631301403, "loss": 0.3977, "step": 16230 }, { "gate_value": 0.26216328144073486, "icl_sequence_length": 78, "num_contexts": 3, "step": 16230 }, { "grad_norm": 0.7368234992027283, "learning_rate": 0.0002075285186718552, "loss": 0.4035, "step": 16240 }, { "gate_value": 0.26280391216278076, "icl_sequence_length": 76, "num_contexts": 3, "step": 16240 }, { "grad_norm": 0.5541696548461914, "learning_rate": 0.00020741397171049637, "loss": 0.3978, "step": 16250 }, { "gate_value": 0.2634401321411133, "icl_sequence_length": 82, "num_contexts": 3, "step": 16250 }, { "grad_norm": 2.5238094329833984, "learning_rate": 0.0002072993855072292, "loss": 0.374, "step": 16260 }, { "gate_value": 0.26335403323173523, "icl_sequence_length": 82, "num_contexts": 3, "step": 16260 }, { "grad_norm": 0.4893578588962555, "learning_rate": 0.00020718476014037235, "loss": 0.3891, "step": 16270 }, { "gate_value": 0.2633839547634125, "icl_sequence_length": 80, "num_contexts": 3, "step": 16270 }, { "grad_norm": 0.690986692905426, "learning_rate": 0.00020707009568827117, "loss": 0.4066, "step": 16280 }, { "gate_value": 0.2635418176651001, "icl_sequence_length": 84, "num_contexts": 3, "step": 16280 }, { "grad_norm": 0.5379742980003357, "learning_rate": 0.00020695539222929767, "loss": 0.3842, "step": 16290 }, { "gate_value": 0.26373836398124695, "icl_sequence_length": 78, "num_contexts": 3, "step": 16290 }, { "grad_norm": 3.5610744953155518, "learning_rate": 0.00020684064984185076, "loss": 0.3884, "step": 16300 }, { "gate_value": 0.2638593912124634, "icl_sequence_length": 74, "num_contexts": 3, "step": 16300 }, { "grad_norm": 3.4267258644104004, "learning_rate": 0.00020672586860435557, "loss": 0.3813, "step": 16310 }, { "gate_value": 0.2640724778175354, "icl_sequence_length": 90, "num_contexts": 3, "step": 16310 }, { "grad_norm": 31.349645614624023, "learning_rate": 0.0002066110485952641, "loss": 0.3883, "step": 16320 }, { "gate_value": 0.2640630304813385, "icl_sequence_length": 76, "num_contexts": 3, "step": 16320 }, { "grad_norm": 0.7556735277175903, "learning_rate": 0.0002064961898930547, "loss": 0.3804, "step": 16330 }, { "gate_value": 0.2637392580509186, "icl_sequence_length": 74, "num_contexts": 3, "step": 16330 }, { "grad_norm": 0.6610752940177917, "learning_rate": 0.00020638129257623229, "loss": 0.3827, "step": 16340 }, { "gate_value": 0.2638241946697235, "icl_sequence_length": 88, "num_contexts": 3, "step": 16340 }, { "grad_norm": 8.386857986450195, "learning_rate": 0.00020626635672332802, "loss": 0.3712, "step": 16350 }, { "gate_value": 0.26400476694107056, "icl_sequence_length": 78, "num_contexts": 3, "step": 16350 }, { "grad_norm": 1.04672372341156, "learning_rate": 0.00020615138241289948, "loss": 0.3741, "step": 16360 }, { "gate_value": 0.26382866501808167, "icl_sequence_length": 84, "num_contexts": 3, "step": 16360 }, { "grad_norm": 0.6411679983139038, "learning_rate": 0.00020603636972353056, "loss": 0.3949, "step": 16370 }, { "gate_value": 0.2640067934989929, "icl_sequence_length": 78, "num_contexts": 3, "step": 16370 }, { "grad_norm": 0.7315338253974915, "learning_rate": 0.0002059213187338313, "loss": 0.3845, "step": 16380 }, { "gate_value": 0.26440954208374023, "icl_sequence_length": 86, "num_contexts": 3, "step": 16380 }, { "grad_norm": 1.2115294933319092, "learning_rate": 0.000205806229522438, "loss": 0.3826, "step": 16390 }, { "gate_value": 0.2647918164730072, "icl_sequence_length": 76, "num_contexts": 3, "step": 16390 }, { "grad_norm": 21.087554931640625, "learning_rate": 0.00020569110216801307, "loss": 0.393, "step": 16400 }, { "gate_value": 0.26522791385650635, "icl_sequence_length": 78, "num_contexts": 3, "step": 16400 }, { "grad_norm": 0.5692827701568604, "learning_rate": 0.0002055759367492449, "loss": 0.3909, "step": 16410 }, { "gate_value": 0.26617687940597534, "icl_sequence_length": 74, "num_contexts": 3, "step": 16410 }, { "grad_norm": 0.6674289107322693, "learning_rate": 0.00020546073334484804, "loss": 0.3812, "step": 16420 }, { "gate_value": 0.2657477855682373, "icl_sequence_length": 70, "num_contexts": 3, "step": 16420 }, { "grad_norm": 1.9666045904159546, "learning_rate": 0.00020534549203356288, "loss": 0.3826, "step": 16430 }, { "gate_value": 0.2655538022518158, "icl_sequence_length": 82, "num_contexts": 3, "step": 16430 }, { "grad_norm": 1.16848886013031, "learning_rate": 0.00020523021289415582, "loss": 0.3835, "step": 16440 }, { "gate_value": 0.2654661536216736, "icl_sequence_length": 84, "num_contexts": 3, "step": 16440 }, { "grad_norm": 3.0859735012054443, "learning_rate": 0.00020511489600541903, "loss": 0.3807, "step": 16450 }, { "gate_value": 0.2656824588775635, "icl_sequence_length": 82, "num_contexts": 3, "step": 16450 }, { "grad_norm": 56.08064651489258, "learning_rate": 0.0002049995414461705, "loss": 0.3982, "step": 16460 }, { "gate_value": 0.2662789821624756, "icl_sequence_length": 84, "num_contexts": 3, "step": 16460 }, { "grad_norm": 0.6443231701850891, "learning_rate": 0.00020488414929525404, "loss": 0.3912, "step": 16470 }, { "gate_value": 0.26696157455444336, "icl_sequence_length": 86, "num_contexts": 3, "step": 16470 }, { "grad_norm": 0.6962271332740784, "learning_rate": 0.00020476871963153907, "loss": 0.3855, "step": 16480 }, { "gate_value": 0.2670133709907532, "icl_sequence_length": 84, "num_contexts": 3, "step": 16480 }, { "grad_norm": 0.8205323219299316, "learning_rate": 0.00020465325253392062, "loss": 0.4058, "step": 16490 }, { "gate_value": 0.2671220302581787, "icl_sequence_length": 76, "num_contexts": 3, "step": 16490 }, { "grad_norm": 1.2751874923706055, "learning_rate": 0.00020453774808131944, "loss": 0.3817, "step": 16500 }, { "gate_value": 0.2672402858734131, "icl_sequence_length": 82, "num_contexts": 3, "step": 16500 }, { "grad_norm": 0.5550151467323303, "learning_rate": 0.00020442220635268166, "loss": 0.3749, "step": 16510 }, { "gate_value": 0.2680794596672058, "icl_sequence_length": 66, "num_contexts": 3, "step": 16510 }, { "grad_norm": 0.48716598749160767, "learning_rate": 0.00020430662742697907, "loss": 0.3914, "step": 16520 }, { "gate_value": 0.26837554574012756, "icl_sequence_length": 70, "num_contexts": 3, "step": 16520 }, { "grad_norm": 4.756121635437012, "learning_rate": 0.00020419101138320872, "loss": 0.3942, "step": 16530 }, { "gate_value": 0.26773133873939514, "icl_sequence_length": 74, "num_contexts": 3, "step": 16530 }, { "grad_norm": 0.6253768801689148, "learning_rate": 0.00020407535830039303, "loss": 0.3894, "step": 16540 }, { "gate_value": 0.26719731092453003, "icl_sequence_length": 74, "num_contexts": 3, "step": 16540 }, { "grad_norm": 0.7593842148780823, "learning_rate": 0.0002039596682575799, "loss": 0.3835, "step": 16550 }, { "gate_value": 0.2668936252593994, "icl_sequence_length": 84, "num_contexts": 3, "step": 16550 }, { "grad_norm": 0.5268985033035278, "learning_rate": 0.00020384394133384228, "loss": 0.3697, "step": 16560 }, { "gate_value": 0.26714757084846497, "icl_sequence_length": 76, "num_contexts": 3, "step": 16560 }, { "grad_norm": 0.9582957029342651, "learning_rate": 0.0002037281776082785, "loss": 0.3995, "step": 16570 }, { "gate_value": 0.267073392868042, "icl_sequence_length": 88, "num_contexts": 3, "step": 16570 }, { "grad_norm": 0.8234410285949707, "learning_rate": 0.00020361237716001195, "loss": 0.3968, "step": 16580 }, { "gate_value": 0.26781952381134033, "icl_sequence_length": 92, "num_contexts": 3, "step": 16580 }, { "grad_norm": 1.6054513454437256, "learning_rate": 0.00020349654006819113, "loss": 0.4014, "step": 16590 }, { "gate_value": 0.2684796154499054, "icl_sequence_length": 84, "num_contexts": 3, "step": 16590 }, { "grad_norm": 2.0337159633636475, "learning_rate": 0.00020338066641198963, "loss": 0.4016, "step": 16600 }, { "gate_value": 0.26811471581459045, "icl_sequence_length": 82, "num_contexts": 3, "step": 16600 }, { "grad_norm": 22.712963104248047, "learning_rate": 0.00020326475627060594, "loss": 0.3852, "step": 16610 }, { "gate_value": 0.2682332992553711, "icl_sequence_length": 92, "num_contexts": 3, "step": 16610 }, { "grad_norm": 0.768212080001831, "learning_rate": 0.00020314880972326367, "loss": 0.3818, "step": 16620 }, { "gate_value": 0.2681061029434204, "icl_sequence_length": 78, "num_contexts": 3, "step": 16620 }, { "grad_norm": 1.2150635719299316, "learning_rate": 0.00020303282684921108, "loss": 0.3934, "step": 16630 }, { "gate_value": 0.2679195702075958, "icl_sequence_length": 84, "num_contexts": 3, "step": 16630 }, { "grad_norm": 1.3667492866516113, "learning_rate": 0.00020291680772772138, "loss": 0.3872, "step": 16640 }, { "gate_value": 0.2678031623363495, "icl_sequence_length": 82, "num_contexts": 3, "step": 16640 }, { "grad_norm": 6.326725482940674, "learning_rate": 0.00020280075243809265, "loss": 0.3942, "step": 16650 }, { "gate_value": 0.2683236002922058, "icl_sequence_length": 80, "num_contexts": 3, "step": 16650 }, { "grad_norm": 14.302623748779297, "learning_rate": 0.0002026846610596474, "loss": 0.3753, "step": 16660 }, { "gate_value": 0.2683285176753998, "icl_sequence_length": 80, "num_contexts": 3, "step": 16660 }, { "grad_norm": 0.833937406539917, "learning_rate": 0.00020256853367173322, "loss": 0.3807, "step": 16670 }, { "gate_value": 0.26835429668426514, "icl_sequence_length": 74, "num_contexts": 3, "step": 16670 }, { "grad_norm": 1.282999038696289, "learning_rate": 0.00020245237035372194, "loss": 0.3945, "step": 16680 }, { "gate_value": 0.2686046361923218, "icl_sequence_length": 76, "num_contexts": 3, "step": 16680 }, { "grad_norm": 1.7827116250991821, "learning_rate": 0.00020233617118501005, "loss": 0.4033, "step": 16690 }, { "gate_value": 0.2689083218574524, "icl_sequence_length": 86, "num_contexts": 3, "step": 16690 }, { "grad_norm": 0.7584987878799438, "learning_rate": 0.00020221993624501872, "loss": 0.3908, "step": 16700 }, { "gate_value": 0.2689765393733978, "icl_sequence_length": 74, "num_contexts": 3, "step": 16700 }, { "grad_norm": 2.065035104751587, "learning_rate": 0.00020210366561319336, "loss": 0.382, "step": 16710 }, { "gate_value": 0.2689971923828125, "icl_sequence_length": 86, "num_contexts": 3, "step": 16710 }, { "grad_norm": 0.7940707206726074, "learning_rate": 0.00020198735936900386, "loss": 0.3896, "step": 16720 }, { "gate_value": 0.2692660391330719, "icl_sequence_length": 64, "num_contexts": 3, "step": 16720 }, { "grad_norm": 0.7941778302192688, "learning_rate": 0.00020187101759194443, "loss": 0.3776, "step": 16730 }, { "gate_value": 0.27001139521598816, "icl_sequence_length": 78, "num_contexts": 3, "step": 16730 }, { "grad_norm": 47.64362716674805, "learning_rate": 0.00020175464036153358, "loss": 0.3814, "step": 16740 }, { "gate_value": 0.2703426778316498, "icl_sequence_length": 90, "num_contexts": 3, "step": 16740 }, { "grad_norm": 0.5801603198051453, "learning_rate": 0.0002016382277573141, "loss": 0.3856, "step": 16750 }, { "gate_value": 0.27029573917388916, "icl_sequence_length": 68, "num_contexts": 3, "step": 16750 }, { "grad_norm": 4.548717498779297, "learning_rate": 0.00020152177985885284, "loss": 0.3725, "step": 16760 }, { "gate_value": 0.2708137333393097, "icl_sequence_length": 68, "num_contexts": 3, "step": 16760 }, { "grad_norm": 1.0393775701522827, "learning_rate": 0.00020140529674574087, "loss": 0.3797, "step": 16770 }, { "gate_value": 0.2712128460407257, "icl_sequence_length": 68, "num_contexts": 3, "step": 16770 }, { "grad_norm": 1.5896652936935425, "learning_rate": 0.0002012887784975933, "loss": 0.3955, "step": 16780 }, { "gate_value": 0.27125340700149536, "icl_sequence_length": 58, "num_contexts": 3, "step": 16780 }, { "grad_norm": 23.115083694458008, "learning_rate": 0.00020117222519404923, "loss": 0.394, "step": 16790 }, { "gate_value": 0.2709924280643463, "icl_sequence_length": 92, "num_contexts": 3, "step": 16790 }, { "grad_norm": 3.4433460235595703, "learning_rate": 0.00020105563691477177, "loss": 0.3856, "step": 16800 }, { "gate_value": 0.27060866355895996, "icl_sequence_length": 72, "num_contexts": 3, "step": 16800 }, { "grad_norm": 0.9717051386833191, "learning_rate": 0.00020093901373944794, "loss": 0.3935, "step": 16810 }, { "gate_value": 0.2705410122871399, "icl_sequence_length": 78, "num_contexts": 3, "step": 16810 }, { "grad_norm": 5.411111354827881, "learning_rate": 0.0002008223557477885, "loss": 0.3851, "step": 16820 }, { "gate_value": 0.27148672938346863, "icl_sequence_length": 88, "num_contexts": 3, "step": 16820 }, { "grad_norm": 1.2662025690078735, "learning_rate": 0.00020070566301952817, "loss": 0.394, "step": 16830 }, { "gate_value": 0.2718506455421448, "icl_sequence_length": 78, "num_contexts": 3, "step": 16830 }, { "grad_norm": 4.485749244689941, "learning_rate": 0.00020058893563442527, "loss": 0.3809, "step": 16840 }, { "gate_value": 0.2717529237270355, "icl_sequence_length": 92, "num_contexts": 3, "step": 16840 }, { "grad_norm": 70.55220794677734, "learning_rate": 0.00020047217367226192, "loss": 0.376, "step": 16850 }, { "gate_value": 0.2719095051288605, "icl_sequence_length": 72, "num_contexts": 3, "step": 16850 }, { "grad_norm": 1.0547325611114502, "learning_rate": 0.00020035537721284377, "loss": 0.3788, "step": 16860 }, { "gate_value": 0.2722213566303253, "icl_sequence_length": 72, "num_contexts": 3, "step": 16860 }, { "grad_norm": 1.2858396768569946, "learning_rate": 0.0002002385463360001, "loss": 0.384, "step": 16870 }, { "gate_value": 0.27259713411331177, "icl_sequence_length": 76, "num_contexts": 3, "step": 16870 }, { "grad_norm": 171.38514709472656, "learning_rate": 0.00020012168112158374, "loss": 0.3998, "step": 16880 }, { "gate_value": 0.2726678252220154, "icl_sequence_length": 66, "num_contexts": 3, "step": 16880 }, { "grad_norm": 1.532321810722351, "learning_rate": 0.00020000478164947094, "loss": 0.3847, "step": 16890 }, { "gate_value": 0.2719527781009674, "icl_sequence_length": 90, "num_contexts": 3, "step": 16890 }, { "grad_norm": 36.90795135498047, "learning_rate": 0.00019988784799956143, "loss": 0.3756, "step": 16900 }, { "gate_value": 0.271659255027771, "icl_sequence_length": 96, "num_contexts": 3, "step": 16900 }, { "grad_norm": 0.6787513494491577, "learning_rate": 0.00019977088025177823, "loss": 0.3902, "step": 16910 }, { "gate_value": 0.2718561887741089, "icl_sequence_length": 90, "num_contexts": 3, "step": 16910 }, { "grad_norm": 2.4867050647735596, "learning_rate": 0.00019965387848606766, "loss": 0.3832, "step": 16920 }, { "gate_value": 0.27170494198799133, "icl_sequence_length": 76, "num_contexts": 3, "step": 16920 }, { "grad_norm": 3.547852039337158, "learning_rate": 0.00019953684278239936, "loss": 0.394, "step": 16930 }, { "gate_value": 0.2722950279712677, "icl_sequence_length": 78, "num_contexts": 3, "step": 16930 }, { "grad_norm": 0.845325767993927, "learning_rate": 0.00019941977322076614, "loss": 0.381, "step": 16940 }, { "gate_value": 0.2729490399360657, "icl_sequence_length": 82, "num_contexts": 3, "step": 16940 }, { "grad_norm": 0.924738883972168, "learning_rate": 0.0001993026698811839, "loss": 0.3806, "step": 16950 }, { "gate_value": 0.27347734570503235, "icl_sequence_length": 72, "num_contexts": 3, "step": 16950 }, { "grad_norm": 1.2066264152526855, "learning_rate": 0.00019918553284369172, "loss": 0.3906, "step": 16960 }, { "gate_value": 0.2736641764640808, "icl_sequence_length": 78, "num_contexts": 3, "step": 16960 }, { "grad_norm": 1.266615867614746, "learning_rate": 0.0001990683621883516, "loss": 0.3807, "step": 16970 }, { "gate_value": 0.27343323826789856, "icl_sequence_length": 84, "num_contexts": 3, "step": 16970 }, { "grad_norm": 0.651129961013794, "learning_rate": 0.00019895115799524864, "loss": 0.3753, "step": 16980 }, { "gate_value": 0.27308720350265503, "icl_sequence_length": 80, "num_contexts": 3, "step": 16980 }, { "grad_norm": 1.6364394426345825, "learning_rate": 0.00019883392034449076, "loss": 0.3846, "step": 16990 }, { "gate_value": 0.27326497435569763, "icl_sequence_length": 72, "num_contexts": 3, "step": 16990 }, { "grad_norm": 1.037740707397461, "learning_rate": 0.00019871664931620883, "loss": 0.3864, "step": 17000 }, { "gate_value": 0.2733471095561981, "icl_sequence_length": 84, "num_contexts": 3, "step": 17000 }, { "grad_norm": 0.8848700523376465, "learning_rate": 0.0001985993449905564, "loss": 0.3888, "step": 17010 }, { "gate_value": 0.2732689678668976, "icl_sequence_length": 76, "num_contexts": 3, "step": 17010 }, { "grad_norm": 173.5919952392578, "learning_rate": 0.00019848200744770997, "loss": 0.3823, "step": 17020 }, { "gate_value": 0.27310124039649963, "icl_sequence_length": 80, "num_contexts": 3, "step": 17020 }, { "grad_norm": 0.7701226472854614, "learning_rate": 0.00019836463676786866, "loss": 0.3689, "step": 17030 }, { "gate_value": 0.27315619587898254, "icl_sequence_length": 82, "num_contexts": 3, "step": 17030 }, { "grad_norm": 1.866520643234253, "learning_rate": 0.0001982472330312541, "loss": 0.3665, "step": 17040 }, { "gate_value": 0.2732912600040436, "icl_sequence_length": 94, "num_contexts": 3, "step": 17040 }, { "grad_norm": 0.9565479159355164, "learning_rate": 0.00019812979631811072, "loss": 0.3866, "step": 17050 }, { "gate_value": 0.2739657461643219, "icl_sequence_length": 66, "num_contexts": 3, "step": 17050 }, { "grad_norm": 1.207183599472046, "learning_rate": 0.0001980123267087054, "loss": 0.3896, "step": 17060 }, { "gate_value": 0.2743777334690094, "icl_sequence_length": 78, "num_contexts": 3, "step": 17060 }, { "grad_norm": 0.6322810053825378, "learning_rate": 0.00019789482428332747, "loss": 0.3863, "step": 17070 }, { "gate_value": 0.2746823728084564, "icl_sequence_length": 72, "num_contexts": 3, "step": 17070 }, { "grad_norm": 0.6914767026901245, "learning_rate": 0.0001977772891222888, "loss": 0.3857, "step": 17080 }, { "gate_value": 0.27512282133102417, "icl_sequence_length": 74, "num_contexts": 3, "step": 17080 }, { "grad_norm": 2.1301958560943604, "learning_rate": 0.00019765972130592356, "loss": 0.3933, "step": 17090 }, { "gate_value": 0.2752041518688202, "icl_sequence_length": 92, "num_contexts": 3, "step": 17090 }, { "grad_norm": 1.580463171005249, "learning_rate": 0.00019754212091458814, "loss": 0.4096, "step": 17100 }, { "gate_value": 0.2751167416572571, "icl_sequence_length": 90, "num_contexts": 3, "step": 17100 }, { "grad_norm": 74.18382263183594, "learning_rate": 0.00019742448802866143, "loss": 0.3928, "step": 17110 }, { "gate_value": 0.2754330635070801, "icl_sequence_length": 92, "num_contexts": 3, "step": 17110 }, { "grad_norm": 1.2828655242919922, "learning_rate": 0.0001973068227285443, "loss": 0.3764, "step": 17120 }, { "gate_value": 0.27579745650291443, "icl_sequence_length": 82, "num_contexts": 3, "step": 17120 }, { "grad_norm": 1.951072096824646, "learning_rate": 0.00019718912509465993, "loss": 0.3882, "step": 17130 }, { "gate_value": 0.2759058177471161, "icl_sequence_length": 66, "num_contexts": 3, "step": 17130 }, { "grad_norm": 1.1944855451583862, "learning_rate": 0.00019707139520745354, "loss": 0.4006, "step": 17140 }, { "gate_value": 0.27621176838874817, "icl_sequence_length": 82, "num_contexts": 3, "step": 17140 }, { "grad_norm": 1.2348740100860596, "learning_rate": 0.00019695363314739235, "loss": 0.3775, "step": 17150 }, { "gate_value": 0.276276171207428, "icl_sequence_length": 72, "num_contexts": 3, "step": 17150 }, { "grad_norm": 1.934401512145996, "learning_rate": 0.00019683583899496565, "loss": 0.3779, "step": 17160 }, { "gate_value": 0.27666452527046204, "icl_sequence_length": 70, "num_contexts": 3, "step": 17160 }, { "grad_norm": 2.7875421047210693, "learning_rate": 0.00019671801283068464, "loss": 0.3676, "step": 17170 }, { "gate_value": 0.2770780622959137, "icl_sequence_length": 82, "num_contexts": 3, "step": 17170 }, { "grad_norm": 1.0745530128479004, "learning_rate": 0.0001966001547350824, "loss": 0.3832, "step": 17180 }, { "gate_value": 0.27716943621635437, "icl_sequence_length": 76, "num_contexts": 3, "step": 17180 }, { "grad_norm": 0.7714540362358093, "learning_rate": 0.0001964822647887138, "loss": 0.3953, "step": 17190 }, { "gate_value": 0.2768040895462036, "icl_sequence_length": 74, "num_contexts": 3, "step": 17190 }, { "grad_norm": 0.5804464221000671, "learning_rate": 0.00019636434307215552, "loss": 0.3839, "step": 17200 }, { "gate_value": 0.27638521790504456, "icl_sequence_length": 72, "num_contexts": 3, "step": 17200 }, { "grad_norm": 0.36973661184310913, "learning_rate": 0.00019624638966600591, "loss": 0.3748, "step": 17210 }, { "gate_value": 0.2765795588493347, "icl_sequence_length": 72, "num_contexts": 3, "step": 17210 }, { "grad_norm": 6.002297878265381, "learning_rate": 0.0001961284046508851, "loss": 0.3886, "step": 17220 }, { "gate_value": 0.2769390046596527, "icl_sequence_length": 84, "num_contexts": 3, "step": 17220 }, { "grad_norm": 2.958909511566162, "learning_rate": 0.00019601038810743463, "loss": 0.3828, "step": 17230 }, { "gate_value": 0.2774118483066559, "icl_sequence_length": 86, "num_contexts": 3, "step": 17230 }, { "grad_norm": 1.149144172668457, "learning_rate": 0.0001958923401163178, "loss": 0.3943, "step": 17240 }, { "gate_value": 0.2777392566204071, "icl_sequence_length": 92, "num_contexts": 3, "step": 17240 }, { "grad_norm": 2.879011631011963, "learning_rate": 0.00019577426075821915, "loss": 0.3885, "step": 17250 }, { "gate_value": 0.2781127095222473, "icl_sequence_length": 86, "num_contexts": 3, "step": 17250 }, { "grad_norm": 0.6843719482421875, "learning_rate": 0.00019565615011384494, "loss": 0.3917, "step": 17260 }, { "gate_value": 0.27844953536987305, "icl_sequence_length": 76, "num_contexts": 3, "step": 17260 }, { "grad_norm": 3.6135499477386475, "learning_rate": 0.00019553800826392262, "loss": 0.3723, "step": 17270 }, { "gate_value": 0.27857479453086853, "icl_sequence_length": 90, "num_contexts": 3, "step": 17270 }, { "grad_norm": 1.2397135496139526, "learning_rate": 0.00019541983528920112, "loss": 0.3697, "step": 17280 }, { "gate_value": 0.27856993675231934, "icl_sequence_length": 92, "num_contexts": 3, "step": 17280 }, { "grad_norm": 0.6731451153755188, "learning_rate": 0.00019530163127045046, "loss": 0.3858, "step": 17290 }, { "gate_value": 0.2787000238895416, "icl_sequence_length": 80, "num_contexts": 3, "step": 17290 }, { "grad_norm": 0.786884605884552, "learning_rate": 0.00019518339628846193, "loss": 0.4013, "step": 17300 }, { "gate_value": 0.27849066257476807, "icl_sequence_length": 68, "num_contexts": 3, "step": 17300 }, { "grad_norm": 18.715499877929688, "learning_rate": 0.00019506513042404815, "loss": 0.3896, "step": 17310 }, { "gate_value": 0.27890416979789734, "icl_sequence_length": 84, "num_contexts": 3, "step": 17310 }, { "grad_norm": 0.6340904235839844, "learning_rate": 0.00019494683375804265, "loss": 0.3884, "step": 17320 }, { "gate_value": 0.27936267852783203, "icl_sequence_length": 80, "num_contexts": 3, "step": 17320 }, { "grad_norm": 3.241877555847168, "learning_rate": 0.00019482850637130006, "loss": 0.3711, "step": 17330 }, { "gate_value": 0.27889105677604675, "icl_sequence_length": 68, "num_contexts": 3, "step": 17330 }, { "grad_norm": 0.9872519969940186, "learning_rate": 0.0001947101483446961, "loss": 0.387, "step": 17340 }, { "gate_value": 0.2789508104324341, "icl_sequence_length": 80, "num_contexts": 3, "step": 17340 }, { "grad_norm": 0.6962840557098389, "learning_rate": 0.00019459175975912736, "loss": 0.3963, "step": 17350 }, { "gate_value": 0.2791309952735901, "icl_sequence_length": 70, "num_contexts": 3, "step": 17350 }, { "grad_norm": 0.7151376008987427, "learning_rate": 0.0001944733406955113, "loss": 0.3745, "step": 17360 }, { "gate_value": 0.2794833183288574, "icl_sequence_length": 90, "num_contexts": 3, "step": 17360 }, { "grad_norm": 0.8369631767272949, "learning_rate": 0.00019435489123478624, "loss": 0.3894, "step": 17370 }, { "gate_value": 0.27932778000831604, "icl_sequence_length": 82, "num_contexts": 3, "step": 17370 }, { "grad_norm": 1.7139389514923096, "learning_rate": 0.00019423641145791123, "loss": 0.3873, "step": 17380 }, { "gate_value": 0.2792007029056549, "icl_sequence_length": 84, "num_contexts": 3, "step": 17380 }, { "grad_norm": 0.8466968536376953, "learning_rate": 0.0001941179014458661, "loss": 0.3863, "step": 17390 }, { "gate_value": 0.27936652302742004, "icl_sequence_length": 72, "num_contexts": 3, "step": 17390 }, { "grad_norm": 1.240417718887329, "learning_rate": 0.00019399936127965136, "loss": 0.4034, "step": 17400 }, { "gate_value": 0.2796449065208435, "icl_sequence_length": 66, "num_contexts": 3, "step": 17400 }, { "grad_norm": 0.7106504440307617, "learning_rate": 0.00019388079104028808, "loss": 0.3805, "step": 17410 }, { "gate_value": 0.2800625264644623, "icl_sequence_length": 92, "num_contexts": 3, "step": 17410 }, { "grad_norm": 0.7933707237243652, "learning_rate": 0.00019376219080881793, "loss": 0.3584, "step": 17420 }, { "gate_value": 0.28004422783851624, "icl_sequence_length": 76, "num_contexts": 3, "step": 17420 }, { "grad_norm": 50.58368682861328, "learning_rate": 0.00019364356066630295, "loss": 0.3856, "step": 17430 }, { "gate_value": 0.2805579602718353, "icl_sequence_length": 74, "num_contexts": 3, "step": 17430 }, { "grad_norm": 0.6874160170555115, "learning_rate": 0.0001935249006938258, "loss": 0.3559, "step": 17440 }, { "gate_value": 0.2811239957809448, "icl_sequence_length": 78, "num_contexts": 3, "step": 17440 }, { "grad_norm": 5.250424385070801, "learning_rate": 0.00019340621097248945, "loss": 0.3737, "step": 17450 }, { "gate_value": 0.28114932775497437, "icl_sequence_length": 80, "num_contexts": 3, "step": 17450 }, { "grad_norm": 12.111510276794434, "learning_rate": 0.00019328749158341728, "loss": 0.3757, "step": 17460 }, { "gate_value": 0.28097689151763916, "icl_sequence_length": 76, "num_contexts": 3, "step": 17460 }, { "grad_norm": 1.5629442930221558, "learning_rate": 0.00019316874260775268, "loss": 0.3783, "step": 17470 }, { "gate_value": 0.28098264336586, "icl_sequence_length": 84, "num_contexts": 3, "step": 17470 }, { "grad_norm": 0.4609309136867523, "learning_rate": 0.00019304996412665957, "loss": 0.3848, "step": 17480 }, { "gate_value": 0.2811479866504669, "icl_sequence_length": 60, "num_contexts": 3, "step": 17480 }, { "grad_norm": 0.9423713088035583, "learning_rate": 0.00019293115622132193, "loss": 0.3714, "step": 17490 }, { "gate_value": 0.2813046872615814, "icl_sequence_length": 78, "num_contexts": 3, "step": 17490 }, { "grad_norm": 1.93399178981781, "learning_rate": 0.00019281231897294384, "loss": 0.3816, "step": 17500 }, { "gate_value": 0.28104689717292786, "icl_sequence_length": 72, "num_contexts": 3, "step": 17500 }, { "grad_norm": 1.776007890701294, "learning_rate": 0.0001926934524627495, "loss": 0.3913, "step": 17510 }, { "gate_value": 0.28112155199050903, "icl_sequence_length": 76, "num_contexts": 3, "step": 17510 }, { "grad_norm": 0.6974020600318909, "learning_rate": 0.00019257455677198286, "loss": 0.3891, "step": 17520 }, { "gate_value": 0.2809901237487793, "icl_sequence_length": 90, "num_contexts": 3, "step": 17520 }, { "grad_norm": 1.8342015743255615, "learning_rate": 0.00019245563198190814, "loss": 0.3921, "step": 17530 }, { "gate_value": 0.2812676727771759, "icl_sequence_length": 72, "num_contexts": 3, "step": 17530 }, { "grad_norm": 1.2698308229446411, "learning_rate": 0.00019233667817380933, "loss": 0.394, "step": 17540 }, { "gate_value": 0.2814946174621582, "icl_sequence_length": 70, "num_contexts": 3, "step": 17540 }, { "grad_norm": 0.4935688376426697, "learning_rate": 0.0001922176954289902, "loss": 0.369, "step": 17550 }, { "gate_value": 0.2818371057510376, "icl_sequence_length": 84, "num_contexts": 3, "step": 17550 }, { "grad_norm": 0.6632453799247742, "learning_rate": 0.00019209868382877437, "loss": 0.3677, "step": 17560 }, { "gate_value": 0.2822357714176178, "icl_sequence_length": 68, "num_contexts": 3, "step": 17560 }, { "grad_norm": 0.8540254235267639, "learning_rate": 0.00019197964345450504, "loss": 0.3992, "step": 17570 }, { "gate_value": 0.28277909755706787, "icl_sequence_length": 90, "num_contexts": 3, "step": 17570 }, { "grad_norm": 1.1772438287734985, "learning_rate": 0.00019186057438754525, "loss": 0.4001, "step": 17580 }, { "gate_value": 0.28297939896583557, "icl_sequence_length": 86, "num_contexts": 3, "step": 17580 }, { "grad_norm": 98.65593719482422, "learning_rate": 0.00019174147670927765, "loss": 0.3855, "step": 17590 }, { "gate_value": 0.2828541100025177, "icl_sequence_length": 70, "num_contexts": 3, "step": 17590 }, { "grad_norm": 1.0209541320800781, "learning_rate": 0.00019162235050110429, "loss": 0.3751, "step": 17600 }, { "gate_value": 0.2828318476676941, "icl_sequence_length": 72, "num_contexts": 3, "step": 17600 }, { "grad_norm": 1.796211838722229, "learning_rate": 0.00019150319584444682, "loss": 0.3911, "step": 17610 }, { "gate_value": 0.28276461362838745, "icl_sequence_length": 72, "num_contexts": 3, "step": 17610 }, { "grad_norm": 21.861438751220703, "learning_rate": 0.00019138401282074635, "loss": 0.3939, "step": 17620 }, { "gate_value": 0.2829860746860504, "icl_sequence_length": 70, "num_contexts": 3, "step": 17620 }, { "grad_norm": 0.8495699763298035, "learning_rate": 0.00019126480151146334, "loss": 0.3733, "step": 17630 }, { "gate_value": 0.283086895942688, "icl_sequence_length": 70, "num_contexts": 3, "step": 17630 }, { "grad_norm": 5.792306423187256, "learning_rate": 0.0001911455619980776, "loss": 0.3805, "step": 17640 }, { "gate_value": 0.2834136188030243, "icl_sequence_length": 74, "num_contexts": 3, "step": 17640 }, { "grad_norm": 1.0493606328964233, "learning_rate": 0.00019102629436208824, "loss": 0.3987, "step": 17650 }, { "gate_value": 0.28334277868270874, "icl_sequence_length": 78, "num_contexts": 3, "step": 17650 }, { "grad_norm": 1.9529502391815186, "learning_rate": 0.00019090699868501353, "loss": 0.3736, "step": 17660 }, { "gate_value": 0.28328362107276917, "icl_sequence_length": 82, "num_contexts": 3, "step": 17660 }, { "grad_norm": 2.390843152999878, "learning_rate": 0.00019078767504839093, "loss": 0.3885, "step": 17670 }, { "gate_value": 0.2835651934146881, "icl_sequence_length": 78, "num_contexts": 3, "step": 17670 }, { "grad_norm": 0.9919568300247192, "learning_rate": 0.000190668323533777, "loss": 0.3791, "step": 17680 }, { "gate_value": 0.28367140889167786, "icl_sequence_length": 80, "num_contexts": 3, "step": 17680 }, { "grad_norm": 0.7665271759033203, "learning_rate": 0.00019054894422274754, "loss": 0.3778, "step": 17690 }, { "gate_value": 0.2842426002025604, "icl_sequence_length": 88, "num_contexts": 3, "step": 17690 }, { "grad_norm": 1.6776630878448486, "learning_rate": 0.00019042953719689695, "loss": 0.394, "step": 17700 }, { "gate_value": 0.2848225235939026, "icl_sequence_length": 74, "num_contexts": 3, "step": 17700 }, { "grad_norm": 1.3831422328948975, "learning_rate": 0.00019031010253783896, "loss": 0.3791, "step": 17710 }, { "gate_value": 0.28517720103263855, "icl_sequence_length": 64, "num_contexts": 3, "step": 17710 }, { "grad_norm": 0.912163496017456, "learning_rate": 0.00019019064032720594, "loss": 0.3893, "step": 17720 }, { "gate_value": 0.2853257954120636, "icl_sequence_length": 80, "num_contexts": 3, "step": 17720 }, { "grad_norm": 0.8194653391838074, "learning_rate": 0.00019007115064664922, "loss": 0.3826, "step": 17730 }, { "gate_value": 0.28528544306755066, "icl_sequence_length": 66, "num_contexts": 3, "step": 17730 }, { "grad_norm": 1.4953533411026, "learning_rate": 0.00018995163357783898, "loss": 0.3654, "step": 17740 }, { "gate_value": 0.28522640466690063, "icl_sequence_length": 82, "num_contexts": 3, "step": 17740 }, { "grad_norm": 1.0298045873641968, "learning_rate": 0.00018983208920246382, "loss": 0.3721, "step": 17750 }, { "gate_value": 0.28574541211128235, "icl_sequence_length": 78, "num_contexts": 3, "step": 17750 }, { "grad_norm": 5.820982456207275, "learning_rate": 0.00018971251760223137, "loss": 0.3592, "step": 17760 }, { "gate_value": 0.28601688146591187, "icl_sequence_length": 88, "num_contexts": 3, "step": 17760 }, { "grad_norm": 107.9212875366211, "learning_rate": 0.00018959291885886753, "loss": 0.4024, "step": 17770 }, { "gate_value": 0.28597700595855713, "icl_sequence_length": 90, "num_contexts": 3, "step": 17770 }, { "grad_norm": 3.506815195083618, "learning_rate": 0.000189473293054117, "loss": 0.381, "step": 17780 }, { "gate_value": 0.2859663963317871, "icl_sequence_length": 58, "num_contexts": 3, "step": 17780 }, { "grad_norm": 1.2677297592163086, "learning_rate": 0.00018935364026974292, "loss": 0.3859, "step": 17790 }, { "gate_value": 0.28602340817451477, "icl_sequence_length": 74, "num_contexts": 3, "step": 17790 }, { "grad_norm": 0.8437561988830566, "learning_rate": 0.00018923396058752673, "loss": 0.3775, "step": 17800 }, { "gate_value": 0.28563985228538513, "icl_sequence_length": 84, "num_contexts": 3, "step": 17800 }, { "grad_norm": 1.6269166469573975, "learning_rate": 0.0001891142540892685, "loss": 0.4039, "step": 17810 }, { "gate_value": 0.28554731607437134, "icl_sequence_length": 84, "num_contexts": 3, "step": 17810 }, { "grad_norm": 1.5911264419555664, "learning_rate": 0.00018899452085678639, "loss": 0.3821, "step": 17820 }, { "gate_value": 0.28552672266960144, "icl_sequence_length": 90, "num_contexts": 3, "step": 17820 }, { "grad_norm": 1.7108643054962158, "learning_rate": 0.00018887476097191697, "loss": 0.3847, "step": 17830 }, { "gate_value": 0.2859024405479431, "icl_sequence_length": 76, "num_contexts": 3, "step": 17830 }, { "grad_norm": 3.113037347793579, "learning_rate": 0.00018875497451651503, "loss": 0.3676, "step": 17840 }, { "gate_value": 0.28588712215423584, "icl_sequence_length": 86, "num_contexts": 3, "step": 17840 }, { "grad_norm": 0.8272459506988525, "learning_rate": 0.00018863516157245337, "loss": 0.3906, "step": 17850 }, { "gate_value": 0.2860240936279297, "icl_sequence_length": 74, "num_contexts": 3, "step": 17850 }, { "grad_norm": 3.2408292293548584, "learning_rate": 0.00018851532222162316, "loss": 0.3608, "step": 17860 }, { "gate_value": 0.28605082631111145, "icl_sequence_length": 60, "num_contexts": 3, "step": 17860 }, { "grad_norm": 0.8264132142066956, "learning_rate": 0.00018839545654593336, "loss": 0.3636, "step": 17870 }, { "gate_value": 0.28686094284057617, "icl_sequence_length": 70, "num_contexts": 3, "step": 17870 }, { "grad_norm": 0.7240784764289856, "learning_rate": 0.00018827556462731103, "loss": 0.4067, "step": 17880 }, { "gate_value": 0.28730401396751404, "icl_sequence_length": 80, "num_contexts": 3, "step": 17880 }, { "grad_norm": 0.634745180606842, "learning_rate": 0.00018815564654770125, "loss": 0.3767, "step": 17890 }, { "gate_value": 0.28744029998779297, "icl_sequence_length": 72, "num_contexts": 3, "step": 17890 }, { "grad_norm": 5.643700122833252, "learning_rate": 0.0001880357023890668, "loss": 0.38, "step": 17900 }, { "gate_value": 0.28771013021469116, "icl_sequence_length": 88, "num_contexts": 3, "step": 17900 }, { "grad_norm": 1.5269558429718018, "learning_rate": 0.00018791573223338843, "loss": 0.4023, "step": 17910 }, { "gate_value": 0.28753605484962463, "icl_sequence_length": 90, "num_contexts": 3, "step": 17910 }, { "grad_norm": 0.5712957978248596, "learning_rate": 0.00018779573616266461, "loss": 0.4074, "step": 17920 }, { "gate_value": 0.2872239947319031, "icl_sequence_length": 80, "num_contexts": 3, "step": 17920 }, { "grad_norm": 1.1614340543746948, "learning_rate": 0.0001876757142589115, "loss": 0.3899, "step": 17930 }, { "gate_value": 0.2880617678165436, "icl_sequence_length": 94, "num_contexts": 3, "step": 17930 }, { "grad_norm": 3.2989726066589355, "learning_rate": 0.000187555666604163, "loss": 0.3554, "step": 17940 }, { "gate_value": 0.2882530987262726, "icl_sequence_length": 80, "num_contexts": 3, "step": 17940 }, { "grad_norm": 0.9225475788116455, "learning_rate": 0.00018743559328047044, "loss": 0.3769, "step": 17950 }, { "gate_value": 0.2888268828392029, "icl_sequence_length": 64, "num_contexts": 3, "step": 17950 }, { "grad_norm": 3.0278470516204834, "learning_rate": 0.00018731549436990292, "loss": 0.3699, "step": 17960 }, { "gate_value": 0.2888841927051544, "icl_sequence_length": 78, "num_contexts": 3, "step": 17960 }, { "grad_norm": 1.4872838258743286, "learning_rate": 0.00018719536995454684, "loss": 0.3831, "step": 17970 }, { "gate_value": 0.28916996717453003, "icl_sequence_length": 76, "num_contexts": 3, "step": 17970 }, { "grad_norm": 1.2059917449951172, "learning_rate": 0.0001870752201165061, "loss": 0.392, "step": 17980 }, { "gate_value": 0.2897283434867859, "icl_sequence_length": 78, "num_contexts": 3, "step": 17980 }, { "grad_norm": 1.9197864532470703, "learning_rate": 0.00018695504493790207, "loss": 0.3979, "step": 17990 }, { "gate_value": 0.289908230304718, "icl_sequence_length": 78, "num_contexts": 3, "step": 17990 }, { "grad_norm": 5.284653663635254, "learning_rate": 0.00018683484450087324, "loss": 0.3796, "step": 18000 }, { "gate_value": 0.2897275686264038, "icl_sequence_length": 92, "num_contexts": 3, "step": 18000 }, { "grad_norm": 1.1138560771942139, "learning_rate": 0.00018671461888757556, "loss": 0.3793, "step": 18010 }, { "gate_value": 0.2901041805744171, "icl_sequence_length": 82, "num_contexts": 3, "step": 18010 }, { "grad_norm": 1.5257152318954468, "learning_rate": 0.00018659436818018208, "loss": 0.3853, "step": 18020 }, { "gate_value": 0.28984254598617554, "icl_sequence_length": 80, "num_contexts": 3, "step": 18020 }, { "grad_norm": 0.8911436200141907, "learning_rate": 0.00018647409246088298, "loss": 0.3775, "step": 18030 }, { "gate_value": 0.28980550169944763, "icl_sequence_length": 64, "num_contexts": 3, "step": 18030 }, { "grad_norm": 0.6340241432189941, "learning_rate": 0.0001863537918118856, "loss": 0.392, "step": 18040 }, { "gate_value": 0.2897473871707916, "icl_sequence_length": 56, "num_contexts": 3, "step": 18040 }, { "grad_norm": 0.7583088278770447, "learning_rate": 0.00018623346631541432, "loss": 0.3822, "step": 18050 }, { "gate_value": 0.2898584306240082, "icl_sequence_length": 90, "num_contexts": 3, "step": 18050 }, { "grad_norm": 2.2231905460357666, "learning_rate": 0.00018611311605371046, "loss": 0.3921, "step": 18060 }, { "gate_value": 0.2902577817440033, "icl_sequence_length": 92, "num_contexts": 3, "step": 18060 }, { "grad_norm": 6.011120796203613, "learning_rate": 0.00018599274110903238, "loss": 0.3943, "step": 18070 }, { "gate_value": 0.29029032588005066, "icl_sequence_length": 72, "num_contexts": 3, "step": 18070 }, { "grad_norm": 1.924442172050476, "learning_rate": 0.00018587234156365506, "loss": 0.3914, "step": 18080 }, { "gate_value": 0.2905459702014923, "icl_sequence_length": 86, "num_contexts": 3, "step": 18080 }, { "grad_norm": 8.414761543273926, "learning_rate": 0.0001857519174998706, "loss": 0.3703, "step": 18090 }, { "gate_value": 0.2910146415233612, "icl_sequence_length": 80, "num_contexts": 3, "step": 18090 }, { "grad_norm": 1.9152319431304932, "learning_rate": 0.00018563146899998762, "loss": 0.3739, "step": 18100 }, { "gate_value": 0.29149147868156433, "icl_sequence_length": 82, "num_contexts": 3, "step": 18100 }, { "grad_norm": 2.153871536254883, "learning_rate": 0.00018551099614633155, "loss": 0.3723, "step": 18110 }, { "gate_value": 0.2919636070728302, "icl_sequence_length": 80, "num_contexts": 3, "step": 18110 }, { "grad_norm": 0.6491678357124329, "learning_rate": 0.0001853904990212445, "loss": 0.3747, "step": 18120 }, { "gate_value": 0.29191774129867554, "icl_sequence_length": 90, "num_contexts": 3, "step": 18120 }, { "grad_norm": 0.706830620765686, "learning_rate": 0.00018526997770708506, "loss": 0.3734, "step": 18130 }, { "gate_value": 0.29252272844314575, "icl_sequence_length": 88, "num_contexts": 3, "step": 18130 }, { "grad_norm": 4.9841389656066895, "learning_rate": 0.00018514943228622842, "loss": 0.3879, "step": 18140 }, { "gate_value": 0.2927410900592804, "icl_sequence_length": 84, "num_contexts": 3, "step": 18140 }, { "grad_norm": 1.3059637546539307, "learning_rate": 0.00018502886284106623, "loss": 0.4032, "step": 18150 }, { "gate_value": 0.29293057322502136, "icl_sequence_length": 82, "num_contexts": 3, "step": 18150 }, { "grad_norm": 1.498307466506958, "learning_rate": 0.00018490826945400662, "loss": 0.3856, "step": 18160 }, { "gate_value": 0.2928726375102997, "icl_sequence_length": 78, "num_contexts": 3, "step": 18160 }, { "grad_norm": 5.742775917053223, "learning_rate": 0.00018478765220747407, "loss": 0.3816, "step": 18170 }, { "gate_value": 0.2927343249320984, "icl_sequence_length": 86, "num_contexts": 3, "step": 18170 }, { "grad_norm": 1.2477624416351318, "learning_rate": 0.00018466701118390914, "loss": 0.3687, "step": 18180 }, { "gate_value": 0.29233211278915405, "icl_sequence_length": 74, "num_contexts": 3, "step": 18180 }, { "grad_norm": 0.7571000456809998, "learning_rate": 0.00018454634646576906, "loss": 0.3849, "step": 18190 }, { "gate_value": 0.29229992628097534, "icl_sequence_length": 76, "num_contexts": 3, "step": 18190 }, { "grad_norm": 0.6805962324142456, "learning_rate": 0.00018442565813552684, "loss": 0.3817, "step": 18200 }, { "gate_value": 0.2926273047924042, "icl_sequence_length": 82, "num_contexts": 3, "step": 18200 }, { "grad_norm": 1.1327909231185913, "learning_rate": 0.00018430494627567196, "loss": 0.3911, "step": 18210 }, { "gate_value": 0.2928312122821808, "icl_sequence_length": 84, "num_contexts": 3, "step": 18210 }, { "grad_norm": 1.686699628829956, "learning_rate": 0.00018418421096870978, "loss": 0.3905, "step": 18220 }, { "gate_value": 0.293545126914978, "icl_sequence_length": 90, "num_contexts": 3, "step": 18220 }, { "grad_norm": 1.268710732460022, "learning_rate": 0.00018406345229716168, "loss": 0.3821, "step": 18230 }, { "gate_value": 0.29413795471191406, "icl_sequence_length": 68, "num_contexts": 3, "step": 18230 }, { "grad_norm": 1.1455614566802979, "learning_rate": 0.00018394267034356517, "loss": 0.3903, "step": 18240 }, { "gate_value": 0.29432111978530884, "icl_sequence_length": 80, "num_contexts": 3, "step": 18240 }, { "grad_norm": 1.1787275075912476, "learning_rate": 0.00018382186519047357, "loss": 0.3849, "step": 18250 }, { "gate_value": 0.2940574586391449, "icl_sequence_length": 90, "num_contexts": 3, "step": 18250 }, { "grad_norm": 1.0190541744232178, "learning_rate": 0.00018370103692045596, "loss": 0.3869, "step": 18260 }, { "gate_value": 0.29367583990097046, "icl_sequence_length": 74, "num_contexts": 3, "step": 18260 }, { "grad_norm": 1.1894299983978271, "learning_rate": 0.00018358018561609747, "loss": 0.3754, "step": 18270 }, { "gate_value": 0.2937644422054291, "icl_sequence_length": 88, "num_contexts": 3, "step": 18270 }, { "grad_norm": 0.9370589256286621, "learning_rate": 0.0001834593113599987, "loss": 0.3893, "step": 18280 }, { "gate_value": 0.2938789129257202, "icl_sequence_length": 72, "num_contexts": 3, "step": 18280 }, { "grad_norm": 2.4182047843933105, "learning_rate": 0.00018333841423477619, "loss": 0.4115, "step": 18290 }, { "gate_value": 0.29393965005874634, "icl_sequence_length": 74, "num_contexts": 3, "step": 18290 }, { "grad_norm": 1.9963648319244385, "learning_rate": 0.00018321749432306184, "loss": 0.3845, "step": 18300 }, { "gate_value": 0.2939322292804718, "icl_sequence_length": 72, "num_contexts": 3, "step": 18300 }, { "grad_norm": 1.7127970457077026, "learning_rate": 0.00018309655170750336, "loss": 0.3751, "step": 18310 }, { "gate_value": 0.2943023443222046, "icl_sequence_length": 70, "num_contexts": 3, "step": 18310 }, { "grad_norm": 13.848799705505371, "learning_rate": 0.0001829755864707639, "loss": 0.3866, "step": 18320 }, { "gate_value": 0.2945227324962616, "icl_sequence_length": 74, "num_contexts": 3, "step": 18320 }, { "grad_norm": 0.6617933511734009, "learning_rate": 0.00018285459869552199, "loss": 0.4022, "step": 18330 }, { "gate_value": 0.2949317991733551, "icl_sequence_length": 78, "num_contexts": 3, "step": 18330 }, { "grad_norm": 3.9121739864349365, "learning_rate": 0.00018273358846447168, "loss": 0.3735, "step": 18340 }, { "gate_value": 0.29527369141578674, "icl_sequence_length": 72, "num_contexts": 3, "step": 18340 }, { "grad_norm": 5.1793036460876465, "learning_rate": 0.00018261255586032234, "loss": 0.4012, "step": 18350 }, { "gate_value": 0.295279860496521, "icl_sequence_length": 92, "num_contexts": 3, "step": 18350 }, { "grad_norm": 12.034460067749023, "learning_rate": 0.00018249150096579856, "loss": 0.3865, "step": 18360 }, { "gate_value": 0.29541224241256714, "icl_sequence_length": 90, "num_contexts": 3, "step": 18360 }, { "grad_norm": 0.7827606797218323, "learning_rate": 0.00018237042386364026, "loss": 0.376, "step": 18370 }, { "gate_value": 0.295558899641037, "icl_sequence_length": 78, "num_contexts": 3, "step": 18370 }, { "grad_norm": 1.4565706253051758, "learning_rate": 0.00018224932463660245, "loss": 0.3713, "step": 18380 }, { "gate_value": 0.29634296894073486, "icl_sequence_length": 76, "num_contexts": 3, "step": 18380 }, { "grad_norm": 0.9638983607292175, "learning_rate": 0.0001821282033674554, "loss": 0.371, "step": 18390 }, { "gate_value": 0.2964816987514496, "icl_sequence_length": 90, "num_contexts": 3, "step": 18390 }, { "grad_norm": 9.79422664642334, "learning_rate": 0.0001820070601389843, "loss": 0.3794, "step": 18400 }, { "gate_value": 0.29639384150505066, "icl_sequence_length": 84, "num_contexts": 3, "step": 18400 }, { "grad_norm": 2.494586944580078, "learning_rate": 0.00018188589503398937, "loss": 0.3883, "step": 18410 }, { "gate_value": 0.296492338180542, "icl_sequence_length": 74, "num_contexts": 3, "step": 18410 }, { "grad_norm": 0.950502872467041, "learning_rate": 0.00018176470813528585, "loss": 0.3799, "step": 18420 }, { "gate_value": 0.29657965898513794, "icl_sequence_length": 88, "num_contexts": 3, "step": 18420 }, { "grad_norm": 1.4491907358169556, "learning_rate": 0.0001816434995257038, "loss": 0.3954, "step": 18430 }, { "gate_value": 0.2969636619091034, "icl_sequence_length": 88, "num_contexts": 3, "step": 18430 }, { "grad_norm": 1.5316132307052612, "learning_rate": 0.0001815222692880883, "loss": 0.3973, "step": 18440 }, { "gate_value": 0.29703307151794434, "icl_sequence_length": 80, "num_contexts": 3, "step": 18440 }, { "grad_norm": 1.4191664457321167, "learning_rate": 0.00018140101750529895, "loss": 0.3792, "step": 18450 }, { "gate_value": 0.2971894145011902, "icl_sequence_length": 64, "num_contexts": 3, "step": 18450 }, { "grad_norm": 1.043149709701538, "learning_rate": 0.0001812797442602102, "loss": 0.3817, "step": 18460 }, { "gate_value": 0.297396183013916, "icl_sequence_length": 86, "num_contexts": 3, "step": 18460 }, { "grad_norm": 4.110941410064697, "learning_rate": 0.0001811584496357112, "loss": 0.3768, "step": 18470 }, { "gate_value": 0.2977362275123596, "icl_sequence_length": 94, "num_contexts": 3, "step": 18470 }, { "grad_norm": 0.7887822985649109, "learning_rate": 0.00018103713371470564, "loss": 0.388, "step": 18480 }, { "gate_value": 0.2975766360759735, "icl_sequence_length": 80, "num_contexts": 3, "step": 18480 }, { "grad_norm": 6.283224582672119, "learning_rate": 0.00018091579658011196, "loss": 0.3834, "step": 18490 }, { "gate_value": 0.29731979966163635, "icl_sequence_length": 84, "num_contexts": 3, "step": 18490 }, { "grad_norm": 3.4314112663269043, "learning_rate": 0.00018079443831486275, "loss": 0.3889, "step": 18500 }, { "gate_value": 0.2973383963108063, "icl_sequence_length": 86, "num_contexts": 3, "step": 18500 }, { "grad_norm": 3.1021149158477783, "learning_rate": 0.00018067305900190534, "loss": 0.3863, "step": 18510 }, { "gate_value": 0.2972663938999176, "icl_sequence_length": 70, "num_contexts": 3, "step": 18510 }, { "grad_norm": 0.9524483680725098, "learning_rate": 0.00018055165872420137, "loss": 0.3853, "step": 18520 }, { "gate_value": 0.2974157929420471, "icl_sequence_length": 80, "num_contexts": 3, "step": 18520 }, { "grad_norm": 8.135797500610352, "learning_rate": 0.0001804302375647267, "loss": 0.3788, "step": 18530 }, { "gate_value": 0.2975497543811798, "icl_sequence_length": 80, "num_contexts": 3, "step": 18530 }, { "grad_norm": 0.8587509393692017, "learning_rate": 0.00018030879560647164, "loss": 0.3761, "step": 18540 }, { "gate_value": 0.2978472411632538, "icl_sequence_length": 70, "num_contexts": 3, "step": 18540 }, { "grad_norm": 3.4958364963531494, "learning_rate": 0.00018018733293244054, "loss": 0.3779, "step": 18550 }, { "gate_value": 0.2981078624725342, "icl_sequence_length": 64, "num_contexts": 3, "step": 18550 }, { "grad_norm": 0.891386091709137, "learning_rate": 0.00018006584962565204, "loss": 0.3635, "step": 18560 }, { "gate_value": 0.29821139574050903, "icl_sequence_length": 84, "num_contexts": 3, "step": 18560 }, { "grad_norm": 4.393123149871826, "learning_rate": 0.00017994434576913882, "loss": 0.3784, "step": 18570 }, { "gate_value": 0.29809004068374634, "icl_sequence_length": 82, "num_contexts": 3, "step": 18570 }, { "grad_norm": 1.5946396589279175, "learning_rate": 0.00017982282144594767, "loss": 0.3702, "step": 18580 }, { "gate_value": 0.29804927110671997, "icl_sequence_length": 94, "num_contexts": 3, "step": 18580 }, { "grad_norm": 0.6159313917160034, "learning_rate": 0.0001797012767391392, "loss": 0.3688, "step": 18590 }, { "gate_value": 0.29824721813201904, "icl_sequence_length": 64, "num_contexts": 3, "step": 18590 }, { "grad_norm": 1.7293890714645386, "learning_rate": 0.0001795797117317882, "loss": 0.3851, "step": 18600 }, { "gate_value": 0.2984820604324341, "icl_sequence_length": 90, "num_contexts": 3, "step": 18600 }, { "grad_norm": 39.279598236083984, "learning_rate": 0.0001794581265069831, "loss": 0.3844, "step": 18610 }, { "gate_value": 0.2984519302845001, "icl_sequence_length": 80, "num_contexts": 3, "step": 18610 }, { "grad_norm": 2.361945152282715, "learning_rate": 0.00017933652114782636, "loss": 0.3984, "step": 18620 }, { "gate_value": 0.2985188663005829, "icl_sequence_length": 86, "num_contexts": 3, "step": 18620 }, { "grad_norm": 11.486175537109375, "learning_rate": 0.00017921489573743404, "loss": 0.384, "step": 18630 }, { "gate_value": 0.29849180579185486, "icl_sequence_length": 70, "num_contexts": 3, "step": 18630 }, { "grad_norm": 2.7124886512756348, "learning_rate": 0.0001790932503589359, "loss": 0.3773, "step": 18640 }, { "gate_value": 0.29862406849861145, "icl_sequence_length": 76, "num_contexts": 3, "step": 18640 }, { "grad_norm": 14.254828453063965, "learning_rate": 0.00017897158509547556, "loss": 0.3846, "step": 18650 }, { "gate_value": 0.2986295521259308, "icl_sequence_length": 70, "num_contexts": 3, "step": 18650 }, { "grad_norm": 2.347095012664795, "learning_rate": 0.00017884990003020991, "loss": 0.3816, "step": 18660 }, { "gate_value": 0.2986813187599182, "icl_sequence_length": 90, "num_contexts": 3, "step": 18660 }, { "grad_norm": 1.8625816106796265, "learning_rate": 0.0001787281952463097, "loss": 0.3963, "step": 18670 }, { "gate_value": 0.29919448494911194, "icl_sequence_length": 68, "num_contexts": 3, "step": 18670 }, { "grad_norm": 1.2128583192825317, "learning_rate": 0.0001786064708269589, "loss": 0.4016, "step": 18680 }, { "gate_value": 0.299286812543869, "icl_sequence_length": 74, "num_contexts": 3, "step": 18680 }, { "grad_norm": 1.9459328651428223, "learning_rate": 0.000178484726855355, "loss": 0.3903, "step": 18690 }, { "gate_value": 0.29897111654281616, "icl_sequence_length": 84, "num_contexts": 3, "step": 18690 }, { "grad_norm": 2.7386348247528076, "learning_rate": 0.00017836296341470896, "loss": 0.3939, "step": 18700 }, { "gate_value": 0.29894396662712097, "icl_sequence_length": 94, "num_contexts": 3, "step": 18700 }, { "grad_norm": 5.737401485443115, "learning_rate": 0.00017824118058824481, "loss": 0.3728, "step": 18710 }, { "gate_value": 0.2992154657840729, "icl_sequence_length": 90, "num_contexts": 3, "step": 18710 }, { "grad_norm": 2.06498384475708, "learning_rate": 0.00017811937845920006, "loss": 0.3961, "step": 18720 }, { "gate_value": 0.2994868755340576, "icl_sequence_length": 86, "num_contexts": 3, "step": 18720 }, { "grad_norm": 1.8841884136199951, "learning_rate": 0.0001779975571108253, "loss": 0.3934, "step": 18730 }, { "gate_value": 0.2994190454483032, "icl_sequence_length": 78, "num_contexts": 3, "step": 18730 }, { "grad_norm": 9.062392234802246, "learning_rate": 0.00017787571662638418, "loss": 0.3905, "step": 18740 }, { "gate_value": 0.29949483275413513, "icl_sequence_length": 76, "num_contexts": 3, "step": 18740 }, { "grad_norm": 13.06369400024414, "learning_rate": 0.00017775385708915367, "loss": 0.3656, "step": 18750 }, { "gate_value": 0.2998977303504944, "icl_sequence_length": 82, "num_contexts": 3, "step": 18750 }, { "grad_norm": 0.7034562230110168, "learning_rate": 0.00017763197858242352, "loss": 0.3782, "step": 18760 }, { "gate_value": 0.3001224398612976, "icl_sequence_length": 90, "num_contexts": 3, "step": 18760 }, { "grad_norm": 5.080331802368164, "learning_rate": 0.00017751008118949653, "loss": 0.3833, "step": 18770 }, { "gate_value": 0.30042609572410583, "icl_sequence_length": 66, "num_contexts": 3, "step": 18770 }, { "grad_norm": 0.9425305724143982, "learning_rate": 0.00017738816499368853, "loss": 0.3898, "step": 18780 }, { "gate_value": 0.3005761206150055, "icl_sequence_length": 58, "num_contexts": 3, "step": 18780 }, { "grad_norm": 1.4428623914718628, "learning_rate": 0.00017726623007832795, "loss": 0.3729, "step": 18790 }, { "gate_value": 0.3003407418727875, "icl_sequence_length": 66, "num_contexts": 3, "step": 18790 }, { "grad_norm": 1.1261143684387207, "learning_rate": 0.00017714427652675626, "loss": 0.377, "step": 18800 }, { "gate_value": 0.3002099096775055, "icl_sequence_length": 82, "num_contexts": 3, "step": 18800 }, { "grad_norm": 3.563581705093384, "learning_rate": 0.00017702230442232747, "loss": 0.3857, "step": 18810 }, { "gate_value": 0.30062517523765564, "icl_sequence_length": 88, "num_contexts": 3, "step": 18810 }, { "grad_norm": 1.9583052396774292, "learning_rate": 0.0001769003138484084, "loss": 0.3667, "step": 18820 }, { "gate_value": 0.30108368396759033, "icl_sequence_length": 76, "num_contexts": 3, "step": 18820 }, { "grad_norm": 7.607995510101318, "learning_rate": 0.00017677830488837854, "loss": 0.3907, "step": 18830 }, { "gate_value": 0.3014366328716278, "icl_sequence_length": 78, "num_contexts": 3, "step": 18830 }, { "grad_norm": 2.672945261001587, "learning_rate": 0.00017665627762562973, "loss": 0.375, "step": 18840 }, { "gate_value": 0.3011823296546936, "icl_sequence_length": 80, "num_contexts": 3, "step": 18840 }, { "grad_norm": 2.078075408935547, "learning_rate": 0.00017653423214356655, "loss": 0.3736, "step": 18850 }, { "gate_value": 0.30117878317832947, "icl_sequence_length": 78, "num_contexts": 3, "step": 18850 }, { "grad_norm": 2.0478668212890625, "learning_rate": 0.00017641216852560594, "loss": 0.3922, "step": 18860 }, { "gate_value": 0.3012368977069855, "icl_sequence_length": 82, "num_contexts": 3, "step": 18860 }, { "grad_norm": 3.1022074222564697, "learning_rate": 0.0001762900868551771, "loss": 0.3634, "step": 18870 }, { "gate_value": 0.30124127864837646, "icl_sequence_length": 82, "num_contexts": 3, "step": 18870 }, { "grad_norm": 2.3055784702301025, "learning_rate": 0.00017616798721572185, "loss": 0.3762, "step": 18880 }, { "gate_value": 0.30172550678253174, "icl_sequence_length": 70, "num_contexts": 3, "step": 18880 }, { "grad_norm": 1.4114841222763062, "learning_rate": 0.00017604586969069408, "loss": 0.3907, "step": 18890 }, { "gate_value": 0.3020336627960205, "icl_sequence_length": 86, "num_contexts": 3, "step": 18890 }, { "grad_norm": 1.7120617628097534, "learning_rate": 0.00017592373436355998, "loss": 0.3888, "step": 18900 }, { "gate_value": 0.30191630125045776, "icl_sequence_length": 84, "num_contexts": 3, "step": 18900 }, { "grad_norm": 2.659731388092041, "learning_rate": 0.00017580158131779791, "loss": 0.3621, "step": 18910 }, { "gate_value": 0.30202534794807434, "icl_sequence_length": 76, "num_contexts": 3, "step": 18910 }, { "grad_norm": 2.554063320159912, "learning_rate": 0.00017567941063689827, "loss": 0.3951, "step": 18920 }, { "gate_value": 0.30230647325515747, "icl_sequence_length": 80, "num_contexts": 3, "step": 18920 }, { "grad_norm": 1.8052372932434082, "learning_rate": 0.0001755572224043636, "loss": 0.3651, "step": 18930 }, { "gate_value": 0.3022986054420471, "icl_sequence_length": 72, "num_contexts": 3, "step": 18930 }, { "grad_norm": 1.0668402910232544, "learning_rate": 0.0001754350167037084, "loss": 0.3835, "step": 18940 }, { "gate_value": 0.30235445499420166, "icl_sequence_length": 80, "num_contexts": 3, "step": 18940 }, { "grad_norm": 1.3296093940734863, "learning_rate": 0.0001753127936184592, "loss": 0.376, "step": 18950 }, { "gate_value": 0.30267301201820374, "icl_sequence_length": 72, "num_contexts": 3, "step": 18950 }, { "grad_norm": 1.3229838609695435, "learning_rate": 0.00017519055323215416, "loss": 0.3851, "step": 18960 }, { "gate_value": 0.3031296730041504, "icl_sequence_length": 76, "num_contexts": 3, "step": 18960 }, { "grad_norm": 4.205599308013916, "learning_rate": 0.0001750682956283435, "loss": 0.3895, "step": 18970 }, { "gate_value": 0.3030919134616852, "icl_sequence_length": 92, "num_contexts": 3, "step": 18970 }, { "grad_norm": 1.8597735166549683, "learning_rate": 0.00017494602089058924, "loss": 0.377, "step": 18980 }, { "gate_value": 0.3035913407802582, "icl_sequence_length": 84, "num_contexts": 3, "step": 18980 }, { "grad_norm": 27.68014907836914, "learning_rate": 0.00017482372910246487, "loss": 0.3806, "step": 18990 }, { "gate_value": 0.30397024750709534, "icl_sequence_length": 74, "num_contexts": 3, "step": 18990 }, { "grad_norm": 1.0020619630813599, "learning_rate": 0.0001747014203475558, "loss": 0.3818, "step": 19000 }, { "gate_value": 0.3041447699069977, "icl_sequence_length": 94, "num_contexts": 3, "step": 19000 }, { "grad_norm": 1.5626013278961182, "learning_rate": 0.00017457909470945876, "loss": 0.3674, "step": 19010 }, { "gate_value": 0.3043736517429352, "icl_sequence_length": 80, "num_contexts": 3, "step": 19010 }, { "grad_norm": 2.424412965774536, "learning_rate": 0.00017445675227178227, "loss": 0.373, "step": 19020 }, { "gate_value": 0.30459895730018616, "icl_sequence_length": 88, "num_contexts": 3, "step": 19020 }, { "grad_norm": 1.1943349838256836, "learning_rate": 0.00017433439311814627, "loss": 0.3713, "step": 19030 }, { "gate_value": 0.30527937412261963, "icl_sequence_length": 94, "num_contexts": 3, "step": 19030 }, { "grad_norm": 1.907333254814148, "learning_rate": 0.00017421201733218195, "loss": 0.3864, "step": 19040 }, { "gate_value": 0.305605947971344, "icl_sequence_length": 94, "num_contexts": 3, "step": 19040 }, { "grad_norm": 5.313255310058594, "learning_rate": 0.00017408962499753218, "loss": 0.3802, "step": 19050 }, { "gate_value": 0.3056756556034088, "icl_sequence_length": 80, "num_contexts": 3, "step": 19050 }, { "grad_norm": 0.7734915018081665, "learning_rate": 0.00017396721619785085, "loss": 0.3919, "step": 19060 }, { "gate_value": 0.305307537317276, "icl_sequence_length": 90, "num_contexts": 3, "step": 19060 }, { "grad_norm": 1.0982208251953125, "learning_rate": 0.00017384479101680318, "loss": 0.3778, "step": 19070 }, { "gate_value": 0.30533578991889954, "icl_sequence_length": 88, "num_contexts": 3, "step": 19070 }, { "grad_norm": 1.1880580186843872, "learning_rate": 0.00017372234953806577, "loss": 0.3613, "step": 19080 }, { "gate_value": 0.30517280101776123, "icl_sequence_length": 70, "num_contexts": 3, "step": 19080 }, { "grad_norm": 3.1580140590667725, "learning_rate": 0.00017359989184532614, "loss": 0.3745, "step": 19090 }, { "gate_value": 0.3055513799190521, "icl_sequence_length": 66, "num_contexts": 3, "step": 19090 }, { "grad_norm": 0.8944216370582581, "learning_rate": 0.00017347741802228292, "loss": 0.3741, "step": 19100 }, { "gate_value": 0.3054380714893341, "icl_sequence_length": 70, "num_contexts": 3, "step": 19100 }, { "grad_norm": 3.154456377029419, "learning_rate": 0.00017335492815264588, "loss": 0.3932, "step": 19110 }, { "gate_value": 0.30571210384368896, "icl_sequence_length": 88, "num_contexts": 3, "step": 19110 }, { "grad_norm": 1.052626132965088, "learning_rate": 0.00017323242232013562, "loss": 0.3689, "step": 19120 }, { "gate_value": 0.3059786856174469, "icl_sequence_length": 66, "num_contexts": 3, "step": 19120 }, { "grad_norm": 2.3785758018493652, "learning_rate": 0.00017310990060848385, "loss": 0.3836, "step": 19130 }, { "gate_value": 0.30624398589134216, "icl_sequence_length": 72, "num_contexts": 3, "step": 19130 }, { "grad_norm": 1.4953151941299438, "learning_rate": 0.00017298736310143292, "loss": 0.3721, "step": 19140 }, { "gate_value": 0.3064724802970886, "icl_sequence_length": 88, "num_contexts": 3, "step": 19140 }, { "grad_norm": 0.9571640491485596, "learning_rate": 0.00017286480988273607, "loss": 0.378, "step": 19150 }, { "gate_value": 0.3065114915370941, "icl_sequence_length": 68, "num_contexts": 3, "step": 19150 }, { "grad_norm": 2.1345882415771484, "learning_rate": 0.00017274224103615721, "loss": 0.3851, "step": 19160 }, { "gate_value": 0.30675628781318665, "icl_sequence_length": 88, "num_contexts": 3, "step": 19160 }, { "grad_norm": 5.007328033447266, "learning_rate": 0.0001726196566454711, "loss": 0.3687, "step": 19170 }, { "gate_value": 0.3067236840724945, "icl_sequence_length": 74, "num_contexts": 3, "step": 19170 }, { "grad_norm": 2.5974600315093994, "learning_rate": 0.00017249705679446296, "loss": 0.362, "step": 19180 }, { "gate_value": 0.3066827654838562, "icl_sequence_length": 72, "num_contexts": 3, "step": 19180 }, { "grad_norm": 1.0741844177246094, "learning_rate": 0.0001723744415669286, "loss": 0.3822, "step": 19190 }, { "gate_value": 0.3067505657672882, "icl_sequence_length": 74, "num_contexts": 3, "step": 19190 }, { "grad_norm": 0.9603002667427063, "learning_rate": 0.00017225181104667446, "loss": 0.3975, "step": 19200 }, { "gate_value": 0.30703407526016235, "icl_sequence_length": 70, "num_contexts": 3, "step": 19200 }, { "grad_norm": 8.882355690002441, "learning_rate": 0.00017212916531751725, "loss": 0.3815, "step": 19210 }, { "gate_value": 0.30765148997306824, "icl_sequence_length": 74, "num_contexts": 3, "step": 19210 }, { "grad_norm": 1.7542152404785156, "learning_rate": 0.00017200650446328418, "loss": 0.3865, "step": 19220 }, { "gate_value": 0.3078744113445282, "icl_sequence_length": 82, "num_contexts": 3, "step": 19220 }, { "grad_norm": 0.6525892019271851, "learning_rate": 0.00017188382856781292, "loss": 0.3764, "step": 19230 }, { "gate_value": 0.3077229857444763, "icl_sequence_length": 86, "num_contexts": 3, "step": 19230 }, { "grad_norm": 10.071258544921875, "learning_rate": 0.0001717611377149511, "loss": 0.3827, "step": 19240 }, { "gate_value": 0.3076436221599579, "icl_sequence_length": 72, "num_contexts": 3, "step": 19240 }, { "grad_norm": 0.9834297895431519, "learning_rate": 0.00017163843198855685, "loss": 0.3681, "step": 19250 }, { "gate_value": 0.3078278601169586, "icl_sequence_length": 82, "num_contexts": 3, "step": 19250 }, { "grad_norm": 1.0656040906906128, "learning_rate": 0.00017151571147249844, "loss": 0.3908, "step": 19260 }, { "gate_value": 0.3073131740093231, "icl_sequence_length": 88, "num_contexts": 3, "step": 19260 }, { "grad_norm": 1.851256012916565, "learning_rate": 0.00017139297625065402, "loss": 0.3864, "step": 19270 }, { "gate_value": 0.3072352111339569, "icl_sequence_length": 80, "num_contexts": 3, "step": 19270 }, { "grad_norm": 2.0325307846069336, "learning_rate": 0.00017127022640691218, "loss": 0.4071, "step": 19280 }, { "gate_value": 0.30702707171440125, "icl_sequence_length": 80, "num_contexts": 3, "step": 19280 }, { "grad_norm": 11.795533180236816, "learning_rate": 0.0001711474620251711, "loss": 0.3894, "step": 19290 }, { "gate_value": 0.3068554997444153, "icl_sequence_length": 82, "num_contexts": 3, "step": 19290 }, { "grad_norm": 2.296201467514038, "learning_rate": 0.0001710246831893391, "loss": 0.3815, "step": 19300 }, { "gate_value": 0.3070986568927765, "icl_sequence_length": 74, "num_contexts": 3, "step": 19300 }, { "grad_norm": 3.2662224769592285, "learning_rate": 0.00017090188998333442, "loss": 0.3754, "step": 19310 }, { "gate_value": 0.30730971693992615, "icl_sequence_length": 72, "num_contexts": 3, "step": 19310 }, { "grad_norm": 1.9833241701126099, "learning_rate": 0.000170779082491085, "loss": 0.3747, "step": 19320 }, { "gate_value": 0.3075919449329376, "icl_sequence_length": 74, "num_contexts": 3, "step": 19320 }, { "grad_norm": 3.4221928119659424, "learning_rate": 0.00017065626079652873, "loss": 0.3738, "step": 19330 }, { "gate_value": 0.30789434909820557, "icl_sequence_length": 68, "num_contexts": 3, "step": 19330 }, { "grad_norm": 2.5420238971710205, "learning_rate": 0.00017053342498361286, "loss": 0.379, "step": 19340 }, { "gate_value": 0.3079376518726349, "icl_sequence_length": 58, "num_contexts": 3, "step": 19340 }, { "grad_norm": 28.063114166259766, "learning_rate": 0.00017041057513629467, "loss": 0.3751, "step": 19350 }, { "gate_value": 0.30805227160453796, "icl_sequence_length": 82, "num_contexts": 3, "step": 19350 }, { "grad_norm": 1.6978594064712524, "learning_rate": 0.00017028771133854086, "loss": 0.4035, "step": 19360 }, { "gate_value": 0.3074844181537628, "icl_sequence_length": 90, "num_contexts": 3, "step": 19360 }, { "grad_norm": 0.8512712121009827, "learning_rate": 0.00017016483367432767, "loss": 0.378, "step": 19370 }, { "gate_value": 0.3074425458908081, "icl_sequence_length": 74, "num_contexts": 3, "step": 19370 }, { "grad_norm": 1.114200472831726, "learning_rate": 0.00017004194222764075, "loss": 0.3955, "step": 19380 }, { "gate_value": 0.307817280292511, "icl_sequence_length": 62, "num_contexts": 3, "step": 19380 }, { "grad_norm": 1.6457468271255493, "learning_rate": 0.00016991903708247534, "loss": 0.3736, "step": 19390 }, { "gate_value": 0.3081192076206207, "icl_sequence_length": 86, "num_contexts": 3, "step": 19390 }, { "grad_norm": 1.3923431634902954, "learning_rate": 0.00016979611832283588, "loss": 0.3708, "step": 19400 }, { "gate_value": 0.3083207905292511, "icl_sequence_length": 86, "num_contexts": 3, "step": 19400 }, { "grad_norm": 0.729960560798645, "learning_rate": 0.00016967318603273624, "loss": 0.3699, "step": 19410 }, { "gate_value": 0.3086252808570862, "icl_sequence_length": 80, "num_contexts": 3, "step": 19410 }, { "grad_norm": 2.3151090145111084, "learning_rate": 0.00016955024029619944, "loss": 0.3798, "step": 19420 }, { "gate_value": 0.30867984890937805, "icl_sequence_length": 84, "num_contexts": 3, "step": 19420 }, { "grad_norm": 2.409742593765259, "learning_rate": 0.00016942728119725777, "loss": 0.3731, "step": 19430 }, { "gate_value": 0.30868929624557495, "icl_sequence_length": 80, "num_contexts": 3, "step": 19430 }, { "grad_norm": 9.685903549194336, "learning_rate": 0.00016930430881995255, "loss": 0.38, "step": 19440 }, { "gate_value": 0.3084772229194641, "icl_sequence_length": 76, "num_contexts": 3, "step": 19440 }, { "grad_norm": 3.204035758972168, "learning_rate": 0.0001691813232483343, "loss": 0.3808, "step": 19450 }, { "gate_value": 0.30828583240509033, "icl_sequence_length": 78, "num_contexts": 3, "step": 19450 }, { "grad_norm": 1.6053801774978638, "learning_rate": 0.0001690583245664625, "loss": 0.391, "step": 19460 }, { "gate_value": 0.30858343839645386, "icl_sequence_length": 78, "num_contexts": 3, "step": 19460 }, { "grad_norm": 0.9320996999740601, "learning_rate": 0.00016893531285840555, "loss": 0.3744, "step": 19470 }, { "gate_value": 0.308741956949234, "icl_sequence_length": 76, "num_contexts": 3, "step": 19470 }, { "grad_norm": 1.2532542943954468, "learning_rate": 0.0001688122882082408, "loss": 0.3756, "step": 19480 }, { "gate_value": 0.3092672824859619, "icl_sequence_length": 90, "num_contexts": 3, "step": 19480 }, { "grad_norm": 1.312092661857605, "learning_rate": 0.00016868925070005444, "loss": 0.3833, "step": 19490 }, { "gate_value": 0.30962440371513367, "icl_sequence_length": 62, "num_contexts": 3, "step": 19490 }, { "grad_norm": 1.3440266847610474, "learning_rate": 0.00016856620041794145, "loss": 0.3685, "step": 19500 }, { "gate_value": 0.309724360704422, "icl_sequence_length": 90, "num_contexts": 3, "step": 19500 }, { "grad_norm": 1.1109507083892822, "learning_rate": 0.0001684431374460056, "loss": 0.3872, "step": 19510 }, { "gate_value": 0.3096853196620941, "icl_sequence_length": 62, "num_contexts": 3, "step": 19510 }, { "grad_norm": 2.074230670928955, "learning_rate": 0.00016832006186835916, "loss": 0.3962, "step": 19520 }, { "gate_value": 0.30953168869018555, "icl_sequence_length": 74, "num_contexts": 3, "step": 19520 }, { "grad_norm": 1.1803638935089111, "learning_rate": 0.0001681969737691232, "loss": 0.3739, "step": 19530 }, { "gate_value": 0.30993011593818665, "icl_sequence_length": 94, "num_contexts": 3, "step": 19530 }, { "grad_norm": 11.847143173217773, "learning_rate": 0.00016807387323242726, "loss": 0.3726, "step": 19540 }, { "gate_value": 0.3103095293045044, "icl_sequence_length": 80, "num_contexts": 3, "step": 19540 }, { "grad_norm": 0.9531516432762146, "learning_rate": 0.00016795076034240938, "loss": 0.3822, "step": 19550 }, { "gate_value": 0.3104497492313385, "icl_sequence_length": 88, "num_contexts": 3, "step": 19550 }, { "grad_norm": 1.2661352157592773, "learning_rate": 0.00016782763518321611, "loss": 0.387, "step": 19560 }, { "gate_value": 0.31011050939559937, "icl_sequence_length": 84, "num_contexts": 3, "step": 19560 }, { "grad_norm": 1.1231858730316162, "learning_rate": 0.00016770449783900225, "loss": 0.3876, "step": 19570 }, { "gate_value": 0.3103155493736267, "icl_sequence_length": 78, "num_contexts": 3, "step": 19570 }, { "grad_norm": 1.1371983289718628, "learning_rate": 0.0001675813483939311, "loss": 0.395, "step": 19580 }, { "gate_value": 0.3104420304298401, "icl_sequence_length": 76, "num_contexts": 3, "step": 19580 }, { "grad_norm": 1.1629470586776733, "learning_rate": 0.00016745818693217405, "loss": 0.3586, "step": 19590 }, { "gate_value": 0.31009596586227417, "icl_sequence_length": 80, "num_contexts": 3, "step": 19590 }, { "grad_norm": 2.172532558441162, "learning_rate": 0.0001673350135379109, "loss": 0.3732, "step": 19600 }, { "gate_value": 0.3102813959121704, "icl_sequence_length": 86, "num_contexts": 3, "step": 19600 }, { "grad_norm": 3.175884246826172, "learning_rate": 0.00016721182829532944, "loss": 0.3675, "step": 19610 }, { "gate_value": 0.3106948733329773, "icl_sequence_length": 70, "num_contexts": 3, "step": 19610 }, { "grad_norm": 1.5540252923965454, "learning_rate": 0.00016708863128862562, "loss": 0.3866, "step": 19620 }, { "gate_value": 0.3106013238430023, "icl_sequence_length": 76, "num_contexts": 3, "step": 19620 }, { "grad_norm": 1.5975329875946045, "learning_rate": 0.0001669654226020035, "loss": 0.3736, "step": 19630 }, { "gate_value": 0.3107665479183197, "icl_sequence_length": 68, "num_contexts": 3, "step": 19630 }, { "grad_norm": 0.9779009222984314, "learning_rate": 0.00016684220231967496, "loss": 0.3776, "step": 19640 }, { "gate_value": 0.3108340799808502, "icl_sequence_length": 90, "num_contexts": 3, "step": 19640 }, { "grad_norm": 1.3642061948776245, "learning_rate": 0.00016671897052585998, "loss": 0.3755, "step": 19650 }, { "gate_value": 0.3109137713909149, "icl_sequence_length": 80, "num_contexts": 3, "step": 19650 }, { "grad_norm": 1.699450135231018, "learning_rate": 0.0001665957273047863, "loss": 0.3786, "step": 19660 }, { "gate_value": 0.3109440505504608, "icl_sequence_length": 86, "num_contexts": 3, "step": 19660 }, { "grad_norm": 1.0772122144699097, "learning_rate": 0.00016647247274068945, "loss": 0.3843, "step": 19670 }, { "gate_value": 0.3111564517021179, "icl_sequence_length": 70, "num_contexts": 3, "step": 19670 }, { "grad_norm": 0.9683305621147156, "learning_rate": 0.00016634920691781282, "loss": 0.3721, "step": 19680 }, { "gate_value": 0.311585932970047, "icl_sequence_length": 82, "num_contexts": 3, "step": 19680 }, { "grad_norm": 2.59132719039917, "learning_rate": 0.00016622592992040743, "loss": 0.3803, "step": 19690 }, { "gate_value": 0.31226620078086853, "icl_sequence_length": 90, "num_contexts": 3, "step": 19690 }, { "grad_norm": 3.2184154987335205, "learning_rate": 0.00016610264183273196, "loss": 0.378, "step": 19700 }, { "gate_value": 0.3124104142189026, "icl_sequence_length": 82, "num_contexts": 3, "step": 19700 }, { "grad_norm": 1.0623730421066284, "learning_rate": 0.00016597934273905262, "loss": 0.3832, "step": 19710 }, { "gate_value": 0.3129165470600128, "icl_sequence_length": 76, "num_contexts": 3, "step": 19710 }, { "grad_norm": 2.3293986320495605, "learning_rate": 0.00016585603272364322, "loss": 0.3877, "step": 19720 }, { "gate_value": 0.31323570013046265, "icl_sequence_length": 74, "num_contexts": 3, "step": 19720 }, { "grad_norm": 7.679978370666504, "learning_rate": 0.00016573271187078493, "loss": 0.3829, "step": 19730 }, { "gate_value": 0.31327542662620544, "icl_sequence_length": 76, "num_contexts": 3, "step": 19730 }, { "grad_norm": 1.5804600715637207, "learning_rate": 0.00016560938026476647, "loss": 0.3736, "step": 19740 }, { "gate_value": 0.3131873309612274, "icl_sequence_length": 86, "num_contexts": 3, "step": 19740 }, { "grad_norm": 1.0514229536056519, "learning_rate": 0.00016548603798988373, "loss": 0.384, "step": 19750 }, { "gate_value": 0.31321951746940613, "icl_sequence_length": 94, "num_contexts": 3, "step": 19750 }, { "grad_norm": 47.53086471557617, "learning_rate": 0.0001653626851304401, "loss": 0.3661, "step": 19760 }, { "gate_value": 0.313575804233551, "icl_sequence_length": 88, "num_contexts": 3, "step": 19760 }, { "grad_norm": 2.0501813888549805, "learning_rate": 0.00016523932177074597, "loss": 0.3891, "step": 19770 }, { "gate_value": 0.31374025344848633, "icl_sequence_length": 60, "num_contexts": 3, "step": 19770 }, { "grad_norm": 11.146804809570312, "learning_rate": 0.0001651159479951192, "loss": 0.387, "step": 19780 }, { "gate_value": 0.31384482979774475, "icl_sequence_length": 78, "num_contexts": 3, "step": 19780 }, { "grad_norm": 1.0424987077713013, "learning_rate": 0.00016499256388788447, "loss": 0.396, "step": 19790 }, { "gate_value": 0.31350716948509216, "icl_sequence_length": 90, "num_contexts": 3, "step": 19790 }, { "grad_norm": 3.2343297004699707, "learning_rate": 0.0001648691695333737, "loss": 0.3627, "step": 19800 }, { "gate_value": 0.31341326236724854, "icl_sequence_length": 92, "num_contexts": 3, "step": 19800 }, { "grad_norm": 1.6307929754257202, "learning_rate": 0.00016474576501592574, "loss": 0.3692, "step": 19810 }, { "gate_value": 0.31364917755126953, "icl_sequence_length": 70, "num_contexts": 3, "step": 19810 }, { "grad_norm": 0.7756251096725464, "learning_rate": 0.00016462235041988642, "loss": 0.3693, "step": 19820 }, { "gate_value": 0.3137316107749939, "icl_sequence_length": 88, "num_contexts": 3, "step": 19820 }, { "grad_norm": 0.9996859431266785, "learning_rate": 0.00016449892582960852, "loss": 0.3981, "step": 19830 }, { "gate_value": 0.31357425451278687, "icl_sequence_length": 84, "num_contexts": 3, "step": 19830 }, { "grad_norm": 1.6043193340301514, "learning_rate": 0.00016437549132945151, "loss": 0.3682, "step": 19840 }, { "gate_value": 0.3138103783130646, "icl_sequence_length": 72, "num_contexts": 3, "step": 19840 }, { "grad_norm": 1.084348201751709, "learning_rate": 0.00016425204700378174, "loss": 0.3857, "step": 19850 }, { "gate_value": 0.3141813576221466, "icl_sequence_length": 78, "num_contexts": 3, "step": 19850 }, { "grad_norm": 7.152972221374512, "learning_rate": 0.00016412859293697224, "loss": 0.3833, "step": 19860 }, { "gate_value": 0.31427866220474243, "icl_sequence_length": 72, "num_contexts": 3, "step": 19860 }, { "grad_norm": 0.9384665489196777, "learning_rate": 0.00016400512921340265, "loss": 0.3848, "step": 19870 }, { "gate_value": 0.3139939606189728, "icl_sequence_length": 78, "num_contexts": 3, "step": 19870 }, { "grad_norm": 0.4689892530441284, "learning_rate": 0.00016388165591745934, "loss": 0.3623, "step": 19880 }, { "gate_value": 0.31366410851478577, "icl_sequence_length": 86, "num_contexts": 3, "step": 19880 }, { "grad_norm": 1.7431423664093018, "learning_rate": 0.0001637581731335351, "loss": 0.3971, "step": 19890 }, { "gate_value": 0.3138532340526581, "icl_sequence_length": 88, "num_contexts": 3, "step": 19890 }, { "grad_norm": 0.8681080341339111, "learning_rate": 0.00016363468094602923, "loss": 0.3749, "step": 19900 }, { "gate_value": 0.3135307729244232, "icl_sequence_length": 78, "num_contexts": 3, "step": 19900 }, { "grad_norm": 0.8464221358299255, "learning_rate": 0.00016351117943934755, "loss": 0.3772, "step": 19910 }, { "gate_value": 0.313311904668808, "icl_sequence_length": 82, "num_contexts": 3, "step": 19910 }, { "grad_norm": 5.371399879455566, "learning_rate": 0.00016338766869790206, "loss": 0.3933, "step": 19920 }, { "gate_value": 0.3136154115200043, "icl_sequence_length": 80, "num_contexts": 3, "step": 19920 }, { "grad_norm": 1.4953688383102417, "learning_rate": 0.00016326414880611133, "loss": 0.3858, "step": 19930 }, { "gate_value": 0.3138081133365631, "icl_sequence_length": 80, "num_contexts": 3, "step": 19930 }, { "grad_norm": 1.1205352544784546, "learning_rate": 0.00016314061984839992, "loss": 0.3751, "step": 19940 }, { "gate_value": 0.3141990602016449, "icl_sequence_length": 88, "num_contexts": 3, "step": 19940 }, { "grad_norm": 4.253011703491211, "learning_rate": 0.00016301708190919872, "loss": 0.3851, "step": 19950 }, { "gate_value": 0.3143591582775116, "icl_sequence_length": 84, "num_contexts": 3, "step": 19950 }, { "grad_norm": 5.624688625335693, "learning_rate": 0.00016289353507294483, "loss": 0.3721, "step": 19960 }, { "gate_value": 0.31418493390083313, "icl_sequence_length": 86, "num_contexts": 3, "step": 19960 }, { "grad_norm": 3.7468695640563965, "learning_rate": 0.00016276997942408128, "loss": 0.3649, "step": 19970 }, { "gate_value": 0.3142998516559601, "icl_sequence_length": 94, "num_contexts": 3, "step": 19970 }, { "grad_norm": 0.7305354475975037, "learning_rate": 0.00016264641504705723, "loss": 0.3788, "step": 19980 }, { "gate_value": 0.31467312574386597, "icl_sequence_length": 68, "num_contexts": 3, "step": 19980 }, { "grad_norm": 2.8758230209350586, "learning_rate": 0.00016252284202632772, "loss": 0.3707, "step": 19990 }, { "gate_value": 0.31497639417648315, "icl_sequence_length": 74, "num_contexts": 3, "step": 19990 }, { "grad_norm": 5.836865425109863, "learning_rate": 0.00016239926044635378, "loss": 0.3755, "step": 20000 }, { "gate_value": 0.3148883581161499, "icl_sequence_length": 90, "num_contexts": 3, "step": 20000 }, { "grad_norm": 1.1395916938781738, "learning_rate": 0.00016227567039160223, "loss": 0.3766, "step": 20010 }, { "gate_value": 0.31475260853767395, "icl_sequence_length": 72, "num_contexts": 3, "step": 20010 }, { "grad_norm": 1.6680474281311035, "learning_rate": 0.00016215207194654571, "loss": 0.393, "step": 20020 }, { "gate_value": 0.314287006855011, "icl_sequence_length": 88, "num_contexts": 3, "step": 20020 }, { "grad_norm": 45.070587158203125, "learning_rate": 0.0001620284651956626, "loss": 0.3865, "step": 20030 }, { "gate_value": 0.31412649154663086, "icl_sequence_length": 80, "num_contexts": 3, "step": 20030 }, { "grad_norm": 1.6862635612487793, "learning_rate": 0.000161904850223437, "loss": 0.3744, "step": 20040 }, { "gate_value": 0.3144453465938568, "icl_sequence_length": 82, "num_contexts": 3, "step": 20040 }, { "grad_norm": 1.0638667345046997, "learning_rate": 0.0001617812271143585, "loss": 0.3745, "step": 20050 }, { "gate_value": 0.314824640750885, "icl_sequence_length": 80, "num_contexts": 3, "step": 20050 }, { "grad_norm": 2.623652219772339, "learning_rate": 0.00016165759595292232, "loss": 0.3882, "step": 20060 }, { "gate_value": 0.3149595856666565, "icl_sequence_length": 94, "num_contexts": 3, "step": 20060 }, { "grad_norm": 1.5121970176696777, "learning_rate": 0.0001615339568236293, "loss": 0.3986, "step": 20070 }, { "gate_value": 0.31516796350479126, "icl_sequence_length": 82, "num_contexts": 3, "step": 20070 }, { "grad_norm": 79.21330261230469, "learning_rate": 0.0001614103098109855, "loss": 0.3744, "step": 20080 }, { "gate_value": 0.3159092962741852, "icl_sequence_length": 88, "num_contexts": 3, "step": 20080 }, { "grad_norm": 1.1513302326202393, "learning_rate": 0.00016128665499950254, "loss": 0.3859, "step": 20090 }, { "gate_value": 0.31618189811706543, "icl_sequence_length": 88, "num_contexts": 3, "step": 20090 }, { "grad_norm": 3.856543779373169, "learning_rate": 0.0001611629924736973, "loss": 0.3594, "step": 20100 }, { "gate_value": 0.31636902689933777, "icl_sequence_length": 68, "num_contexts": 3, "step": 20100 }, { "grad_norm": 1.5914669036865234, "learning_rate": 0.000161039322318092, "loss": 0.3943, "step": 20110 }, { "gate_value": 0.31623247265815735, "icl_sequence_length": 94, "num_contexts": 3, "step": 20110 }, { "grad_norm": 0.7739580869674683, "learning_rate": 0.000160915644617214, "loss": 0.3857, "step": 20120 }, { "gate_value": 0.3166380524635315, "icl_sequence_length": 84, "num_contexts": 3, "step": 20120 }, { "grad_norm": 2.524143695831299, "learning_rate": 0.0001607919594555958, "loss": 0.366, "step": 20130 }, { "gate_value": 0.3169264495372772, "icl_sequence_length": 76, "num_contexts": 3, "step": 20130 }, { "grad_norm": 4.026458263397217, "learning_rate": 0.0001606682669177751, "loss": 0.369, "step": 20140 }, { "gate_value": 0.31718090176582336, "icl_sequence_length": 72, "num_contexts": 3, "step": 20140 }, { "grad_norm": 0.697846531867981, "learning_rate": 0.0001605445670882945, "loss": 0.3765, "step": 20150 }, { "gate_value": 0.31738293170928955, "icl_sequence_length": 82, "num_contexts": 3, "step": 20150 }, { "grad_norm": 0.518775463104248, "learning_rate": 0.0001604208600517018, "loss": 0.3669, "step": 20160 }, { "gate_value": 0.3173922598361969, "icl_sequence_length": 78, "num_contexts": 3, "step": 20160 }, { "grad_norm": 3.240859270095825, "learning_rate": 0.0001602971458925495, "loss": 0.3905, "step": 20170 }, { "gate_value": 0.3173951208591461, "icl_sequence_length": 80, "num_contexts": 3, "step": 20170 }, { "grad_norm": 1.02534019947052, "learning_rate": 0.00016017342469539503, "loss": 0.361, "step": 20180 }, { "gate_value": 0.31715652346611023, "icl_sequence_length": 88, "num_contexts": 3, "step": 20180 }, { "grad_norm": 2.5602641105651855, "learning_rate": 0.00016004969654480079, "loss": 0.3677, "step": 20190 }, { "gate_value": 0.3173889219760895, "icl_sequence_length": 96, "num_contexts": 3, "step": 20190 }, { "grad_norm": 1.1085784435272217, "learning_rate": 0.00015992596152533364, "loss": 0.3866, "step": 20200 }, { "gate_value": 0.31750160455703735, "icl_sequence_length": 92, "num_contexts": 3, "step": 20200 }, { "grad_norm": 2.646415948867798, "learning_rate": 0.00015980221972156542, "loss": 0.3617, "step": 20210 }, { "gate_value": 0.3176015019416809, "icl_sequence_length": 86, "num_contexts": 3, "step": 20210 }, { "grad_norm": 1.0765670537948608, "learning_rate": 0.00015967847121807247, "loss": 0.3898, "step": 20220 }, { "gate_value": 0.317767858505249, "icl_sequence_length": 72, "num_contexts": 3, "step": 20220 }, { "grad_norm": 1.4987940788269043, "learning_rate": 0.00015955471609943567, "loss": 0.3831, "step": 20230 }, { "gate_value": 0.3180540204048157, "icl_sequence_length": 84, "num_contexts": 3, "step": 20230 }, { "grad_norm": 6.206676959991455, "learning_rate": 0.00015943095445024056, "loss": 0.3755, "step": 20240 }, { "gate_value": 0.31847628951072693, "icl_sequence_length": 88, "num_contexts": 3, "step": 20240 }, { "grad_norm": 0.9935562610626221, "learning_rate": 0.00015930718635507696, "loss": 0.3719, "step": 20250 }, { "gate_value": 0.3188856542110443, "icl_sequence_length": 88, "num_contexts": 3, "step": 20250 }, { "grad_norm": 2.0614676475524902, "learning_rate": 0.00015918341189853928, "loss": 0.3862, "step": 20260 }, { "gate_value": 0.31922855973243713, "icl_sequence_length": 88, "num_contexts": 3, "step": 20260 }, { "grad_norm": 78.66289520263672, "learning_rate": 0.00015905963116522617, "loss": 0.3763, "step": 20270 }, { "gate_value": 0.31915542483329773, "icl_sequence_length": 80, "num_contexts": 3, "step": 20270 }, { "grad_norm": 0.7880850434303284, "learning_rate": 0.00015893584423974056, "loss": 0.3764, "step": 20280 }, { "gate_value": 0.31941351294517517, "icl_sequence_length": 76, "num_contexts": 3, "step": 20280 }, { "grad_norm": 2.9438118934631348, "learning_rate": 0.0001588120512066897, "loss": 0.395, "step": 20290 }, { "gate_value": 0.31935545802116394, "icl_sequence_length": 72, "num_contexts": 3, "step": 20290 }, { "grad_norm": 64.0041732788086, "learning_rate": 0.0001586882521506849, "loss": 0.3741, "step": 20300 }, { "gate_value": 0.31919434666633606, "icl_sequence_length": 58, "num_contexts": 3, "step": 20300 }, { "grad_norm": 1.1446757316589355, "learning_rate": 0.00015856444715634167, "loss": 0.3544, "step": 20310 }, { "gate_value": 0.31936153769493103, "icl_sequence_length": 82, "num_contexts": 3, "step": 20310 }, { "grad_norm": 1.5598700046539307, "learning_rate": 0.0001584406363082796, "loss": 0.3766, "step": 20320 }, { "gate_value": 0.3194507360458374, "icl_sequence_length": 92, "num_contexts": 3, "step": 20320 }, { "grad_norm": 0.8992844820022583, "learning_rate": 0.00015831681969112214, "loss": 0.3718, "step": 20330 }, { "gate_value": 0.3197779953479767, "icl_sequence_length": 82, "num_contexts": 3, "step": 20330 }, { "grad_norm": 4.334936618804932, "learning_rate": 0.00015819299738949695, "loss": 0.3809, "step": 20340 }, { "gate_value": 0.3198601007461548, "icl_sequence_length": 80, "num_contexts": 3, "step": 20340 }, { "grad_norm": 1.361315369606018, "learning_rate": 0.00015806916948803525, "loss": 0.3833, "step": 20350 }, { "gate_value": 0.319922536611557, "icl_sequence_length": 74, "num_contexts": 3, "step": 20350 }, { "grad_norm": 3.1220877170562744, "learning_rate": 0.00015794533607137228, "loss": 0.3722, "step": 20360 }, { "gate_value": 0.3200428783893585, "icl_sequence_length": 74, "num_contexts": 3, "step": 20360 }, { "grad_norm": 3.3610692024230957, "learning_rate": 0.00015782149722414702, "loss": 0.3697, "step": 20370 }, { "gate_value": 0.319830060005188, "icl_sequence_length": 84, "num_contexts": 3, "step": 20370 }, { "grad_norm": 5.070735931396484, "learning_rate": 0.00015769765303100215, "loss": 0.3562, "step": 20380 }, { "gate_value": 0.31996458768844604, "icl_sequence_length": 72, "num_contexts": 3, "step": 20380 }, { "grad_norm": 9.352119445800781, "learning_rate": 0.0001575738035765841, "loss": 0.3728, "step": 20390 }, { "gate_value": 0.31997665762901306, "icl_sequence_length": 68, "num_contexts": 3, "step": 20390 }, { "grad_norm": 1.9495102167129517, "learning_rate": 0.00015744994894554263, "loss": 0.3838, "step": 20400 }, { "gate_value": 0.3198350667953491, "icl_sequence_length": 88, "num_contexts": 3, "step": 20400 }, { "grad_norm": 2.5276057720184326, "learning_rate": 0.00015732608922253136, "loss": 0.3896, "step": 20410 }, { "gate_value": 0.31992819905281067, "icl_sequence_length": 74, "num_contexts": 3, "step": 20410 }, { "grad_norm": 6.810713768005371, "learning_rate": 0.00015720222449220716, "loss": 0.3698, "step": 20420 }, { "gate_value": 0.3202584683895111, "icl_sequence_length": 76, "num_contexts": 3, "step": 20420 }, { "grad_norm": 1.0463260412216187, "learning_rate": 0.00015707835483923043, "loss": 0.3519, "step": 20430 }, { "gate_value": 0.3203528821468353, "icl_sequence_length": 80, "num_contexts": 3, "step": 20430 }, { "grad_norm": 2.1792423725128174, "learning_rate": 0.00015695448034826494, "loss": 0.3746, "step": 20440 }, { "gate_value": 0.32040420174598694, "icl_sequence_length": 82, "num_contexts": 3, "step": 20440 }, { "grad_norm": 1.2219457626342773, "learning_rate": 0.00015683060110397768, "loss": 0.3657, "step": 20450 }, { "gate_value": 0.32081183791160583, "icl_sequence_length": 94, "num_contexts": 3, "step": 20450 }, { "grad_norm": 4.518383979797363, "learning_rate": 0.00015670671719103898, "loss": 0.3767, "step": 20460 }, { "gate_value": 0.321494460105896, "icl_sequence_length": 68, "num_contexts": 3, "step": 20460 }, { "grad_norm": 5.510248184204102, "learning_rate": 0.00015658282869412233, "loss": 0.3645, "step": 20470 }, { "gate_value": 0.32175546884536743, "icl_sequence_length": 74, "num_contexts": 3, "step": 20470 }, { "grad_norm": 2.041200876235962, "learning_rate": 0.00015645893569790428, "loss": 0.3933, "step": 20480 }, { "gate_value": 0.3220183849334717, "icl_sequence_length": 76, "num_contexts": 3, "step": 20480 }, { "grad_norm": 9.356095314025879, "learning_rate": 0.00015633503828706467, "loss": 0.3667, "step": 20490 }, { "gate_value": 0.3222240209579468, "icl_sequence_length": 92, "num_contexts": 3, "step": 20490 }, { "grad_norm": 52.88069152832031, "learning_rate": 0.00015621113654628612, "loss": 0.3883, "step": 20500 }, { "gate_value": 0.3223245441913605, "icl_sequence_length": 88, "num_contexts": 3, "step": 20500 }, { "grad_norm": 2.949965476989746, "learning_rate": 0.00015608723056025425, "loss": 0.3662, "step": 20510 }, { "gate_value": 0.3225291073322296, "icl_sequence_length": 94, "num_contexts": 3, "step": 20510 }, { "grad_norm": 31.80426597595215, "learning_rate": 0.00015596332041365775, "loss": 0.3581, "step": 20520 }, { "gate_value": 0.32278192043304443, "icl_sequence_length": 88, "num_contexts": 3, "step": 20520 }, { "grad_norm": 1.4811211824417114, "learning_rate": 0.00015583940619118793, "loss": 0.3805, "step": 20530 }, { "gate_value": 0.3228819966316223, "icl_sequence_length": 82, "num_contexts": 3, "step": 20530 }, { "grad_norm": 8.001051902770996, "learning_rate": 0.00015571548797753906, "loss": 0.3505, "step": 20540 }, { "gate_value": 0.32308506965637207, "icl_sequence_length": 88, "num_contexts": 3, "step": 20540 }, { "grad_norm": 2.4301977157592773, "learning_rate": 0.00015559156585740808, "loss": 0.3809, "step": 20550 }, { "gate_value": 0.3232133984565735, "icl_sequence_length": 76, "num_contexts": 3, "step": 20550 }, { "grad_norm": 2.834927797317505, "learning_rate": 0.00015546763991549452, "loss": 0.3784, "step": 20560 }, { "gate_value": 0.3232627809047699, "icl_sequence_length": 86, "num_contexts": 3, "step": 20560 }, { "grad_norm": 11.861950874328613, "learning_rate": 0.00015534371023650067, "loss": 0.378, "step": 20570 }, { "gate_value": 0.32333946228027344, "icl_sequence_length": 88, "num_contexts": 3, "step": 20570 }, { "grad_norm": 16.24566078186035, "learning_rate": 0.00015521977690513124, "loss": 0.3764, "step": 20580 }, { "gate_value": 0.32341697812080383, "icl_sequence_length": 88, "num_contexts": 3, "step": 20580 }, { "grad_norm": 3.6404664516448975, "learning_rate": 0.0001550958400060935, "loss": 0.3775, "step": 20590 }, { "gate_value": 0.32357144355773926, "icl_sequence_length": 86, "num_contexts": 3, "step": 20590 }, { "grad_norm": 1.6172356605529785, "learning_rate": 0.0001549718996240972, "loss": 0.3794, "step": 20600 }, { "gate_value": 0.323573499917984, "icl_sequence_length": 70, "num_contexts": 3, "step": 20600 }, { "grad_norm": 5.064068794250488, "learning_rate": 0.00015484795584385432, "loss": 0.3727, "step": 20610 }, { "gate_value": 0.3236609399318695, "icl_sequence_length": 66, "num_contexts": 3, "step": 20610 }, { "grad_norm": 5.703829765319824, "learning_rate": 0.00015472400875007943, "loss": 0.3676, "step": 20620 }, { "gate_value": 0.32396653294563293, "icl_sequence_length": 80, "num_contexts": 3, "step": 20620 }, { "grad_norm": 4.807945728302002, "learning_rate": 0.00015460005842748905, "loss": 0.3829, "step": 20630 }, { "gate_value": 0.32434651255607605, "icl_sequence_length": 64, "num_contexts": 3, "step": 20630 }, { "grad_norm": 26.915267944335938, "learning_rate": 0.0001544761049608021, "loss": 0.3684, "step": 20640 }, { "gate_value": 0.32444456219673157, "icl_sequence_length": 68, "num_contexts": 3, "step": 20640 }, { "grad_norm": 2.3243069648742676, "learning_rate": 0.00015435214843473964, "loss": 0.3878, "step": 20650 }, { "gate_value": 0.32470571994781494, "icl_sequence_length": 92, "num_contexts": 3, "step": 20650 }, { "grad_norm": 13.799264907836914, "learning_rate": 0.00015422818893402477, "loss": 0.3961, "step": 20660 }, { "gate_value": 0.3247778117656708, "icl_sequence_length": 74, "num_contexts": 3, "step": 20660 }, { "grad_norm": 54.99798583984375, "learning_rate": 0.00015410422654338265, "loss": 0.3866, "step": 20670 }, { "gate_value": 0.32493239641189575, "icl_sequence_length": 68, "num_contexts": 3, "step": 20670 }, { "grad_norm": 2.2542836666107178, "learning_rate": 0.00015398026134754036, "loss": 0.3703, "step": 20680 }, { "gate_value": 0.32522520422935486, "icl_sequence_length": 74, "num_contexts": 3, "step": 20680 }, { "grad_norm": 2.2880892753601074, "learning_rate": 0.00015385629343122695, "loss": 0.3565, "step": 20690 }, { "gate_value": 0.3254174292087555, "icl_sequence_length": 84, "num_contexts": 3, "step": 20690 }, { "grad_norm": 3.325277090072632, "learning_rate": 0.0001537323228791734, "loss": 0.3657, "step": 20700 }, { "gate_value": 0.32538145780563354, "icl_sequence_length": 80, "num_contexts": 3, "step": 20700 }, { "grad_norm": 4.995002746582031, "learning_rate": 0.00015360834977611227, "loss": 0.3799, "step": 20710 }, { "gate_value": 0.3254181742668152, "icl_sequence_length": 74, "num_contexts": 3, "step": 20710 }, { "grad_norm": 2.443607807159424, "learning_rate": 0.0001534843742067782, "loss": 0.3817, "step": 20720 }, { "gate_value": 0.325557678937912, "icl_sequence_length": 90, "num_contexts": 3, "step": 20720 }, { "grad_norm": 1.345854640007019, "learning_rate": 0.00015336039625590714, "loss": 0.3697, "step": 20730 }, { "gate_value": 0.3256950080394745, "icl_sequence_length": 84, "num_contexts": 3, "step": 20730 }, { "grad_norm": 249.50662231445312, "learning_rate": 0.00015323641600823693, "loss": 0.3654, "step": 20740 }, { "gate_value": 0.32584673166275024, "icl_sequence_length": 90, "num_contexts": 3, "step": 20740 }, { "grad_norm": 21.927440643310547, "learning_rate": 0.00015311243354850692, "loss": 0.3975, "step": 20750 }, { "gate_value": 0.32607078552246094, "icl_sequence_length": 84, "num_contexts": 3, "step": 20750 }, { "grad_norm": 2.4049830436706543, "learning_rate": 0.0001529884489614579, "loss": 0.365, "step": 20760 }, { "gate_value": 0.3262093663215637, "icl_sequence_length": 70, "num_contexts": 3, "step": 20760 }, { "grad_norm": 149.2460174560547, "learning_rate": 0.00015286446233183225, "loss": 0.3816, "step": 20770 }, { "gate_value": 0.3263449966907501, "icl_sequence_length": 84, "num_contexts": 3, "step": 20770 }, { "grad_norm": 3.6130568981170654, "learning_rate": 0.00015274047374437354, "loss": 0.3588, "step": 20780 }, { "gate_value": 0.32662174105644226, "icl_sequence_length": 92, "num_contexts": 3, "step": 20780 }, { "grad_norm": 14.619281768798828, "learning_rate": 0.0001526164832838269, "loss": 0.3855, "step": 20790 }, { "gate_value": 0.32708847522735596, "icl_sequence_length": 80, "num_contexts": 3, "step": 20790 }, { "grad_norm": 27.90945053100586, "learning_rate": 0.0001524924910349386, "loss": 0.3747, "step": 20800 }, { "gate_value": 0.32717645168304443, "icl_sequence_length": 72, "num_contexts": 3, "step": 20800 }, { "grad_norm": 2.590574264526367, "learning_rate": 0.00015236849708245617, "loss": 0.3668, "step": 20810 }, { "gate_value": 0.32714733481407166, "icl_sequence_length": 88, "num_contexts": 3, "step": 20810 }, { "grad_norm": 4.525856018066406, "learning_rate": 0.0001522445015111284, "loss": 0.3645, "step": 20820 }, { "gate_value": 0.327190101146698, "icl_sequence_length": 86, "num_contexts": 3, "step": 20820 }, { "grad_norm": 36.371551513671875, "learning_rate": 0.00015212050440570492, "loss": 0.3843, "step": 20830 }, { "gate_value": 0.32727375626564026, "icl_sequence_length": 80, "num_contexts": 3, "step": 20830 }, { "grad_norm": 2.06235671043396, "learning_rate": 0.00015199650585093669, "loss": 0.3631, "step": 20840 }, { "gate_value": 0.32733866572380066, "icl_sequence_length": 86, "num_contexts": 3, "step": 20840 }, { "grad_norm": 31.483142852783203, "learning_rate": 0.00015187250593157552, "loss": 0.3772, "step": 20850 }, { "gate_value": 0.32737815380096436, "icl_sequence_length": 62, "num_contexts": 3, "step": 20850 }, { "grad_norm": 2.382258415222168, "learning_rate": 0.00015174850473237425, "loss": 0.38, "step": 20860 }, { "gate_value": 0.3275003731250763, "icl_sequence_length": 82, "num_contexts": 3, "step": 20860 }, { "grad_norm": 2.1796786785125732, "learning_rate": 0.00015162450233808646, "loss": 0.3658, "step": 20870 }, { "gate_value": 0.3276655077934265, "icl_sequence_length": 90, "num_contexts": 3, "step": 20870 }, { "grad_norm": 9.256353378295898, "learning_rate": 0.0001515004988334666, "loss": 0.3795, "step": 20880 }, { "gate_value": 0.3275793790817261, "icl_sequence_length": 58, "num_contexts": 3, "step": 20880 }, { "grad_norm": 5.035216808319092, "learning_rate": 0.00015137649430327, "loss": 0.3836, "step": 20890 }, { "gate_value": 0.3275890052318573, "icl_sequence_length": 72, "num_contexts": 3, "step": 20890 }, { "grad_norm": 6.4507527351379395, "learning_rate": 0.0001512524888322525, "loss": 0.3782, "step": 20900 }, { "gate_value": 0.3277978301048279, "icl_sequence_length": 90, "num_contexts": 3, "step": 20900 }, { "grad_norm": 26.93199920654297, "learning_rate": 0.0001511284825051707, "loss": 0.3752, "step": 20910 }, { "gate_value": 0.3279309570789337, "icl_sequence_length": 78, "num_contexts": 3, "step": 20910 }, { "grad_norm": 3.2107746601104736, "learning_rate": 0.00015100447540678178, "loss": 0.3715, "step": 20920 }, { "gate_value": 0.3280923664569855, "icl_sequence_length": 60, "num_contexts": 3, "step": 20920 }, { "grad_norm": 1.704662561416626, "learning_rate": 0.0001508804676218433, "loss": 0.3785, "step": 20930 }, { "gate_value": 0.3281390368938446, "icl_sequence_length": 84, "num_contexts": 3, "step": 20930 }, { "grad_norm": 6.8710551261901855, "learning_rate": 0.00015075645923511355, "loss": 0.369, "step": 20940 }, { "gate_value": 0.3282588720321655, "icl_sequence_length": 90, "num_contexts": 3, "step": 20940 }, { "grad_norm": 100.34821319580078, "learning_rate": 0.0001506324503313511, "loss": 0.3687, "step": 20950 }, { "gate_value": 0.32841092348098755, "icl_sequence_length": 90, "num_contexts": 3, "step": 20950 }, { "grad_norm": 16.857439041137695, "learning_rate": 0.00015050844099531474, "loss": 0.3728, "step": 20960 }, { "gate_value": 0.3285426199436188, "icl_sequence_length": 80, "num_contexts": 3, "step": 20960 }, { "grad_norm": 7.527184963226318, "learning_rate": 0.00015038443131176377, "loss": 0.3741, "step": 20970 }, { "gate_value": 0.3288117051124573, "icl_sequence_length": 66, "num_contexts": 3, "step": 20970 }, { "grad_norm": 4.734735488891602, "learning_rate": 0.00015026042136545762, "loss": 0.3713, "step": 20980 }, { "gate_value": 0.32907599210739136, "icl_sequence_length": 82, "num_contexts": 3, "step": 20980 }, { "grad_norm": 10.151193618774414, "learning_rate": 0.00015013641124115592, "loss": 0.3578, "step": 20990 }, { "gate_value": 0.3292025029659271, "icl_sequence_length": 66, "num_contexts": 3, "step": 20990 }, { "grad_norm": 41.710880279541016, "learning_rate": 0.0001500124010236185, "loss": 0.3581, "step": 21000 }, { "gate_value": 0.3292621374130249, "icl_sequence_length": 80, "num_contexts": 3, "step": 21000 }, { "grad_norm": 11.441078186035156, "learning_rate": 0.00014988839079760496, "loss": 0.3614, "step": 21010 }, { "gate_value": 0.32934167981147766, "icl_sequence_length": 92, "num_contexts": 3, "step": 21010 }, { "grad_norm": 127.89350891113281, "learning_rate": 0.00014976438064787537, "loss": 0.3897, "step": 21020 }, { "gate_value": 0.329465389251709, "icl_sequence_length": 72, "num_contexts": 3, "step": 21020 }, { "grad_norm": 9.434091567993164, "learning_rate": 0.00014964037065918936, "loss": 0.3745, "step": 21030 }, { "gate_value": 0.3296959102153778, "icl_sequence_length": 84, "num_contexts": 3, "step": 21030 }, { "grad_norm": 71.84469604492188, "learning_rate": 0.0001495163609163066, "loss": 0.3718, "step": 21040 }, { "gate_value": 0.3298918604850769, "icl_sequence_length": 80, "num_contexts": 3, "step": 21040 }, { "grad_norm": 8.007318496704102, "learning_rate": 0.0001493923515039866, "loss": 0.3768, "step": 21050 }, { "gate_value": 0.3300043046474457, "icl_sequence_length": 78, "num_contexts": 3, "step": 21050 }, { "grad_norm": 5.606169700622559, "learning_rate": 0.00014926834250698857, "loss": 0.3848, "step": 21060 }, { "gate_value": 0.3299660384654999, "icl_sequence_length": 82, "num_contexts": 3, "step": 21060 }, { "grad_norm": 18.77029800415039, "learning_rate": 0.00014914433401007162, "loss": 0.3851, "step": 21070 }, { "gate_value": 0.3300321102142334, "icl_sequence_length": 82, "num_contexts": 3, "step": 21070 }, { "grad_norm": 59.220279693603516, "learning_rate": 0.0001490203260979942, "loss": 0.3835, "step": 21080 }, { "gate_value": 0.33016636967658997, "icl_sequence_length": 72, "num_contexts": 3, "step": 21080 }, { "grad_norm": 27.8267765045166, "learning_rate": 0.00014889631885551472, "loss": 0.3909, "step": 21090 }, { "gate_value": 0.3302639126777649, "icl_sequence_length": 84, "num_contexts": 3, "step": 21090 }, { "grad_norm": 29.264480590820312, "learning_rate": 0.0001487723123673909, "loss": 0.3811, "step": 21100 }, { "gate_value": 0.3303276598453522, "icl_sequence_length": 94, "num_contexts": 3, "step": 21100 }, { "grad_norm": 3.484534978866577, "learning_rate": 0.00014864830671837997, "loss": 0.3698, "step": 21110 }, { "gate_value": 0.33042216300964355, "icl_sequence_length": 78, "num_contexts": 3, "step": 21110 }, { "grad_norm": 136.35006713867188, "learning_rate": 0.00014852430199323871, "loss": 0.3573, "step": 21120 }, { "gate_value": 0.3304680585861206, "icl_sequence_length": 86, "num_contexts": 3, "step": 21120 }, { "grad_norm": 29.405948638916016, "learning_rate": 0.00014840029827672312, "loss": 0.376, "step": 21130 }, { "gate_value": 0.33045992255210876, "icl_sequence_length": 82, "num_contexts": 3, "step": 21130 }, { "grad_norm": 2.838723659515381, "learning_rate": 0.00014827629565358853, "loss": 0.3611, "step": 21140 }, { "gate_value": 0.33051925897598267, "icl_sequence_length": 82, "num_contexts": 3, "step": 21140 }, { "grad_norm": 111.33936309814453, "learning_rate": 0.00014815229420858962, "loss": 0.384, "step": 21150 }, { "gate_value": 0.33056896924972534, "icl_sequence_length": 64, "num_contexts": 3, "step": 21150 }, { "grad_norm": 2.623337745666504, "learning_rate": 0.00014802829402648016, "loss": 0.3814, "step": 21160 }, { "gate_value": 0.3307247757911682, "icl_sequence_length": 70, "num_contexts": 3, "step": 21160 }, { "grad_norm": 53.253849029541016, "learning_rate": 0.00014790429519201322, "loss": 0.3619, "step": 21170 }, { "gate_value": 0.330961138010025, "icl_sequence_length": 72, "num_contexts": 3, "step": 21170 }, { "grad_norm": 8.739072799682617, "learning_rate": 0.00014778029778994068, "loss": 0.3898, "step": 21180 }, { "gate_value": 0.33113235235214233, "icl_sequence_length": 66, "num_contexts": 3, "step": 21180 }, { "grad_norm": 3.783287286758423, "learning_rate": 0.00014765630190501368, "loss": 0.3705, "step": 21190 }, { "gate_value": 0.3310948610305786, "icl_sequence_length": 86, "num_contexts": 3, "step": 21190 }, { "grad_norm": 2.4001498222351074, "learning_rate": 0.00014753230762198227, "loss": 0.3639, "step": 21200 }, { "gate_value": 0.33120977878570557, "icl_sequence_length": 78, "num_contexts": 3, "step": 21200 }, { "grad_norm": 4.936431407928467, "learning_rate": 0.00014740831502559534, "loss": 0.3806, "step": 21210 }, { "gate_value": 0.33138588070869446, "icl_sequence_length": 96, "num_contexts": 3, "step": 21210 }, { "grad_norm": 14.909619331359863, "learning_rate": 0.0001472843242006006, "loss": 0.3693, "step": 21220 }, { "gate_value": 0.3314835727214813, "icl_sequence_length": 74, "num_contexts": 3, "step": 21220 }, { "grad_norm": 1.7902761697769165, "learning_rate": 0.0001471603352317447, "loss": 0.3644, "step": 21230 }, { "gate_value": 0.33160167932510376, "icl_sequence_length": 84, "num_contexts": 3, "step": 21230 }, { "grad_norm": 32.85311508178711, "learning_rate": 0.00014703634820377286, "loss": 0.3717, "step": 21240 }, { "gate_value": 0.3315005898475647, "icl_sequence_length": 84, "num_contexts": 3, "step": 21240 }, { "grad_norm": 2.701444625854492, "learning_rate": 0.0001469123632014291, "loss": 0.3538, "step": 21250 }, { "gate_value": 0.33133813738822937, "icl_sequence_length": 78, "num_contexts": 3, "step": 21250 }, { "grad_norm": 31.169008255004883, "learning_rate": 0.00014678838030945593, "loss": 0.3972, "step": 21260 }, { "gate_value": 0.33133530616760254, "icl_sequence_length": 76, "num_contexts": 3, "step": 21260 }, { "grad_norm": 6.129772186279297, "learning_rate": 0.0001466643996125946, "loss": 0.4004, "step": 21270 }, { "gate_value": 0.33128389716148376, "icl_sequence_length": 74, "num_contexts": 3, "step": 21270 }, { "grad_norm": 4.4965314865112305, "learning_rate": 0.00014654042119558464, "loss": 0.3586, "step": 21280 }, { "gate_value": 0.3313887417316437, "icl_sequence_length": 94, "num_contexts": 3, "step": 21280 }, { "grad_norm": 5.3050456047058105, "learning_rate": 0.00014641644514316418, "loss": 0.3583, "step": 21290 }, { "gate_value": 0.33156993985176086, "icl_sequence_length": 84, "num_contexts": 3, "step": 21290 }, { "grad_norm": 14.692100524902344, "learning_rate": 0.00014629247154006973, "loss": 0.3723, "step": 21300 }, { "gate_value": 0.3315286934375763, "icl_sequence_length": 82, "num_contexts": 3, "step": 21300 }, { "grad_norm": 3.1017932891845703, "learning_rate": 0.00014616850047103597, "loss": 0.3956, "step": 21310 }, { "gate_value": 0.3315356373786926, "icl_sequence_length": 80, "num_contexts": 3, "step": 21310 }, { "grad_norm": 9.048178672790527, "learning_rate": 0.000146044532020796, "loss": 0.376, "step": 21320 }, { "gate_value": 0.331571102142334, "icl_sequence_length": 70, "num_contexts": 3, "step": 21320 }, { "grad_norm": 5.0327630043029785, "learning_rate": 0.0001459205662740811, "loss": 0.3802, "step": 21330 }, { "gate_value": 0.3317570686340332, "icl_sequence_length": 78, "num_contexts": 3, "step": 21330 }, { "grad_norm": 8.62702751159668, "learning_rate": 0.0001457966033156207, "loss": 0.374, "step": 21340 }, { "gate_value": 0.3319891095161438, "icl_sequence_length": 86, "num_contexts": 3, "step": 21340 }, { "grad_norm": 3.6938068866729736, "learning_rate": 0.00014567264323014228, "loss": 0.3571, "step": 21350 }, { "gate_value": 0.332073837518692, "icl_sequence_length": 72, "num_contexts": 3, "step": 21350 }, { "grad_norm": 361.4523620605469, "learning_rate": 0.0001455486861023714, "loss": 0.3681, "step": 21360 }, { "gate_value": 0.3321523368358612, "icl_sequence_length": 88, "num_contexts": 3, "step": 21360 }, { "grad_norm": 10.629288673400879, "learning_rate": 0.00014542473201703163, "loss": 0.3674, "step": 21370 }, { "gate_value": 0.3323724567890167, "icl_sequence_length": 72, "num_contexts": 3, "step": 21370 }, { "grad_norm": 145.07919311523438, "learning_rate": 0.00014530078105884435, "loss": 0.3646, "step": 21380 }, { "gate_value": 0.33256107568740845, "icl_sequence_length": 82, "num_contexts": 3, "step": 21380 }, { "grad_norm": 10.726923942565918, "learning_rate": 0.0001451768333125289, "loss": 0.3641, "step": 21390 }, { "gate_value": 0.33275166153907776, "icl_sequence_length": 80, "num_contexts": 3, "step": 21390 }, { "grad_norm": 51.94292068481445, "learning_rate": 0.0001450528888628025, "loss": 0.3949, "step": 21400 }, { "gate_value": 0.3329571485519409, "icl_sequence_length": 78, "num_contexts": 3, "step": 21400 }, { "grad_norm": 8.739526748657227, "learning_rate": 0.00014492894779437985, "loss": 0.3752, "step": 21410 }, { "gate_value": 0.33292508125305176, "icl_sequence_length": 92, "num_contexts": 3, "step": 21410 }, { "grad_norm": 6.976619720458984, "learning_rate": 0.00014480501019197353, "loss": 0.3626, "step": 21420 }, { "gate_value": 0.3328370749950409, "icl_sequence_length": 90, "num_contexts": 3, "step": 21420 }, { "grad_norm": 2.2733330726623535, "learning_rate": 0.0001446810761402938, "loss": 0.3961, "step": 21430 }, { "gate_value": 0.33283504843711853, "icl_sequence_length": 88, "num_contexts": 3, "step": 21430 }, { "grad_norm": 4.6120524406433105, "learning_rate": 0.00014455714572404833, "loss": 0.3714, "step": 21440 }, { "gate_value": 0.3329114019870758, "icl_sequence_length": 88, "num_contexts": 3, "step": 21440 }, { "grad_norm": 230.9474639892578, "learning_rate": 0.00014443321902794248, "loss": 0.3815, "step": 21450 }, { "gate_value": 0.3328808546066284, "icl_sequence_length": 90, "num_contexts": 3, "step": 21450 }, { "grad_norm": 1.880590558052063, "learning_rate": 0.0001443092961366789, "loss": 0.3749, "step": 21460 }, { "gate_value": 0.3330598771572113, "icl_sequence_length": 66, "num_contexts": 3, "step": 21460 }, { "grad_norm": 2.525514841079712, "learning_rate": 0.00014418537713495775, "loss": 0.3768, "step": 21470 }, { "gate_value": 0.333394318819046, "icl_sequence_length": 76, "num_contexts": 3, "step": 21470 }, { "grad_norm": 1.9454431533813477, "learning_rate": 0.0001440614621074765, "loss": 0.3853, "step": 21480 }, { "gate_value": 0.3335770070552826, "icl_sequence_length": 90, "num_contexts": 3, "step": 21480 }, { "grad_norm": 1.8111017942428589, "learning_rate": 0.00014393755113892997, "loss": 0.3682, "step": 21490 }, { "gate_value": 0.33381104469299316, "icl_sequence_length": 76, "num_contexts": 3, "step": 21490 }, { "grad_norm": 4.904592514038086, "learning_rate": 0.00014381364431401, "loss": 0.3897, "step": 21500 }, { "gate_value": 0.333828330039978, "icl_sequence_length": 74, "num_contexts": 3, "step": 21500 }, { "grad_norm": 2.9814579486846924, "learning_rate": 0.00014368974171740585, "loss": 0.3776, "step": 21510 }, { "gate_value": 0.333970844745636, "icl_sequence_length": 92, "num_contexts": 3, "step": 21510 }, { "grad_norm": 44.95869064331055, "learning_rate": 0.00014356584343380366, "loss": 0.365, "step": 21520 }, { "gate_value": 0.3343297243118286, "icl_sequence_length": 84, "num_contexts": 3, "step": 21520 }, { "grad_norm": 14.722635269165039, "learning_rate": 0.0001434419495478869, "loss": 0.3879, "step": 21530 }, { "gate_value": 0.3345628082752228, "icl_sequence_length": 62, "num_contexts": 3, "step": 21530 }, { "grad_norm": 2.9341938495635986, "learning_rate": 0.00014331806014433572, "loss": 0.3726, "step": 21540 }, { "gate_value": 0.3346128761768341, "icl_sequence_length": 88, "num_contexts": 3, "step": 21540 }, { "grad_norm": 16.510175704956055, "learning_rate": 0.00014319417530782744, "loss": 0.3785, "step": 21550 }, { "gate_value": 0.3347155749797821, "icl_sequence_length": 62, "num_contexts": 3, "step": 21550 }, { "grad_norm": 1.5997096300125122, "learning_rate": 0.00014307029512303617, "loss": 0.3938, "step": 21560 }, { "gate_value": 0.33479592204093933, "icl_sequence_length": 76, "num_contexts": 3, "step": 21560 }, { "grad_norm": 1.6002236604690552, "learning_rate": 0.00014294641967463282, "loss": 0.3648, "step": 21570 }, { "gate_value": 0.33493053913116455, "icl_sequence_length": 86, "num_contexts": 3, "step": 21570 }, { "grad_norm": 288.05035400390625, "learning_rate": 0.0001428225490472852, "loss": 0.3557, "step": 21580 }, { "gate_value": 0.3353734314441681, "icl_sequence_length": 86, "num_contexts": 3, "step": 21580 }, { "grad_norm": 2.035752534866333, "learning_rate": 0.00014269868332565755, "loss": 0.3721, "step": 21590 }, { "gate_value": 0.3355160653591156, "icl_sequence_length": 82, "num_contexts": 3, "step": 21590 }, { "grad_norm": 7.391807556152344, "learning_rate": 0.000142574822594411, "loss": 0.3924, "step": 21600 }, { "gate_value": 0.33554723858833313, "icl_sequence_length": 88, "num_contexts": 3, "step": 21600 }, { "grad_norm": 5.8543195724487305, "learning_rate": 0.00014245096693820322, "loss": 0.3779, "step": 21610 }, { "gate_value": 0.3356199860572815, "icl_sequence_length": 72, "num_contexts": 3, "step": 21610 }, { "grad_norm": 2.9621081352233887, "learning_rate": 0.0001423271164416883, "loss": 0.3571, "step": 21620 }, { "gate_value": 0.33589836955070496, "icl_sequence_length": 76, "num_contexts": 3, "step": 21620 }, { "grad_norm": 4.828012466430664, "learning_rate": 0.000142203271189517, "loss": 0.3721, "step": 21630 }, { "gate_value": 0.33621811866760254, "icl_sequence_length": 78, "num_contexts": 3, "step": 21630 }, { "grad_norm": 4.43719482421875, "learning_rate": 0.0001420794312663363, "loss": 0.3671, "step": 21640 }, { "gate_value": 0.3363811671733856, "icl_sequence_length": 76, "num_contexts": 3, "step": 21640 }, { "grad_norm": 12.81857681274414, "learning_rate": 0.00014195559675678963, "loss": 0.3775, "step": 21650 }, { "gate_value": 0.33632469177246094, "icl_sequence_length": 82, "num_contexts": 3, "step": 21650 }, { "grad_norm": 2.1967718601226807, "learning_rate": 0.00014183176774551672, "loss": 0.3817, "step": 21660 }, { "gate_value": 0.336483359336853, "icl_sequence_length": 80, "num_contexts": 3, "step": 21660 }, { "grad_norm": 6.7688140869140625, "learning_rate": 0.00014170794431715353, "loss": 0.3945, "step": 21670 }, { "gate_value": 0.3365987539291382, "icl_sequence_length": 96, "num_contexts": 3, "step": 21670 }, { "grad_norm": 2.248080015182495, "learning_rate": 0.0001415841265563323, "loss": 0.3865, "step": 21680 }, { "gate_value": 0.3365277647972107, "icl_sequence_length": 82, "num_contexts": 3, "step": 21680 }, { "grad_norm": 7.746638774871826, "learning_rate": 0.00014146031454768113, "loss": 0.3612, "step": 21690 }, { "gate_value": 0.3365982472896576, "icl_sequence_length": 82, "num_contexts": 3, "step": 21690 }, { "grad_norm": 56.98549270629883, "learning_rate": 0.00014133650837582445, "loss": 0.3678, "step": 21700 }, { "gate_value": 0.3367842435836792, "icl_sequence_length": 92, "num_contexts": 3, "step": 21700 }, { "grad_norm": 4.436330795288086, "learning_rate": 0.00014121270812538262, "loss": 0.398, "step": 21710 }, { "gate_value": 0.3368651270866394, "icl_sequence_length": 78, "num_contexts": 3, "step": 21710 }, { "grad_norm": 3.1496005058288574, "learning_rate": 0.0001410889138809719, "loss": 0.3745, "step": 21720 }, { "gate_value": 0.33694028854370117, "icl_sequence_length": 58, "num_contexts": 3, "step": 21720 }, { "grad_norm": 3.2747929096221924, "learning_rate": 0.00014096512572720453, "loss": 0.3676, "step": 21730 }, { "gate_value": 0.3370567262172699, "icl_sequence_length": 76, "num_contexts": 3, "step": 21730 }, { "grad_norm": 3.3517582416534424, "learning_rate": 0.0001408413437486885, "loss": 0.374, "step": 21740 }, { "gate_value": 0.3370315134525299, "icl_sequence_length": 86, "num_contexts": 3, "step": 21740 }, { "grad_norm": 9.315972328186035, "learning_rate": 0.00014071756803002772, "loss": 0.3617, "step": 21750 }, { "gate_value": 0.33682939410209656, "icl_sequence_length": 90, "num_contexts": 3, "step": 21750 }, { "grad_norm": 6.208014011383057, "learning_rate": 0.00014059379865582163, "loss": 0.3576, "step": 21760 }, { "gate_value": 0.33689308166503906, "icl_sequence_length": 88, "num_contexts": 3, "step": 21760 }, { "grad_norm": 18.423473358154297, "learning_rate": 0.0001404700357106655, "loss": 0.3841, "step": 21770 }, { "gate_value": 0.3369765877723694, "icl_sequence_length": 86, "num_contexts": 3, "step": 21770 }, { "grad_norm": 5.565635681152344, "learning_rate": 0.00014034627927915006, "loss": 0.3669, "step": 21780 }, { "gate_value": 0.33682945370674133, "icl_sequence_length": 76, "num_contexts": 3, "step": 21780 }, { "grad_norm": 9.092445373535156, "learning_rate": 0.0001402225294458617, "loss": 0.3609, "step": 21790 }, { "gate_value": 0.3367845416069031, "icl_sequence_length": 76, "num_contexts": 3, "step": 21790 }, { "grad_norm": 3.235175848007202, "learning_rate": 0.00014009878629538225, "loss": 0.3716, "step": 21800 }, { "gate_value": 0.33702075481414795, "icl_sequence_length": 82, "num_contexts": 3, "step": 21800 }, { "grad_norm": 2.2112083435058594, "learning_rate": 0.00013997504991228906, "loss": 0.3923, "step": 21810 }, { "gate_value": 0.3373863995075226, "icl_sequence_length": 86, "num_contexts": 3, "step": 21810 }, { "grad_norm": 1262.691650390625, "learning_rate": 0.00013985132038115466, "loss": 0.3723, "step": 21820 }, { "gate_value": 0.33774295449256897, "icl_sequence_length": 86, "num_contexts": 3, "step": 21820 }, { "grad_norm": 7.494561672210693, "learning_rate": 0.00013972759778654715, "loss": 0.3738, "step": 21830 }, { "gate_value": 0.33784347772598267, "icl_sequence_length": 64, "num_contexts": 3, "step": 21830 }, { "grad_norm": 5.770691394805908, "learning_rate": 0.00013960388221302962, "loss": 0.3621, "step": 21840 }, { "gate_value": 0.33791738748550415, "icl_sequence_length": 88, "num_contexts": 3, "step": 21840 }, { "grad_norm": 318.8439025878906, "learning_rate": 0.00013948017374516063, "loss": 0.3611, "step": 21850 }, { "gate_value": 0.3379685580730438, "icl_sequence_length": 80, "num_contexts": 3, "step": 21850 }, { "grad_norm": 43.76850509643555, "learning_rate": 0.00013935647246749372, "loss": 0.3729, "step": 21860 }, { "gate_value": 0.338062584400177, "icl_sequence_length": 76, "num_contexts": 3, "step": 21860 }, { "grad_norm": 88.09666442871094, "learning_rate": 0.00013923277846457743, "loss": 0.3917, "step": 21870 }, { "gate_value": 0.3383744955062866, "icl_sequence_length": 90, "num_contexts": 3, "step": 21870 }, { "grad_norm": 11.775474548339844, "learning_rate": 0.00013910909182095554, "loss": 0.3741, "step": 21880 }, { "gate_value": 0.33857882022857666, "icl_sequence_length": 82, "num_contexts": 3, "step": 21880 }, { "grad_norm": 103.6872329711914, "learning_rate": 0.00013898541262116675, "loss": 0.3704, "step": 21890 }, { "gate_value": 0.3385585844516754, "icl_sequence_length": 78, "num_contexts": 3, "step": 21890 }, { "grad_norm": 1.8290860652923584, "learning_rate": 0.0001388617409497445, "loss": 0.3702, "step": 21900 }, { "gate_value": 0.33865779638290405, "icl_sequence_length": 64, "num_contexts": 3, "step": 21900 }, { "grad_norm": 1.7455637454986572, "learning_rate": 0.00013873807689121736, "loss": 0.3851, "step": 21910 }, { "gate_value": 0.33894455432891846, "icl_sequence_length": 78, "num_contexts": 3, "step": 21910 }, { "grad_norm": 19.623218536376953, "learning_rate": 0.00013861442053010841, "loss": 0.3834, "step": 21920 }, { "gate_value": 0.3391704261302948, "icl_sequence_length": 68, "num_contexts": 3, "step": 21920 }, { "grad_norm": 7.914072036743164, "learning_rate": 0.00013849077195093572, "loss": 0.3822, "step": 21930 }, { "gate_value": 0.33932042121887207, "icl_sequence_length": 72, "num_contexts": 3, "step": 21930 }, { "grad_norm": 7.193912029266357, "learning_rate": 0.0001383671312382118, "loss": 0.3685, "step": 21940 }, { "gate_value": 0.3394797742366791, "icl_sequence_length": 64, "num_contexts": 3, "step": 21940 }, { "grad_norm": 6.832879543304443, "learning_rate": 0.00013824349847644407, "loss": 0.3638, "step": 21950 }, { "gate_value": 0.33957207202911377, "icl_sequence_length": 80, "num_contexts": 3, "step": 21950 }, { "grad_norm": 22.681137084960938, "learning_rate": 0.00013811987375013428, "loss": 0.3832, "step": 21960 }, { "gate_value": 0.3396352231502533, "icl_sequence_length": 70, "num_contexts": 3, "step": 21960 }, { "grad_norm": 16.05438995361328, "learning_rate": 0.0001379962571437787, "loss": 0.3688, "step": 21970 }, { "gate_value": 0.33961719274520874, "icl_sequence_length": 76, "num_contexts": 3, "step": 21970 }, { "grad_norm": 205.68336486816406, "learning_rate": 0.00013787264874186818, "loss": 0.3691, "step": 21980 }, { "gate_value": 0.3396041989326477, "icl_sequence_length": 64, "num_contexts": 3, "step": 21980 }, { "grad_norm": 8.141767501831055, "learning_rate": 0.00013774904862888792, "loss": 0.3897, "step": 21990 }, { "gate_value": 0.33965280652046204, "icl_sequence_length": 72, "num_contexts": 3, "step": 21990 }, { "grad_norm": 23.583955764770508, "learning_rate": 0.00013762545688931737, "loss": 0.3658, "step": 22000 }, { "gate_value": 0.3397328555583954, "icl_sequence_length": 68, "num_contexts": 3, "step": 22000 }, { "grad_norm": 7.479121685028076, "learning_rate": 0.00013750187360763038, "loss": 0.3654, "step": 22010 }, { "gate_value": 0.33978742361068726, "icl_sequence_length": 84, "num_contexts": 3, "step": 22010 }, { "grad_norm": 3.828936815261841, "learning_rate": 0.0001373782988682949, "loss": 0.3588, "step": 22020 }, { "gate_value": 0.3398081064224243, "icl_sequence_length": 72, "num_contexts": 3, "step": 22020 }, { "grad_norm": 12.60776138305664, "learning_rate": 0.00013725473275577314, "loss": 0.379, "step": 22030 }, { "gate_value": 0.3398773670196533, "icl_sequence_length": 70, "num_contexts": 3, "step": 22030 }, { "grad_norm": 5.787103652954102, "learning_rate": 0.00013713117535452135, "loss": 0.3601, "step": 22040 }, { "gate_value": 0.3399393856525421, "icl_sequence_length": 90, "num_contexts": 3, "step": 22040 }, { "grad_norm": 3.2277674674987793, "learning_rate": 0.00013700762674898992, "loss": 0.3653, "step": 22050 }, { "gate_value": 0.3401627838611603, "icl_sequence_length": 72, "num_contexts": 3, "step": 22050 }, { "grad_norm": 21.32828712463379, "learning_rate": 0.00013688408702362308, "loss": 0.3684, "step": 22060 }, { "gate_value": 0.34032222628593445, "icl_sequence_length": 68, "num_contexts": 3, "step": 22060 }, { "grad_norm": 9.054245948791504, "learning_rate": 0.00013676055626285903, "loss": 0.3559, "step": 22070 }, { "gate_value": 0.34050172567367554, "icl_sequence_length": 62, "num_contexts": 3, "step": 22070 }, { "grad_norm": 3.0994348526000977, "learning_rate": 0.00013663703455112994, "loss": 0.3766, "step": 22080 }, { "gate_value": 0.3407020568847656, "icl_sequence_length": 80, "num_contexts": 3, "step": 22080 }, { "grad_norm": 10.302519798278809, "learning_rate": 0.00013651352197286177, "loss": 0.3635, "step": 22090 }, { "gate_value": 0.34067562222480774, "icl_sequence_length": 90, "num_contexts": 3, "step": 22090 }, { "grad_norm": 9.207005500793457, "learning_rate": 0.0001363900186124741, "loss": 0.3746, "step": 22100 }, { "gate_value": 0.34073901176452637, "icl_sequence_length": 78, "num_contexts": 3, "step": 22100 }, { "grad_norm": 18.054229736328125, "learning_rate": 0.00013626652455438044, "loss": 0.3584, "step": 22110 }, { "gate_value": 0.3407975435256958, "icl_sequence_length": 76, "num_contexts": 3, "step": 22110 }, { "grad_norm": 62.869850158691406, "learning_rate": 0.0001361430398829877, "loss": 0.3601, "step": 22120 }, { "gate_value": 0.3408588171005249, "icl_sequence_length": 86, "num_contexts": 3, "step": 22120 }, { "grad_norm": 7.5538835525512695, "learning_rate": 0.00013601956468269657, "loss": 0.3597, "step": 22130 }, { "gate_value": 0.3410506248474121, "icl_sequence_length": 82, "num_contexts": 3, "step": 22130 }, { "grad_norm": 2.9426703453063965, "learning_rate": 0.00013589609903790119, "loss": 0.3644, "step": 22140 }, { "gate_value": 0.34119319915771484, "icl_sequence_length": 68, "num_contexts": 3, "step": 22140 }, { "grad_norm": 12.853556632995605, "learning_rate": 0.00013577264303298907, "loss": 0.3637, "step": 22150 }, { "gate_value": 0.34135085344314575, "icl_sequence_length": 78, "num_contexts": 3, "step": 22150 }, { "grad_norm": 4.324552536010742, "learning_rate": 0.00013564919675234128, "loss": 0.3793, "step": 22160 }, { "gate_value": 0.34146222472190857, "icl_sequence_length": 84, "num_contexts": 3, "step": 22160 }, { "grad_norm": 5.883782386779785, "learning_rate": 0.00013552576028033218, "loss": 0.3737, "step": 22170 }, { "gate_value": 0.3415074646472931, "icl_sequence_length": 88, "num_contexts": 3, "step": 22170 }, { "grad_norm": 10.931557655334473, "learning_rate": 0.00013540233370132944, "loss": 0.3589, "step": 22180 }, { "gate_value": 0.3415931165218353, "icl_sequence_length": 66, "num_contexts": 3, "step": 22180 }, { "grad_norm": 17.461891174316406, "learning_rate": 0.000135278917099694, "loss": 0.3777, "step": 22190 }, { "gate_value": 0.3416183888912201, "icl_sequence_length": 76, "num_contexts": 3, "step": 22190 }, { "grad_norm": 3.816490650177002, "learning_rate": 0.00013515551055977987, "loss": 0.3762, "step": 22200 }, { "gate_value": 0.341708779335022, "icl_sequence_length": 76, "num_contexts": 3, "step": 22200 }, { "grad_norm": 11.506010055541992, "learning_rate": 0.00013503211416593435, "loss": 0.3672, "step": 22210 }, { "gate_value": 0.34182965755462646, "icl_sequence_length": 88, "num_contexts": 3, "step": 22210 }, { "grad_norm": 7.808537483215332, "learning_rate": 0.00013490872800249763, "loss": 0.367, "step": 22220 }, { "gate_value": 0.3420865833759308, "icl_sequence_length": 86, "num_contexts": 3, "step": 22220 }, { "grad_norm": 6.892821788787842, "learning_rate": 0.000134785352153803, "loss": 0.3692, "step": 22230 }, { "gate_value": 0.34221896529197693, "icl_sequence_length": 70, "num_contexts": 3, "step": 22230 }, { "grad_norm": 30.9285888671875, "learning_rate": 0.0001346619867041768, "loss": 0.3762, "step": 22240 }, { "gate_value": 0.34225592017173767, "icl_sequence_length": 76, "num_contexts": 3, "step": 22240 }, { "grad_norm": 103.91609191894531, "learning_rate": 0.00013453863173793797, "loss": 0.3689, "step": 22250 }, { "gate_value": 0.3423255681991577, "icl_sequence_length": 64, "num_contexts": 3, "step": 22250 }, { "grad_norm": 37.15800094604492, "learning_rate": 0.0001344152873393986, "loss": 0.361, "step": 22260 }, { "gate_value": 0.3423844873905182, "icl_sequence_length": 90, "num_contexts": 3, "step": 22260 }, { "grad_norm": 10.238519668579102, "learning_rate": 0.00013429195359286332, "loss": 0.3649, "step": 22270 }, { "gate_value": 0.3424571454524994, "icl_sequence_length": 78, "num_contexts": 3, "step": 22270 }, { "grad_norm": 6.922329902648926, "learning_rate": 0.00013416863058262967, "loss": 0.3755, "step": 22280 }, { "gate_value": 0.34252744913101196, "icl_sequence_length": 90, "num_contexts": 3, "step": 22280 }, { "grad_norm": 4.249993801116943, "learning_rate": 0.00013404531839298774, "loss": 0.3693, "step": 22290 }, { "gate_value": 0.3425276577472687, "icl_sequence_length": 84, "num_contexts": 3, "step": 22290 }, { "grad_norm": 2.9230473041534424, "learning_rate": 0.00013392201710822022, "loss": 0.3579, "step": 22300 }, { "gate_value": 0.34264761209487915, "icl_sequence_length": 86, "num_contexts": 3, "step": 22300 }, { "grad_norm": 19.932083129882812, "learning_rate": 0.00013379872681260245, "loss": 0.3587, "step": 22310 }, { "gate_value": 0.3429376780986786, "icl_sequence_length": 90, "num_contexts": 3, "step": 22310 }, { "grad_norm": 14.836755752563477, "learning_rate": 0.0001336754475904021, "loss": 0.3836, "step": 22320 }, { "gate_value": 0.3430907726287842, "icl_sequence_length": 78, "num_contexts": 3, "step": 22320 }, { "grad_norm": 8.244352340698242, "learning_rate": 0.00013355217952587943, "loss": 0.3618, "step": 22330 }, { "gate_value": 0.343147337436676, "icl_sequence_length": 68, "num_contexts": 3, "step": 22330 }, { "grad_norm": 18.00697898864746, "learning_rate": 0.00013342892270328696, "loss": 0.3703, "step": 22340 }, { "gate_value": 0.3431670665740967, "icl_sequence_length": 72, "num_contexts": 3, "step": 22340 }, { "grad_norm": 8.288820266723633, "learning_rate": 0.0001333056772068695, "loss": 0.3639, "step": 22350 }, { "gate_value": 0.343341201543808, "icl_sequence_length": 70, "num_contexts": 3, "step": 22350 }, { "grad_norm": 4.5525288581848145, "learning_rate": 0.0001331824431208643, "loss": 0.3646, "step": 22360 }, { "gate_value": 0.34348997473716736, "icl_sequence_length": 72, "num_contexts": 3, "step": 22360 }, { "grad_norm": 20.53244400024414, "learning_rate": 0.00013305922052950063, "loss": 0.3857, "step": 22370 }, { "gate_value": 0.3435961902141571, "icl_sequence_length": 58, "num_contexts": 3, "step": 22370 }, { "grad_norm": 4.72353458404541, "learning_rate": 0.00013293600951699996, "loss": 0.3781, "step": 22380 }, { "gate_value": 0.3436838984489441, "icl_sequence_length": 78, "num_contexts": 3, "step": 22380 }, { "grad_norm": 5.0068135261535645, "learning_rate": 0.00013281281016757593, "loss": 0.36, "step": 22390 }, { "gate_value": 0.3438510298728943, "icl_sequence_length": 84, "num_contexts": 3, "step": 22390 }, { "grad_norm": 4.280457496643066, "learning_rate": 0.00013268962256543404, "loss": 0.3698, "step": 22400 }, { "gate_value": 0.3440343141555786, "icl_sequence_length": 64, "num_contexts": 3, "step": 22400 }, { "grad_norm": 38.63629913330078, "learning_rate": 0.00013256644679477195, "loss": 0.3626, "step": 22410 }, { "gate_value": 0.3442992568016052, "icl_sequence_length": 80, "num_contexts": 3, "step": 22410 }, { "grad_norm": 16.1657657623291, "learning_rate": 0.00013244328293977913, "loss": 0.3695, "step": 22420 }, { "gate_value": 0.34445565938949585, "icl_sequence_length": 78, "num_contexts": 3, "step": 22420 }, { "grad_norm": 11.632641792297363, "learning_rate": 0.00013232013108463678, "loss": 0.3674, "step": 22430 }, { "gate_value": 0.3445255756378174, "icl_sequence_length": 86, "num_contexts": 3, "step": 22430 }, { "grad_norm": 13.374603271484375, "learning_rate": 0.00013219699131351815, "loss": 0.3747, "step": 22440 }, { "gate_value": 0.34462597966194153, "icl_sequence_length": 62, "num_contexts": 3, "step": 22440 }, { "grad_norm": 35.441802978515625, "learning_rate": 0.00013207386371058807, "loss": 0.3587, "step": 22450 }, { "gate_value": 0.3447570502758026, "icl_sequence_length": 82, "num_contexts": 3, "step": 22450 }, { "grad_norm": 7.294458389282227, "learning_rate": 0.00013195074836000313, "loss": 0.3637, "step": 22460 }, { "gate_value": 0.3447839617729187, "icl_sequence_length": 86, "num_contexts": 3, "step": 22460 }, { "grad_norm": 5.761556625366211, "learning_rate": 0.00013182764534591147, "loss": 0.3711, "step": 22470 }, { "gate_value": 0.34476137161254883, "icl_sequence_length": 76, "num_contexts": 3, "step": 22470 }, { "grad_norm": 10.302973747253418, "learning_rate": 0.00013170455475245284, "loss": 0.3596, "step": 22480 }, { "gate_value": 0.3446881175041199, "icl_sequence_length": 80, "num_contexts": 3, "step": 22480 }, { "grad_norm": 53.81081771850586, "learning_rate": 0.00013158147666375857, "loss": 0.3793, "step": 22490 }, { "gate_value": 0.3447395861148834, "icl_sequence_length": 76, "num_contexts": 3, "step": 22490 }, { "grad_norm": 7.093109607696533, "learning_rate": 0.00013145841116395132, "loss": 0.3738, "step": 22500 }, { "gate_value": 0.3448319733142853, "icl_sequence_length": 58, "num_contexts": 3, "step": 22500 }, { "grad_norm": 8.175000190734863, "learning_rate": 0.00013133535833714522, "loss": 0.3624, "step": 22510 }, { "gate_value": 0.3448798656463623, "icl_sequence_length": 68, "num_contexts": 3, "step": 22510 }, { "grad_norm": 8.24658489227295, "learning_rate": 0.0001312123182674457, "loss": 0.3696, "step": 22520 }, { "gate_value": 0.34493088722229004, "icl_sequence_length": 64, "num_contexts": 3, "step": 22520 }, { "grad_norm": 9.602054595947266, "learning_rate": 0.00013108929103894943, "loss": 0.3786, "step": 22530 }, { "gate_value": 0.3449787199497223, "icl_sequence_length": 90, "num_contexts": 3, "step": 22530 }, { "grad_norm": 14.278233528137207, "learning_rate": 0.00013096627673574445, "loss": 0.3816, "step": 22540 }, { "gate_value": 0.34499263763427734, "icl_sequence_length": 72, "num_contexts": 3, "step": 22540 }, { "grad_norm": 19.222492218017578, "learning_rate": 0.00013084327544190982, "loss": 0.3667, "step": 22550 }, { "gate_value": 0.3449796736240387, "icl_sequence_length": 62, "num_contexts": 3, "step": 22550 }, { "grad_norm": 24.835494995117188, "learning_rate": 0.00013072028724151583, "loss": 0.3662, "step": 22560 }, { "gate_value": 0.345020592212677, "icl_sequence_length": 78, "num_contexts": 3, "step": 22560 }, { "grad_norm": 7.1763153076171875, "learning_rate": 0.00013059731221862366, "loss": 0.3691, "step": 22570 }, { "gate_value": 0.3450656235218048, "icl_sequence_length": 78, "num_contexts": 3, "step": 22570 }, { "grad_norm": 4.902071952819824, "learning_rate": 0.00013047435045728567, "loss": 0.3872, "step": 22580 }, { "gate_value": 0.34517911076545715, "icl_sequence_length": 72, "num_contexts": 3, "step": 22580 }, { "grad_norm": 19.429061889648438, "learning_rate": 0.000130351402041545, "loss": 0.3782, "step": 22590 }, { "gate_value": 0.3452775776386261, "icl_sequence_length": 78, "num_contexts": 3, "step": 22590 }, { "grad_norm": 30.76957893371582, "learning_rate": 0.00013022846705543578, "loss": 0.3711, "step": 22600 }, { "gate_value": 0.3454075753688812, "icl_sequence_length": 76, "num_contexts": 3, "step": 22600 }, { "grad_norm": 36.29285430908203, "learning_rate": 0.00013010554558298294, "loss": 0.3873, "step": 22610 }, { "gate_value": 0.34549373388290405, "icl_sequence_length": 82, "num_contexts": 3, "step": 22610 }, { "grad_norm": 15.497030258178711, "learning_rate": 0.00012998263770820206, "loss": 0.3655, "step": 22620 }, { "gate_value": 0.3456413447856903, "icl_sequence_length": 66, "num_contexts": 3, "step": 22620 }, { "grad_norm": 9.922740936279297, "learning_rate": 0.00012985974351509955, "loss": 0.3735, "step": 22630 }, { "gate_value": 0.3457619249820709, "icl_sequence_length": 94, "num_contexts": 3, "step": 22630 }, { "grad_norm": 6.184462070465088, "learning_rate": 0.00012973686308767244, "loss": 0.3595, "step": 22640 }, { "gate_value": 0.34587883949279785, "icl_sequence_length": 70, "num_contexts": 3, "step": 22640 }, { "grad_norm": 8.652168273925781, "learning_rate": 0.0001296139965099083, "loss": 0.3737, "step": 22650 }, { "gate_value": 0.34601011872291565, "icl_sequence_length": 68, "num_contexts": 3, "step": 22650 }, { "grad_norm": 4.410395622253418, "learning_rate": 0.00012949114386578538, "loss": 0.3758, "step": 22660 }, { "gate_value": 0.3462025225162506, "icl_sequence_length": 76, "num_contexts": 3, "step": 22660 }, { "grad_norm": 7.113274574279785, "learning_rate": 0.00012936830523927218, "loss": 0.3733, "step": 22670 }, { "gate_value": 0.3464626967906952, "icl_sequence_length": 84, "num_contexts": 3, "step": 22670 }, { "grad_norm": 3.876377820968628, "learning_rate": 0.00012924548071432783, "loss": 0.3706, "step": 22680 }, { "gate_value": 0.34653839468955994, "icl_sequence_length": 88, "num_contexts": 3, "step": 22680 }, { "grad_norm": 24.962556838989258, "learning_rate": 0.00012912267037490174, "loss": 0.3503, "step": 22690 }, { "gate_value": 0.34656399488449097, "icl_sequence_length": 82, "num_contexts": 3, "step": 22690 }, { "grad_norm": 5.596436023712158, "learning_rate": 0.0001289998743049336, "loss": 0.3641, "step": 22700 }, { "gate_value": 0.3467022776603699, "icl_sequence_length": 60, "num_contexts": 3, "step": 22700 }, { "grad_norm": 9.493281364440918, "learning_rate": 0.00012887709258835328, "loss": 0.372, "step": 22710 }, { "gate_value": 0.34704118967056274, "icl_sequence_length": 90, "num_contexts": 3, "step": 22710 }, { "grad_norm": 15.705657005310059, "learning_rate": 0.00012875432530908107, "loss": 0.3802, "step": 22720 }, { "gate_value": 0.3472428023815155, "icl_sequence_length": 86, "num_contexts": 3, "step": 22720 }, { "grad_norm": 26.376834869384766, "learning_rate": 0.0001286315725510271, "loss": 0.364, "step": 22730 }, { "gate_value": 0.34734347462654114, "icl_sequence_length": 84, "num_contexts": 3, "step": 22730 }, { "grad_norm": 6.039391040802002, "learning_rate": 0.00012850883439809188, "loss": 0.3738, "step": 22740 }, { "gate_value": 0.3473823666572571, "icl_sequence_length": 84, "num_contexts": 3, "step": 22740 }, { "grad_norm": 114.58489990234375, "learning_rate": 0.00012838611093416564, "loss": 0.3636, "step": 22750 }, { "gate_value": 0.3474694490432739, "icl_sequence_length": 76, "num_contexts": 3, "step": 22750 }, { "grad_norm": 347.19915771484375, "learning_rate": 0.00012826340224312874, "loss": 0.3635, "step": 22760 }, { "gate_value": 0.34752917289733887, "icl_sequence_length": 74, "num_contexts": 3, "step": 22760 }, { "grad_norm": 15.640114784240723, "learning_rate": 0.00012814070840885152, "loss": 0.347, "step": 22770 }, { "gate_value": 0.34775689244270325, "icl_sequence_length": 88, "num_contexts": 3, "step": 22770 }, { "grad_norm": 11.231976509094238, "learning_rate": 0.00012801802951519393, "loss": 0.3707, "step": 22780 }, { "gate_value": 0.34792718291282654, "icl_sequence_length": 80, "num_contexts": 3, "step": 22780 }, { "grad_norm": 62.72665023803711, "learning_rate": 0.00012789536564600595, "loss": 0.3661, "step": 22790 }, { "gate_value": 0.34802743792533875, "icl_sequence_length": 66, "num_contexts": 3, "step": 22790 }, { "grad_norm": 10.408900260925293, "learning_rate": 0.0001277727168851271, "loss": 0.3529, "step": 22800 }, { "gate_value": 0.34815752506256104, "icl_sequence_length": 78, "num_contexts": 3, "step": 22800 }, { "grad_norm": 8.37360954284668, "learning_rate": 0.00012765008331638663, "loss": 0.3933, "step": 22810 }, { "gate_value": 0.34828460216522217, "icl_sequence_length": 66, "num_contexts": 3, "step": 22810 }, { "grad_norm": 6.180023193359375, "learning_rate": 0.00012752746502360347, "loss": 0.3553, "step": 22820 }, { "gate_value": 0.3484150171279907, "icl_sequence_length": 94, "num_contexts": 3, "step": 22820 }, { "grad_norm": 6.112893104553223, "learning_rate": 0.00012740486209058608, "loss": 0.3482, "step": 22830 }, { "gate_value": 0.3484891951084137, "icl_sequence_length": 68, "num_contexts": 3, "step": 22830 }, { "grad_norm": 104.99455261230469, "learning_rate": 0.0001272822746011324, "loss": 0.3768, "step": 22840 }, { "gate_value": 0.3485853672027588, "icl_sequence_length": 84, "num_contexts": 3, "step": 22840 }, { "grad_norm": 9.214521408081055, "learning_rate": 0.00012715970263902978, "loss": 0.3643, "step": 22850 }, { "gate_value": 0.3486069142818451, "icl_sequence_length": 74, "num_contexts": 3, "step": 22850 }, { "grad_norm": 29.462339401245117, "learning_rate": 0.00012703714628805503, "loss": 0.3769, "step": 22860 }, { "gate_value": 0.34865206480026245, "icl_sequence_length": 60, "num_contexts": 3, "step": 22860 }, { "grad_norm": 24.155452728271484, "learning_rate": 0.0001269146056319743, "loss": 0.3508, "step": 22870 }, { "gate_value": 0.3487738072872162, "icl_sequence_length": 72, "num_contexts": 3, "step": 22870 }, { "grad_norm": 51.79835891723633, "learning_rate": 0.00012679208075454292, "loss": 0.3564, "step": 22880 }, { "gate_value": 0.34891486167907715, "icl_sequence_length": 86, "num_contexts": 3, "step": 22880 }, { "grad_norm": 5.804798603057861, "learning_rate": 0.00012666957173950558, "loss": 0.372, "step": 22890 }, { "gate_value": 0.34904196858406067, "icl_sequence_length": 76, "num_contexts": 3, "step": 22890 }, { "grad_norm": 16.304189682006836, "learning_rate": 0.0001265470786705959, "loss": 0.3567, "step": 22900 }, { "gate_value": 0.34922027587890625, "icl_sequence_length": 74, "num_contexts": 3, "step": 22900 }, { "grad_norm": 4.497400760650635, "learning_rate": 0.00012642460163153678, "loss": 0.3704, "step": 22910 }, { "gate_value": 0.34943127632141113, "icl_sequence_length": 74, "num_contexts": 3, "step": 22910 }, { "grad_norm": 17.74970245361328, "learning_rate": 0.00012630214070604017, "loss": 0.3658, "step": 22920 }, { "gate_value": 0.3495607078075409, "icl_sequence_length": 82, "num_contexts": 3, "step": 22920 }, { "grad_norm": 13.01252269744873, "learning_rate": 0.00012617969597780693, "loss": 0.3718, "step": 22930 }, { "gate_value": 0.3495199382305145, "icl_sequence_length": 74, "num_contexts": 3, "step": 22930 }, { "grad_norm": 3.868537425994873, "learning_rate": 0.00012605726753052687, "loss": 0.3695, "step": 22940 }, { "gate_value": 0.3495273292064667, "icl_sequence_length": 88, "num_contexts": 3, "step": 22940 }, { "grad_norm": 487.12237548828125, "learning_rate": 0.00012593485544787868, "loss": 0.3789, "step": 22950 }, { "gate_value": 0.3496508002281189, "icl_sequence_length": 84, "num_contexts": 3, "step": 22950 }, { "grad_norm": 3.913041830062866, "learning_rate": 0.00012581245981352986, "loss": 0.3912, "step": 22960 }, { "gate_value": 0.34978440403938293, "icl_sequence_length": 64, "num_contexts": 3, "step": 22960 }, { "grad_norm": 25.10056495666504, "learning_rate": 0.00012569008071113672, "loss": 0.3656, "step": 22970 }, { "gate_value": 0.34976524114608765, "icl_sequence_length": 76, "num_contexts": 3, "step": 22970 }, { "grad_norm": 39.096656799316406, "learning_rate": 0.0001255677182243442, "loss": 0.364, "step": 22980 }, { "gate_value": 0.3497866094112396, "icl_sequence_length": 86, "num_contexts": 3, "step": 22980 }, { "grad_norm": 6.127499103546143, "learning_rate": 0.00012544537243678583, "loss": 0.361, "step": 22990 }, { "gate_value": 0.34986892342567444, "icl_sequence_length": 78, "num_contexts": 3, "step": 22990 }, { "grad_norm": 7.031398296356201, "learning_rate": 0.0001253230434320839, "loss": 0.3651, "step": 23000 }, { "gate_value": 0.34987005591392517, "icl_sequence_length": 66, "num_contexts": 3, "step": 23000 }, { "grad_norm": 11.885310173034668, "learning_rate": 0.00012520073129384908, "loss": 0.3806, "step": 23010 }, { "gate_value": 0.34989798069000244, "icl_sequence_length": 90, "num_contexts": 3, "step": 23010 }, { "grad_norm": 10.68913459777832, "learning_rate": 0.00012507843610568058, "loss": 0.3606, "step": 23020 }, { "gate_value": 0.3498261868953705, "icl_sequence_length": 82, "num_contexts": 3, "step": 23020 }, { "grad_norm": 6.32187032699585, "learning_rate": 0.000124956157951166, "loss": 0.3536, "step": 23030 }, { "gate_value": 0.349841833114624, "icl_sequence_length": 68, "num_contexts": 3, "step": 23030 }, { "grad_norm": 3.669666051864624, "learning_rate": 0.00012483389691388133, "loss": 0.366, "step": 23040 }, { "gate_value": 0.3500032126903534, "icl_sequence_length": 92, "num_contexts": 3, "step": 23040 }, { "grad_norm": 4.756706237792969, "learning_rate": 0.00012471165307739078, "loss": 0.3562, "step": 23050 }, { "gate_value": 0.3502030074596405, "icl_sequence_length": 82, "num_contexts": 3, "step": 23050 }, { "grad_norm": 6.585669994354248, "learning_rate": 0.0001245894265252469, "loss": 0.358, "step": 23060 }, { "gate_value": 0.35031023621559143, "icl_sequence_length": 64, "num_contexts": 3, "step": 23060 }, { "grad_norm": 4.768988609313965, "learning_rate": 0.00012446721734099046, "loss": 0.3816, "step": 23070 }, { "gate_value": 0.35037532448768616, "icl_sequence_length": 72, "num_contexts": 3, "step": 23070 }, { "grad_norm": 7.378237724304199, "learning_rate": 0.00012434502560815017, "loss": 0.3819, "step": 23080 }, { "gate_value": 0.3505246937274933, "icl_sequence_length": 90, "num_contexts": 3, "step": 23080 }, { "grad_norm": 15.408870697021484, "learning_rate": 0.00012422285141024293, "loss": 0.3707, "step": 23090 }, { "gate_value": 0.35071197152137756, "icl_sequence_length": 84, "num_contexts": 3, "step": 23090 }, { "grad_norm": 6.119144439697266, "learning_rate": 0.0001241006948307737, "loss": 0.3516, "step": 23100 }, { "gate_value": 0.3509138226509094, "icl_sequence_length": 88, "num_contexts": 3, "step": 23100 }, { "grad_norm": 19.70357894897461, "learning_rate": 0.00012397855595323534, "loss": 0.3648, "step": 23110 }, { "gate_value": 0.35107842087745667, "icl_sequence_length": 72, "num_contexts": 3, "step": 23110 }, { "grad_norm": 4.001468181610107, "learning_rate": 0.00012385643486110864, "loss": 0.3521, "step": 23120 }, { "gate_value": 0.3513704538345337, "icl_sequence_length": 80, "num_contexts": 3, "step": 23120 }, { "grad_norm": 6.317371845245361, "learning_rate": 0.00012373433163786216, "loss": 0.3623, "step": 23130 }, { "gate_value": 0.35159164667129517, "icl_sequence_length": 74, "num_contexts": 3, "step": 23130 }, { "grad_norm": 7.321961879730225, "learning_rate": 0.00012361224636695236, "loss": 0.3586, "step": 23140 }, { "gate_value": 0.35160747170448303, "icl_sequence_length": 60, "num_contexts": 3, "step": 23140 }, { "grad_norm": 20.925722122192383, "learning_rate": 0.0001234901791318233, "loss": 0.3752, "step": 23150 }, { "gate_value": 0.3516046404838562, "icl_sequence_length": 80, "num_contexts": 3, "step": 23150 }, { "grad_norm": 11.033568382263184, "learning_rate": 0.00012336813001590684, "loss": 0.369, "step": 23160 }, { "gate_value": 0.3517070412635803, "icl_sequence_length": 88, "num_contexts": 3, "step": 23160 }, { "grad_norm": 6.5204339027404785, "learning_rate": 0.0001232460991026225, "loss": 0.3717, "step": 23170 }, { "gate_value": 0.3518829345703125, "icl_sequence_length": 70, "num_contexts": 3, "step": 23170 }, { "grad_norm": 8.151719093322754, "learning_rate": 0.00012312408647537705, "loss": 0.3689, "step": 23180 }, { "gate_value": 0.35193541646003723, "icl_sequence_length": 66, "num_contexts": 3, "step": 23180 }, { "grad_norm": 4.423820495605469, "learning_rate": 0.00012300209221756506, "loss": 0.3564, "step": 23190 }, { "gate_value": 0.35213330388069153, "icl_sequence_length": 82, "num_contexts": 3, "step": 23190 }, { "grad_norm": 34.13200378417969, "learning_rate": 0.0001228801164125685, "loss": 0.3685, "step": 23200 }, { "gate_value": 0.3523845374584198, "icl_sequence_length": 70, "num_contexts": 3, "step": 23200 }, { "grad_norm": 10.104931831359863, "learning_rate": 0.00012275815914375662, "loss": 0.3572, "step": 23210 }, { "gate_value": 0.3526081144809723, "icl_sequence_length": 58, "num_contexts": 3, "step": 23210 }, { "grad_norm": 8.901763916015625, "learning_rate": 0.00012263622049448614, "loss": 0.383, "step": 23220 }, { "gate_value": 0.35284721851348877, "icl_sequence_length": 76, "num_contexts": 3, "step": 23220 }, { "grad_norm": 7.556049346923828, "learning_rate": 0.00012251430054810086, "loss": 0.3847, "step": 23230 }, { "gate_value": 0.3529858887195587, "icl_sequence_length": 64, "num_contexts": 3, "step": 23230 }, { "grad_norm": 21.75843048095703, "learning_rate": 0.00012239239938793204, "loss": 0.3719, "step": 23240 }, { "gate_value": 0.3530533015727997, "icl_sequence_length": 84, "num_contexts": 3, "step": 23240 }, { "grad_norm": 80.72978973388672, "learning_rate": 0.00012227051709729785, "loss": 0.3807, "step": 23250 }, { "gate_value": 0.35311317443847656, "icl_sequence_length": 90, "num_contexts": 3, "step": 23250 }, { "grad_norm": 8.681241989135742, "learning_rate": 0.00012214865375950385, "loss": 0.3587, "step": 23260 }, { "gate_value": 0.3531028926372528, "icl_sequence_length": 88, "num_contexts": 3, "step": 23260 }, { "grad_norm": 34.17095184326172, "learning_rate": 0.0001220268094578423, "loss": 0.3773, "step": 23270 }, { "gate_value": 0.3529850244522095, "icl_sequence_length": 76, "num_contexts": 3, "step": 23270 }, { "grad_norm": 9.34839916229248, "learning_rate": 0.00012190498427559274, "loss": 0.3868, "step": 23280 }, { "gate_value": 0.35284411907196045, "icl_sequence_length": 94, "num_contexts": 3, "step": 23280 }, { "grad_norm": 5.790085315704346, "learning_rate": 0.0001217831782960215, "loss": 0.3751, "step": 23290 }, { "gate_value": 0.3530334234237671, "icl_sequence_length": 82, "num_contexts": 3, "step": 23290 }, { "grad_norm": 46.09357833862305, "learning_rate": 0.00012166139160238184, "loss": 0.3757, "step": 23300 }, { "gate_value": 0.35317593812942505, "icl_sequence_length": 88, "num_contexts": 3, "step": 23300 }, { "grad_norm": 5.113748550415039, "learning_rate": 0.00012153962427791376, "loss": 0.3573, "step": 23310 }, { "gate_value": 0.35320231318473816, "icl_sequence_length": 82, "num_contexts": 3, "step": 23310 }, { "grad_norm": 34.75386047363281, "learning_rate": 0.00012141787640584418, "loss": 0.368, "step": 23320 }, { "gate_value": 0.35337674617767334, "icl_sequence_length": 82, "num_contexts": 3, "step": 23320 }, { "grad_norm": 7.989880084991455, "learning_rate": 0.00012129614806938652, "loss": 0.3824, "step": 23330 }, { "gate_value": 0.3536171317100525, "icl_sequence_length": 84, "num_contexts": 3, "step": 23330 }, { "grad_norm": 6.5942840576171875, "learning_rate": 0.00012117443935174101, "loss": 0.3701, "step": 23340 }, { "gate_value": 0.3538232743740082, "icl_sequence_length": 90, "num_contexts": 3, "step": 23340 }, { "grad_norm": 14.675801277160645, "learning_rate": 0.00012105275033609445, "loss": 0.3856, "step": 23350 }, { "gate_value": 0.3539339303970337, "icl_sequence_length": 80, "num_contexts": 3, "step": 23350 }, { "grad_norm": 26.552522659301758, "learning_rate": 0.00012093108110562001, "loss": 0.3625, "step": 23360 }, { "gate_value": 0.35394522547721863, "icl_sequence_length": 76, "num_contexts": 3, "step": 23360 }, { "grad_norm": 5.980011940002441, "learning_rate": 0.00012080943174347752, "loss": 0.3599, "step": 23370 }, { "gate_value": 0.3539412319660187, "icl_sequence_length": 90, "num_contexts": 3, "step": 23370 }, { "grad_norm": 8.659141540527344, "learning_rate": 0.00012068780233281322, "loss": 0.3612, "step": 23380 }, { "gate_value": 0.3540070652961731, "icl_sequence_length": 60, "num_contexts": 3, "step": 23380 }, { "grad_norm": 40.22635269165039, "learning_rate": 0.00012056619295675959, "loss": 0.3601, "step": 23390 }, { "gate_value": 0.35405296087265015, "icl_sequence_length": 92, "num_contexts": 3, "step": 23390 }, { "grad_norm": 6.858108997344971, "learning_rate": 0.00012044460369843556, "loss": 0.367, "step": 23400 }, { "gate_value": 0.354130357503891, "icl_sequence_length": 90, "num_contexts": 3, "step": 23400 }, { "grad_norm": 7.824497222900391, "learning_rate": 0.00012032303464094619, "loss": 0.376, "step": 23410 }, { "gate_value": 0.3540987968444824, "icl_sequence_length": 76, "num_contexts": 3, "step": 23410 }, { "grad_norm": 13.613783836364746, "learning_rate": 0.00012020148586738284, "loss": 0.3676, "step": 23420 }, { "gate_value": 0.3542543947696686, "icl_sequence_length": 84, "num_contexts": 3, "step": 23420 }, { "grad_norm": 10.455016136169434, "learning_rate": 0.00012007995746082288, "loss": 0.3565, "step": 23430 }, { "gate_value": 0.3542884290218353, "icl_sequence_length": 76, "num_contexts": 3, "step": 23430 }, { "grad_norm": 5.269458770751953, "learning_rate": 0.0001199584495043299, "loss": 0.3641, "step": 23440 }, { "gate_value": 0.3544134199619293, "icl_sequence_length": 74, "num_contexts": 3, "step": 23440 }, { "grad_norm": 18.11334228515625, "learning_rate": 0.00011983696208095342, "loss": 0.3571, "step": 23450 }, { "gate_value": 0.35484281182289124, "icl_sequence_length": 60, "num_contexts": 3, "step": 23450 }, { "grad_norm": 33.68678665161133, "learning_rate": 0.0001197154952737289, "loss": 0.3826, "step": 23460 }, { "gate_value": 0.3551397919654846, "icl_sequence_length": 94, "num_contexts": 3, "step": 23460 }, { "grad_norm": 11.15624713897705, "learning_rate": 0.0001195940491656778, "loss": 0.3778, "step": 23470 }, { "gate_value": 0.35536882281303406, "icl_sequence_length": 84, "num_contexts": 3, "step": 23470 }, { "grad_norm": 108.3549575805664, "learning_rate": 0.00011947262383980739, "loss": 0.3563, "step": 23480 }, { "gate_value": 0.35545867681503296, "icl_sequence_length": 78, "num_contexts": 3, "step": 23480 }, { "grad_norm": 12.360928535461426, "learning_rate": 0.00011935121937911072, "loss": 0.3551, "step": 23490 }, { "gate_value": 0.3555486500263214, "icl_sequence_length": 88, "num_contexts": 3, "step": 23490 }, { "grad_norm": 5.003032684326172, "learning_rate": 0.00011922983586656662, "loss": 0.3574, "step": 23500 }, { "gate_value": 0.3557662069797516, "icl_sequence_length": 88, "num_contexts": 3, "step": 23500 }, { "grad_norm": 4.80068302154541, "learning_rate": 0.00011910847338513953, "loss": 0.3841, "step": 23510 }, { "gate_value": 0.35597875714302063, "icl_sequence_length": 84, "num_contexts": 3, "step": 23510 }, { "grad_norm": 6.734328746795654, "learning_rate": 0.00011898713201777963, "loss": 0.3725, "step": 23520 }, { "gate_value": 0.3559677302837372, "icl_sequence_length": 64, "num_contexts": 3, "step": 23520 }, { "grad_norm": 12.359806060791016, "learning_rate": 0.00011886581184742252, "loss": 0.3892, "step": 23530 }, { "gate_value": 0.35603559017181396, "icl_sequence_length": 86, "num_contexts": 3, "step": 23530 }, { "grad_norm": 7.419463634490967, "learning_rate": 0.00011874451295698951, "loss": 0.3613, "step": 23540 }, { "gate_value": 0.35623571276664734, "icl_sequence_length": 84, "num_contexts": 3, "step": 23540 }, { "grad_norm": 11.626155853271484, "learning_rate": 0.00011862323542938713, "loss": 0.3709, "step": 23550 }, { "gate_value": 0.35649681091308594, "icl_sequence_length": 56, "num_contexts": 3, "step": 23550 }, { "grad_norm": 4.980889797210693, "learning_rate": 0.00011850197934750746, "loss": 0.3743, "step": 23560 }, { "gate_value": 0.3565058708190918, "icl_sequence_length": 88, "num_contexts": 3, "step": 23560 }, { "grad_norm": 7.546865940093994, "learning_rate": 0.00011838074479422787, "loss": 0.3708, "step": 23570 }, { "gate_value": 0.3565681278705597, "icl_sequence_length": 76, "num_contexts": 3, "step": 23570 }, { "grad_norm": 8.491681098937988, "learning_rate": 0.0001182595318524111, "loss": 0.3679, "step": 23580 }, { "gate_value": 0.35658058524131775, "icl_sequence_length": 92, "num_contexts": 3, "step": 23580 }, { "grad_norm": 10.987086296081543, "learning_rate": 0.000118138340604905, "loss": 0.3723, "step": 23590 }, { "gate_value": 0.35657426714897156, "icl_sequence_length": 90, "num_contexts": 3, "step": 23590 }, { "grad_norm": 8.058330535888672, "learning_rate": 0.00011801717113454266, "loss": 0.3835, "step": 23600 }, { "gate_value": 0.35659709572792053, "icl_sequence_length": 88, "num_contexts": 3, "step": 23600 }, { "grad_norm": 9.122367858886719, "learning_rate": 0.00011789602352414227, "loss": 0.3658, "step": 23610 }, { "gate_value": 0.35666391253471375, "icl_sequence_length": 82, "num_contexts": 3, "step": 23610 }, { "grad_norm": 6.175402641296387, "learning_rate": 0.0001177748978565071, "loss": 0.3492, "step": 23620 }, { "gate_value": 0.3566833734512329, "icl_sequence_length": 82, "num_contexts": 3, "step": 23620 }, { "grad_norm": 10.220946311950684, "learning_rate": 0.0001176537942144254, "loss": 0.38, "step": 23630 }, { "gate_value": 0.3567415773868561, "icl_sequence_length": 82, "num_contexts": 3, "step": 23630 }, { "grad_norm": 15.779232025146484, "learning_rate": 0.0001175327126806703, "loss": 0.3552, "step": 23640 }, { "gate_value": 0.3568035364151001, "icl_sequence_length": 80, "num_contexts": 3, "step": 23640 }, { "grad_norm": 9.174988746643066, "learning_rate": 0.00011741165333799996, "loss": 0.3688, "step": 23650 }, { "gate_value": 0.35696929693222046, "icl_sequence_length": 70, "num_contexts": 3, "step": 23650 }, { "grad_norm": 11.645748138427734, "learning_rate": 0.00011729061626915723, "loss": 0.3737, "step": 23660 }, { "gate_value": 0.35708051919937134, "icl_sequence_length": 80, "num_contexts": 3, "step": 23660 }, { "grad_norm": 6.972232818603516, "learning_rate": 0.00011716960155686986, "loss": 0.3481, "step": 23670 }, { "gate_value": 0.3572835326194763, "icl_sequence_length": 76, "num_contexts": 3, "step": 23670 }, { "grad_norm": 6.440983772277832, "learning_rate": 0.00011704860928385028, "loss": 0.36, "step": 23680 }, { "gate_value": 0.3574623465538025, "icl_sequence_length": 90, "num_contexts": 3, "step": 23680 }, { "grad_norm": 12.463038444519043, "learning_rate": 0.00011692763953279552, "loss": 0.3795, "step": 23690 }, { "gate_value": 0.3576089143753052, "icl_sequence_length": 82, "num_contexts": 3, "step": 23690 }, { "grad_norm": 10.757335662841797, "learning_rate": 0.00011680669238638731, "loss": 0.3795, "step": 23700 }, { "gate_value": 0.3578938841819763, "icl_sequence_length": 66, "num_contexts": 3, "step": 23700 }, { "grad_norm": 8.984125137329102, "learning_rate": 0.00011668576792729182, "loss": 0.3545, "step": 23710 }, { "gate_value": 0.3580709993839264, "icl_sequence_length": 88, "num_contexts": 3, "step": 23710 }, { "grad_norm": 3.919456958770752, "learning_rate": 0.00011656486623815987, "loss": 0.3886, "step": 23720 }, { "gate_value": 0.35818031430244446, "icl_sequence_length": 82, "num_contexts": 3, "step": 23720 }, { "grad_norm": 5.882258892059326, "learning_rate": 0.00011644398740162659, "loss": 0.3733, "step": 23730 }, { "gate_value": 0.3582715690135956, "icl_sequence_length": 66, "num_contexts": 3, "step": 23730 }, { "grad_norm": 7.3552350997924805, "learning_rate": 0.00011632313150031144, "loss": 0.3645, "step": 23740 }, { "gate_value": 0.35839003324508667, "icl_sequence_length": 84, "num_contexts": 3, "step": 23740 }, { "grad_norm": 5.509767055511475, "learning_rate": 0.0001162022986168184, "loss": 0.3778, "step": 23750 }, { "gate_value": 0.35857802629470825, "icl_sequence_length": 72, "num_contexts": 3, "step": 23750 }, { "grad_norm": 3.078181743621826, "learning_rate": 0.00011608148883373552, "loss": 0.3715, "step": 23760 }, { "gate_value": 0.3587779402732849, "icl_sequence_length": 72, "num_contexts": 3, "step": 23760 }, { "grad_norm": 4.447902202606201, "learning_rate": 0.00011596070223363518, "loss": 0.3671, "step": 23770 }, { "gate_value": 0.3586958646774292, "icl_sequence_length": 88, "num_contexts": 3, "step": 23770 }, { "grad_norm": 171.47337341308594, "learning_rate": 0.00011583993889907394, "loss": 0.3671, "step": 23780 }, { "gate_value": 0.358342707157135, "icl_sequence_length": 70, "num_contexts": 3, "step": 23780 }, { "grad_norm": 979.877685546875, "learning_rate": 0.00011571919891259232, "loss": 0.3736, "step": 23790 }, { "gate_value": 0.3582010269165039, "icl_sequence_length": 88, "num_contexts": 3, "step": 23790 }, { "grad_norm": 796.0517578125, "learning_rate": 0.00011559848235671502, "loss": 0.3792, "step": 23800 }, { "gate_value": 0.3581511974334717, "icl_sequence_length": 70, "num_contexts": 3, "step": 23800 }, { "grad_norm": 504.9575500488281, "learning_rate": 0.00011547778931395063, "loss": 0.3782, "step": 23810 }, { "gate_value": 0.35812196135520935, "icl_sequence_length": 76, "num_contexts": 3, "step": 23810 }, { "grad_norm": 93.6175537109375, "learning_rate": 0.00011535711986679174, "loss": 0.3551, "step": 23820 }, { "gate_value": 0.35810139775276184, "icl_sequence_length": 76, "num_contexts": 3, "step": 23820 }, { "grad_norm": 245.25450134277344, "learning_rate": 0.00011523647409771476, "loss": 0.3864, "step": 23830 }, { "gate_value": 0.35808372497558594, "icl_sequence_length": 76, "num_contexts": 3, "step": 23830 }, { "grad_norm": 345.7886657714844, "learning_rate": 0.00011511585208917989, "loss": 0.3719, "step": 23840 }, { "gate_value": 0.35808631777763367, "icl_sequence_length": 64, "num_contexts": 3, "step": 23840 }, { "grad_norm": 258.2336730957031, "learning_rate": 0.00011499525392363123, "loss": 0.3563, "step": 23850 }, { "gate_value": 0.3580664396286011, "icl_sequence_length": 82, "num_contexts": 3, "step": 23850 }, { "grad_norm": 555.8143310546875, "learning_rate": 0.00011487467968349639, "loss": 0.3903, "step": 23860 }, { "gate_value": 0.3580431044101715, "icl_sequence_length": 82, "num_contexts": 3, "step": 23860 }, { "grad_norm": 139.11825561523438, "learning_rate": 0.00011475412945118677, "loss": 0.3692, "step": 23870 }, { "gate_value": 0.35803794860839844, "icl_sequence_length": 80, "num_contexts": 3, "step": 23870 }, { "grad_norm": 123.66503143310547, "learning_rate": 0.00011463360330909737, "loss": 0.3847, "step": 23880 }, { "gate_value": 0.35804980993270874, "icl_sequence_length": 82, "num_contexts": 3, "step": 23880 }, { "grad_norm": 554.3246459960938, "learning_rate": 0.00011451310133960658, "loss": 0.3758, "step": 23890 }, { "gate_value": 0.35810431838035583, "icl_sequence_length": 82, "num_contexts": 3, "step": 23890 }, { "grad_norm": 50.21940231323242, "learning_rate": 0.00011439262362507644, "loss": 0.3665, "step": 23900 }, { "gate_value": 0.3581521511077881, "icl_sequence_length": 68, "num_contexts": 3, "step": 23900 }, { "grad_norm": 32.6104736328125, "learning_rate": 0.00011427217024785232, "loss": 0.3556, "step": 23910 }, { "gate_value": 0.3581998348236084, "icl_sequence_length": 86, "num_contexts": 3, "step": 23910 }, { "grad_norm": 239.99615478515625, "learning_rate": 0.00011415174129026288, "loss": 0.3566, "step": 23920 }, { "gate_value": 0.3582267165184021, "icl_sequence_length": 80, "num_contexts": 3, "step": 23920 }, { "grad_norm": 405.5616455078125, "learning_rate": 0.00011403133683462027, "loss": 0.3926, "step": 23930 }, { "gate_value": 0.35823509097099304, "icl_sequence_length": 92, "num_contexts": 3, "step": 23930 }, { "grad_norm": 1989.842529296875, "learning_rate": 0.00011391095696321974, "loss": 0.3832, "step": 23940 }, { "gate_value": 0.3582380712032318, "icl_sequence_length": 82, "num_contexts": 3, "step": 23940 }, { "grad_norm": 7928.72412109375, "learning_rate": 0.00011379060175833986, "loss": 0.3617, "step": 23950 }, { "gate_value": 0.3582387864589691, "icl_sequence_length": 82, "num_contexts": 3, "step": 23950 }, { "grad_norm": 1746.5020751953125, "learning_rate": 0.0001136702713022422, "loss": 0.3707, "step": 23960 }, { "gate_value": 0.3582383990287781, "icl_sequence_length": 76, "num_contexts": 3, "step": 23960 }, { "grad_norm": 175.7129669189453, "learning_rate": 0.00011354996567717156, "loss": 0.3878, "step": 23970 }, { "gate_value": 0.3582378029823303, "icl_sequence_length": 88, "num_contexts": 3, "step": 23970 }, { "grad_norm": 361.2430419921875, "learning_rate": 0.00011342968496535568, "loss": 0.3773, "step": 23980 }, { "gate_value": 0.3582334816455841, "icl_sequence_length": 74, "num_contexts": 3, "step": 23980 }, { "grad_norm": 293.09912109375, "learning_rate": 0.00011330942924900529, "loss": 0.3734, "step": 23990 }, { "gate_value": 0.3582324683666229, "icl_sequence_length": 82, "num_contexts": 3, "step": 23990 }, { "grad_norm": 143.7857666015625, "learning_rate": 0.00011318919861031403, "loss": 0.3632, "step": 24000 }, { "gate_value": 0.35823488235473633, "icl_sequence_length": 60, "num_contexts": 3, "step": 24000 }, { "grad_norm": 1418.47119140625, "learning_rate": 0.00011306899313145848, "loss": 0.3703, "step": 24010 }, { "gate_value": 0.3582359850406647, "icl_sequence_length": 80, "num_contexts": 3, "step": 24010 }, { "grad_norm": 150.08474731445312, "learning_rate": 0.00011294881289459782, "loss": 0.3712, "step": 24020 }, { "gate_value": 0.3582356572151184, "icl_sequence_length": 86, "num_contexts": 3, "step": 24020 }, { "grad_norm": 91.68392944335938, "learning_rate": 0.00011282865798187417, "loss": 0.3697, "step": 24030 }, { "gate_value": 0.3582378625869751, "icl_sequence_length": 80, "num_contexts": 3, "step": 24030 }, { "grad_norm": 155.62538146972656, "learning_rate": 0.00011270852847541228, "loss": 0.382, "step": 24040 }, { "gate_value": 0.3582373559474945, "icl_sequence_length": 84, "num_contexts": 3, "step": 24040 }, { "grad_norm": 579.9232788085938, "learning_rate": 0.00011258842445731954, "loss": 0.352, "step": 24050 }, { "gate_value": 0.3582456409931183, "icl_sequence_length": 82, "num_contexts": 3, "step": 24050 }, { "grad_norm": 5258.4619140625, "learning_rate": 0.00011246834600968594, "loss": 0.3899, "step": 24060 }, { "gate_value": 0.3582465946674347, "icl_sequence_length": 82, "num_contexts": 3, "step": 24060 }, { "grad_norm": 508.95184326171875, "learning_rate": 0.00011234829321458392, "loss": 0.3826, "step": 24070 }, { "gate_value": 0.3582475483417511, "icl_sequence_length": 78, "num_contexts": 3, "step": 24070 }, { "grad_norm": 120.9805908203125, "learning_rate": 0.00011222826615406848, "loss": 0.3579, "step": 24080 }, { "gate_value": 0.3582461178302765, "icl_sequence_length": 78, "num_contexts": 3, "step": 24080 }, { "grad_norm": 90.79205322265625, "learning_rate": 0.00011210826491017692, "loss": 0.3545, "step": 24090 }, { "gate_value": 0.35825929045677185, "icl_sequence_length": 70, "num_contexts": 3, "step": 24090 }, { "grad_norm": 478.6510925292969, "learning_rate": 0.00011198828956492907, "loss": 0.3541, "step": 24100 }, { "gate_value": 0.3582655191421509, "icl_sequence_length": 84, "num_contexts": 3, "step": 24100 }, { "grad_norm": 139.04241943359375, "learning_rate": 0.00011186834020032682, "loss": 0.3758, "step": 24110 }, { "gate_value": 0.3582736849784851, "icl_sequence_length": 72, "num_contexts": 3, "step": 24110 }, { "grad_norm": 107.1037826538086, "learning_rate": 0.00011174841689835446, "loss": 0.3903, "step": 24120 }, { "gate_value": 0.3582884967327118, "icl_sequence_length": 86, "num_contexts": 3, "step": 24120 }, { "grad_norm": 246.1360626220703, "learning_rate": 0.0001116285197409785, "loss": 0.3665, "step": 24130 }, { "gate_value": 0.35830172896385193, "icl_sequence_length": 78, "num_contexts": 3, "step": 24130 }, { "grad_norm": 419.01885986328125, "learning_rate": 0.00011150864881014744, "loss": 0.36, "step": 24140 }, { "gate_value": 0.3583115041255951, "icl_sequence_length": 70, "num_contexts": 3, "step": 24140 }, { "grad_norm": 1557.142333984375, "learning_rate": 0.00011138880418779196, "loss": 0.354, "step": 24150 }, { "gate_value": 0.35832417011260986, "icl_sequence_length": 90, "num_contexts": 3, "step": 24150 }, { "grad_norm": 193.56314086914062, "learning_rate": 0.00011126898595582478, "loss": 0.3682, "step": 24160 }, { "gate_value": 0.3583417236804962, "icl_sequence_length": 86, "num_contexts": 3, "step": 24160 }, { "grad_norm": 2367.890625, "learning_rate": 0.00011114919419614045, "loss": 0.3756, "step": 24170 }, { "gate_value": 0.3583694100379944, "icl_sequence_length": 80, "num_contexts": 3, "step": 24170 }, { "grad_norm": 17580.013671875, "learning_rate": 0.00011102942899061557, "loss": 0.3674, "step": 24180 }, { "gate_value": 0.3583917021751404, "icl_sequence_length": 88, "num_contexts": 3, "step": 24180 }, { "grad_norm": 766.1683959960938, "learning_rate": 0.00011090969042110854, "loss": 0.3536, "step": 24190 }, { "gate_value": 0.35841184854507446, "icl_sequence_length": 80, "num_contexts": 3, "step": 24190 }, { "grad_norm": 102.75213623046875, "learning_rate": 0.00011078997856945947, "loss": 0.3688, "step": 24200 }, { "gate_value": 0.3584311306476593, "icl_sequence_length": 64, "num_contexts": 3, "step": 24200 }, { "grad_norm": 593.046630859375, "learning_rate": 0.00011067029351749032, "loss": 0.3568, "step": 24210 }, { "gate_value": 0.3584307134151459, "icl_sequence_length": 80, "num_contexts": 3, "step": 24210 }, { "grad_norm": 256.4482116699219, "learning_rate": 0.00011055063534700468, "loss": 0.358, "step": 24220 }, { "gate_value": 0.3584342896938324, "icl_sequence_length": 78, "num_contexts": 3, "step": 24220 }, { "grad_norm": 180.25892639160156, "learning_rate": 0.00011043100413978781, "loss": 0.3772, "step": 24230 }, { "gate_value": 0.3584383428096771, "icl_sequence_length": 72, "num_contexts": 3, "step": 24230 }, { "grad_norm": 46.64565658569336, "learning_rate": 0.00011031139997760648, "loss": 0.3727, "step": 24240 }, { "gate_value": 0.35844966769218445, "icl_sequence_length": 78, "num_contexts": 3, "step": 24240 }, { "grad_norm": 50854.26171875, "learning_rate": 0.000110191822942209, "loss": 0.3704, "step": 24250 }, { "gate_value": 0.35844776034355164, "icl_sequence_length": 86, "num_contexts": 3, "step": 24250 }, { "grad_norm": 65.85983276367188, "learning_rate": 0.00011007227311532522, "loss": 0.3737, "step": 24260 }, { "gate_value": 0.3584485948085785, "icl_sequence_length": 82, "num_contexts": 3, "step": 24260 }, { "grad_norm": 1420.6707763671875, "learning_rate": 0.00010995275057866624, "loss": 0.361, "step": 24270 }, { "gate_value": 0.35847553610801697, "icl_sequence_length": 78, "num_contexts": 3, "step": 24270 }, { "grad_norm": 78.28116607666016, "learning_rate": 0.00010983325541392469, "loss": 0.3515, "step": 24280 }, { "gate_value": 0.3584812879562378, "icl_sequence_length": 92, "num_contexts": 3, "step": 24280 }, { "grad_norm": 233.1837158203125, "learning_rate": 0.00010971378770277426, "loss": 0.3608, "step": 24290 }, { "gate_value": 0.3584992289543152, "icl_sequence_length": 70, "num_contexts": 3, "step": 24290 }, { "grad_norm": 728.71630859375, "learning_rate": 0.00010959434752687004, "loss": 0.3722, "step": 24300 }, { "gate_value": 0.3585113286972046, "icl_sequence_length": 90, "num_contexts": 3, "step": 24300 }, { "grad_norm": 5302.78564453125, "learning_rate": 0.00010947493496784829, "loss": 0.3637, "step": 24310 }, { "gate_value": 0.35852137207984924, "icl_sequence_length": 86, "num_contexts": 3, "step": 24310 }, { "grad_norm": 34129.52734375, "learning_rate": 0.00010935555010732636, "loss": 0.358, "step": 24320 }, { "gate_value": 0.3585313558578491, "icl_sequence_length": 82, "num_contexts": 3, "step": 24320 }, { "grad_norm": 298.13037109375, "learning_rate": 0.0001092361930269027, "loss": 0.3602, "step": 24330 }, { "gate_value": 0.3585515320301056, "icl_sequence_length": 90, "num_contexts": 3, "step": 24330 }, { "grad_norm": 56.191341400146484, "learning_rate": 0.00010911686380815671, "loss": 0.3825, "step": 24340 }, { "gate_value": 0.35857197642326355, "icl_sequence_length": 76, "num_contexts": 3, "step": 24340 }, { "grad_norm": 186.09190368652344, "learning_rate": 0.00010899756253264879, "loss": 0.3624, "step": 24350 }, { "gate_value": 0.35859212279319763, "icl_sequence_length": 72, "num_contexts": 3, "step": 24350 }, { "grad_norm": 259.10968017578125, "learning_rate": 0.00010887828928192026, "loss": 0.366, "step": 24360 }, { "gate_value": 0.35859882831573486, "icl_sequence_length": 86, "num_contexts": 3, "step": 24360 }, { "grad_norm": 705.4097290039062, "learning_rate": 0.00010875904413749324, "loss": 0.3629, "step": 24370 }, { "gate_value": 0.35860174894332886, "icl_sequence_length": 80, "num_contexts": 3, "step": 24370 }, { "grad_norm": 906.2277221679688, "learning_rate": 0.00010863982718087074, "loss": 0.3606, "step": 24380 }, { "gate_value": 0.35860127210617065, "icl_sequence_length": 62, "num_contexts": 3, "step": 24380 }, { "grad_norm": 122.35824584960938, "learning_rate": 0.0001085206384935363, "loss": 0.3591, "step": 24390 }, { "gate_value": 0.3586183786392212, "icl_sequence_length": 74, "num_contexts": 3, "step": 24390 }, { "grad_norm": 111.16336059570312, "learning_rate": 0.00010840147815695433, "loss": 0.3618, "step": 24400 }, { "gate_value": 0.3586495816707611, "icl_sequence_length": 76, "num_contexts": 3, "step": 24400 }, { "grad_norm": 131.8732147216797, "learning_rate": 0.0001082823462525698, "loss": 0.3708, "step": 24410 }, { "gate_value": 0.35866519808769226, "icl_sequence_length": 82, "num_contexts": 3, "step": 24410 }, { "grad_norm": 65.82928466796875, "learning_rate": 0.0001081632428618082, "loss": 0.3634, "step": 24420 }, { "gate_value": 0.3587072193622589, "icl_sequence_length": 74, "num_contexts": 3, "step": 24420 }, { "grad_norm": 30504.029296875, "learning_rate": 0.00010804416806607563, "loss": 0.372, "step": 24430 }, { "gate_value": 0.35872122645378113, "icl_sequence_length": 94, "num_contexts": 3, "step": 24430 }, { "grad_norm": 50.18363571166992, "learning_rate": 0.00010792512194675855, "loss": 0.3558, "step": 24440 }, { "gate_value": 0.35872602462768555, "icl_sequence_length": 70, "num_contexts": 3, "step": 24440 }, { "grad_norm": 92.87120056152344, "learning_rate": 0.0001078061045852239, "loss": 0.3688, "step": 24450 }, { "gate_value": 0.3587706387042999, "icl_sequence_length": 80, "num_contexts": 3, "step": 24450 }, { "grad_norm": 62.7234992980957, "learning_rate": 0.00010768711606281889, "loss": 0.3725, "step": 24460 }, { "gate_value": 0.3588024973869324, "icl_sequence_length": 88, "num_contexts": 3, "step": 24460 }, { "grad_norm": 70.35800170898438, "learning_rate": 0.00010756815646087111, "loss": 0.3723, "step": 24470 }, { "gate_value": 0.358820378780365, "icl_sequence_length": 78, "num_contexts": 3, "step": 24470 }, { "grad_norm": 96.2275161743164, "learning_rate": 0.00010744922586068823, "loss": 0.369, "step": 24480 }, { "gate_value": 0.3588525950908661, "icl_sequence_length": 74, "num_contexts": 3, "step": 24480 }, { "grad_norm": 71.24588775634766, "learning_rate": 0.00010733032434355827, "loss": 0.361, "step": 24490 }, { "gate_value": 0.35888391733169556, "icl_sequence_length": 90, "num_contexts": 3, "step": 24490 }, { "grad_norm": 69.96412658691406, "learning_rate": 0.00010721145199074923, "loss": 0.359, "step": 24500 }, { "gate_value": 0.3589116930961609, "icl_sequence_length": 92, "num_contexts": 3, "step": 24500 }, { "grad_norm": 99.85425567626953, "learning_rate": 0.00010709260888350931, "loss": 0.3724, "step": 24510 }, { "gate_value": 0.3589341938495636, "icl_sequence_length": 82, "num_contexts": 3, "step": 24510 }, { "grad_norm": 118.61331176757812, "learning_rate": 0.0001069737951030666, "loss": 0.3793, "step": 24520 }, { "gate_value": 0.35894277691841125, "icl_sequence_length": 70, "num_contexts": 3, "step": 24520 }, { "grad_norm": 524.7838134765625, "learning_rate": 0.00010685501073062927, "loss": 0.3666, "step": 24530 }, { "gate_value": 0.3589576482772827, "icl_sequence_length": 68, "num_contexts": 3, "step": 24530 }, { "grad_norm": 2431.64501953125, "learning_rate": 0.00010673625584738523, "loss": 0.3836, "step": 24540 }, { "gate_value": 0.3589611351490021, "icl_sequence_length": 86, "num_contexts": 3, "step": 24540 }, { "grad_norm": 881.9794311523438, "learning_rate": 0.00010661753053450237, "loss": 0.3519, "step": 24550 }, { "gate_value": 0.3589634299278259, "icl_sequence_length": 88, "num_contexts": 3, "step": 24550 }, { "grad_norm": 132.37796020507812, "learning_rate": 0.00010649883487312836, "loss": 0.371, "step": 24560 }, { "gate_value": 0.3589780330657959, "icl_sequence_length": 82, "num_contexts": 3, "step": 24560 }, { "grad_norm": 98.35554504394531, "learning_rate": 0.00010638016894439051, "loss": 0.3563, "step": 24570 }, { "gate_value": 0.35899582505226135, "icl_sequence_length": 74, "num_contexts": 3, "step": 24570 }, { "grad_norm": 107.20555114746094, "learning_rate": 0.00010626153282939586, "loss": 0.3631, "step": 24580 }, { "gate_value": 0.35901913046836853, "icl_sequence_length": 74, "num_contexts": 3, "step": 24580 }, { "grad_norm": 213.93869018554688, "learning_rate": 0.00010614292660923108, "loss": 0.3753, "step": 24590 }, { "gate_value": 0.3590436279773712, "icl_sequence_length": 70, "num_contexts": 3, "step": 24590 }, { "grad_norm": 93.29708099365234, "learning_rate": 0.00010602435036496243, "loss": 0.361, "step": 24600 }, { "gate_value": 0.35905933380126953, "icl_sequence_length": 76, "num_contexts": 3, "step": 24600 }, { "grad_norm": 68.2582015991211, "learning_rate": 0.00010590580417763564, "loss": 0.3781, "step": 24610 }, { "gate_value": 0.3590523898601532, "icl_sequence_length": 78, "num_contexts": 3, "step": 24610 }, { "grad_norm": 176.111572265625, "learning_rate": 0.00010578728812827589, "loss": 0.3723, "step": 24620 }, { "gate_value": 0.3590545058250427, "icl_sequence_length": 80, "num_contexts": 3, "step": 24620 }, { "grad_norm": 85.26171875, "learning_rate": 0.00010566880229788784, "loss": 0.3428, "step": 24630 }, { "gate_value": 0.359076589345932, "icl_sequence_length": 78, "num_contexts": 3, "step": 24630 }, { "grad_norm": 114.50090789794922, "learning_rate": 0.00010555034676745537, "loss": 0.3815, "step": 24640 }, { "gate_value": 0.35911232233047485, "icl_sequence_length": 78, "num_contexts": 3, "step": 24640 }, { "grad_norm": 80.09989166259766, "learning_rate": 0.00010543192161794174, "loss": 0.3737, "step": 24650 }, { "gate_value": 0.35913920402526855, "icl_sequence_length": 82, "num_contexts": 3, "step": 24650 }, { "grad_norm": 61.79484176635742, "learning_rate": 0.00010531352693028951, "loss": 0.349, "step": 24660 }, { "gate_value": 0.35916125774383545, "icl_sequence_length": 94, "num_contexts": 3, "step": 24660 }, { "grad_norm": 139.99099731445312, "learning_rate": 0.0001051951627854202, "loss": 0.3705, "step": 24670 }, { "gate_value": 0.35917332768440247, "icl_sequence_length": 84, "num_contexts": 3, "step": 24670 }, { "grad_norm": 70.59046936035156, "learning_rate": 0.00010507682926423463, "loss": 0.3577, "step": 24680 }, { "gate_value": 0.3591974973678589, "icl_sequence_length": 88, "num_contexts": 3, "step": 24680 }, { "grad_norm": 50.57143020629883, "learning_rate": 0.00010495852644761268, "loss": 0.3644, "step": 24690 }, { "gate_value": 0.35923057794570923, "icl_sequence_length": 84, "num_contexts": 3, "step": 24690 }, { "grad_norm": 81.95945739746094, "learning_rate": 0.00010484025441641315, "loss": 0.3672, "step": 24700 }, { "gate_value": 0.3592517673969269, "icl_sequence_length": 84, "num_contexts": 3, "step": 24700 }, { "grad_norm": 213.39706420898438, "learning_rate": 0.00010472201325147395, "loss": 0.3712, "step": 24710 }, { "gate_value": 0.35926130414009094, "icl_sequence_length": 68, "num_contexts": 3, "step": 24710 }, { "grad_norm": 67.57164764404297, "learning_rate": 0.0001046038030336117, "loss": 0.3726, "step": 24720 }, { "gate_value": 0.3592642843723297, "icl_sequence_length": 70, "num_contexts": 3, "step": 24720 }, { "grad_norm": 68.6855697631836, "learning_rate": 0.00010448562384362204, "loss": 0.3476, "step": 24730 }, { "gate_value": 0.35928410291671753, "icl_sequence_length": 80, "num_contexts": 3, "step": 24730 }, { "grad_norm": 804.9916381835938, "learning_rate": 0.00010436747576227928, "loss": 0.3575, "step": 24740 }, { "gate_value": 0.3593326210975647, "icl_sequence_length": 84, "num_contexts": 3, "step": 24740 }, { "grad_norm": 251.33657836914062, "learning_rate": 0.0001042493588703366, "loss": 0.3619, "step": 24750 }, { "gate_value": 0.35940855741500854, "icl_sequence_length": 92, "num_contexts": 3, "step": 24750 }, { "grad_norm": 360.7458190917969, "learning_rate": 0.00010413127324852569, "loss": 0.3761, "step": 24760 }, { "gate_value": 0.35946398973464966, "icl_sequence_length": 86, "num_contexts": 3, "step": 24760 }, { "grad_norm": 629.7045288085938, "learning_rate": 0.00010401321897755703, "loss": 0.357, "step": 24770 }, { "gate_value": 0.3594951033592224, "icl_sequence_length": 66, "num_contexts": 3, "step": 24770 }, { "grad_norm": 667.2528686523438, "learning_rate": 0.00010389519613811952, "loss": 0.3638, "step": 24780 }, { "gate_value": 0.359512060880661, "icl_sequence_length": 90, "num_contexts": 3, "step": 24780 }, { "grad_norm": 322.2542419433594, "learning_rate": 0.00010377720481088076, "loss": 0.3624, "step": 24790 }, { "gate_value": 0.3595242500305176, "icl_sequence_length": 70, "num_contexts": 3, "step": 24790 }, { "grad_norm": 302.0667724609375, "learning_rate": 0.0001036592450764866, "loss": 0.3551, "step": 24800 }, { "gate_value": 0.35954952239990234, "icl_sequence_length": 88, "num_contexts": 3, "step": 24800 }, { "grad_norm": 508.2187805175781, "learning_rate": 0.00010354131701556152, "loss": 0.3738, "step": 24810 }, { "gate_value": 0.3595583140850067, "icl_sequence_length": 70, "num_contexts": 3, "step": 24810 }, { "grad_norm": 240.36524963378906, "learning_rate": 0.00010342342070870813, "loss": 0.3611, "step": 24820 }, { "gate_value": 0.3595639169216156, "icl_sequence_length": 68, "num_contexts": 3, "step": 24820 }, { "grad_norm": 594.7509155273438, "learning_rate": 0.00010330555623650753, "loss": 0.3784, "step": 24830 }, { "gate_value": 0.35957181453704834, "icl_sequence_length": 76, "num_contexts": 3, "step": 24830 }, { "grad_norm": 141.8068084716797, "learning_rate": 0.00010318772367951898, "loss": 0.3478, "step": 24840 }, { "gate_value": 0.359592080116272, "icl_sequence_length": 90, "num_contexts": 3, "step": 24840 }, { "grad_norm": 89.36084747314453, "learning_rate": 0.00010306992311827981, "loss": 0.37, "step": 24850 }, { "gate_value": 0.35961490869522095, "icl_sequence_length": 82, "num_contexts": 3, "step": 24850 }, { "grad_norm": 53.824249267578125, "learning_rate": 0.00010295215463330568, "loss": 0.3767, "step": 24860 }, { "gate_value": 0.35963571071624756, "icl_sequence_length": 76, "num_contexts": 3, "step": 24860 }, { "grad_norm": 55.34401321411133, "learning_rate": 0.00010283441830509023, "loss": 0.3731, "step": 24870 }, { "gate_value": 0.3596397340297699, "icl_sequence_length": 72, "num_contexts": 3, "step": 24870 }, { "grad_norm": 100.6485366821289, "learning_rate": 0.0001027167142141051, "loss": 0.379, "step": 24880 }, { "gate_value": 0.35964563488960266, "icl_sequence_length": 80, "num_contexts": 3, "step": 24880 }, { "grad_norm": 31.52804946899414, "learning_rate": 0.00010259904244079998, "loss": 0.364, "step": 24890 }, { "gate_value": 0.359668493270874, "icl_sequence_length": 80, "num_contexts": 3, "step": 24890 }, { "grad_norm": 260.1759338378906, "learning_rate": 0.00010248140306560238, "loss": 0.3591, "step": 24900 }, { "gate_value": 0.3596940040588379, "icl_sequence_length": 86, "num_contexts": 3, "step": 24900 }, { "grad_norm": 939.1127319335938, "learning_rate": 0.00010236379616891772, "loss": 0.3494, "step": 24910 }, { "gate_value": 0.35971933603286743, "icl_sequence_length": 68, "num_contexts": 3, "step": 24910 }, { "grad_norm": 137.58267211914062, "learning_rate": 0.00010224622183112916, "loss": 0.3571, "step": 24920 }, { "gate_value": 0.3597603440284729, "icl_sequence_length": 62, "num_contexts": 3, "step": 24920 }, { "grad_norm": 361.5756530761719, "learning_rate": 0.00010212868013259772, "loss": 0.3558, "step": 24930 }, { "gate_value": 0.35976681113243103, "icl_sequence_length": 76, "num_contexts": 3, "step": 24930 }, { "grad_norm": 204.4798126220703, "learning_rate": 0.00010201117115366207, "loss": 0.3776, "step": 24940 }, { "gate_value": 0.35976898670196533, "icl_sequence_length": 86, "num_contexts": 3, "step": 24940 }, { "grad_norm": 919.7255249023438, "learning_rate": 0.00010189369497463835, "loss": 0.3559, "step": 24950 }, { "gate_value": 0.3597671389579773, "icl_sequence_length": 78, "num_contexts": 3, "step": 24950 }, { "grad_norm": 72.86383056640625, "learning_rate": 0.00010177625167582049, "loss": 0.3801, "step": 24960 }, { "gate_value": 0.359756737947464, "icl_sequence_length": 88, "num_contexts": 3, "step": 24960 }, { "grad_norm": 110.57682800292969, "learning_rate": 0.00010165884133747992, "loss": 0.3441, "step": 24970 }, { "gate_value": 0.3597812354564667, "icl_sequence_length": 82, "num_contexts": 3, "step": 24970 }, { "grad_norm": 171.7178497314453, "learning_rate": 0.00010154146403986543, "loss": 0.3577, "step": 24980 }, { "gate_value": 0.3598121106624603, "icl_sequence_length": 80, "num_contexts": 3, "step": 24980 }, { "grad_norm": 124.2730941772461, "learning_rate": 0.00010142411986320337, "loss": 0.3652, "step": 24990 }, { "gate_value": 0.35984185338020325, "icl_sequence_length": 88, "num_contexts": 3, "step": 24990 }, { "grad_norm": 577.5195922851562, "learning_rate": 0.00010130680888769732, "loss": 0.3576, "step": 25000 }, { "gate_value": 0.3598731458187103, "icl_sequence_length": 90, "num_contexts": 3, "step": 25000 }, { "grad_norm": 1175.1915283203125, "learning_rate": 0.00010118953119352826, "loss": 0.3549, "step": 25010 }, { "gate_value": 0.3598865568637848, "icl_sequence_length": 72, "num_contexts": 3, "step": 25010 }, { "grad_norm": 1535.7991943359375, "learning_rate": 0.00010107228686085436, "loss": 0.3691, "step": 25020 }, { "gate_value": 0.35989508032798767, "icl_sequence_length": 88, "num_contexts": 3, "step": 25020 }, { "grad_norm": 64.09869384765625, "learning_rate": 0.00010095507596981107, "loss": 0.3592, "step": 25030 }, { "gate_value": 0.35991427302360535, "icl_sequence_length": 82, "num_contexts": 3, "step": 25030 }, { "grad_norm": 45.077537536621094, "learning_rate": 0.00010083789860051089, "loss": 0.3735, "step": 25040 }, { "gate_value": 0.35991591215133667, "icl_sequence_length": 70, "num_contexts": 3, "step": 25040 }, { "grad_norm": 28.698278427124023, "learning_rate": 0.0001007207548330434, "loss": 0.3663, "step": 25050 }, { "gate_value": 0.35997167229652405, "icl_sequence_length": 90, "num_contexts": 3, "step": 25050 }, { "grad_norm": 24.68749237060547, "learning_rate": 0.00010060364474747528, "loss": 0.3595, "step": 25060 }, { "gate_value": 0.3600045144557953, "icl_sequence_length": 84, "num_contexts": 3, "step": 25060 }, { "grad_norm": 243.453125, "learning_rate": 0.00010048656842385024, "loss": 0.3663, "step": 25070 }, { "gate_value": 0.3599931597709656, "icl_sequence_length": 86, "num_contexts": 3, "step": 25070 }, { "grad_norm": 103.076904296875, "learning_rate": 0.0001003695259421888, "loss": 0.3619, "step": 25080 }, { "gate_value": 0.3600130081176758, "icl_sequence_length": 88, "num_contexts": 3, "step": 25080 }, { "grad_norm": 85.31201934814453, "learning_rate": 0.00010025251738248838, "loss": 0.3681, "step": 25090 }, { "gate_value": 0.36003732681274414, "icl_sequence_length": 86, "num_contexts": 3, "step": 25090 }, { "grad_norm": 267.9420166015625, "learning_rate": 0.00010013554282472323, "loss": 0.3666, "step": 25100 }, { "gate_value": 0.3600807189941406, "icl_sequence_length": 72, "num_contexts": 3, "step": 25100 }, { "grad_norm": 149.02096557617188, "learning_rate": 0.00010001860234884439, "loss": 0.3587, "step": 25110 }, { "gate_value": 0.3601251244544983, "icl_sequence_length": 60, "num_contexts": 3, "step": 25110 }, { "grad_norm": 601.15576171875, "learning_rate": 9.990169603477957e-05, "loss": 0.3519, "step": 25120 }, { "gate_value": 0.360137939453125, "icl_sequence_length": 88, "num_contexts": 3, "step": 25120 }, { "grad_norm": 28.030532836914062, "learning_rate": 9.978482396243307e-05, "loss": 0.3592, "step": 25130 }, { "gate_value": 0.3601973354816437, "icl_sequence_length": 76, "num_contexts": 3, "step": 25130 }, { "grad_norm": 64.57613372802734, "learning_rate": 9.96679862116859e-05, "loss": 0.3585, "step": 25140 }, { "gate_value": 0.3602369725704193, "icl_sequence_length": 80, "num_contexts": 3, "step": 25140 }, { "grad_norm": 77.8062515258789, "learning_rate": 9.955118286239554e-05, "loss": 0.3532, "step": 25150 }, { "gate_value": 0.36026737093925476, "icl_sequence_length": 92, "num_contexts": 3, "step": 25150 }, { "grad_norm": 41.49631118774414, "learning_rate": 9.943441399439599e-05, "loss": 0.3543, "step": 25160 }, { "gate_value": 0.3602963984012604, "icl_sequence_length": 70, "num_contexts": 3, "step": 25160 }, { "grad_norm": 114.31262969970703, "learning_rate": 9.931767968749768e-05, "loss": 0.3705, "step": 25170 }, { "gate_value": 0.360359251499176, "icl_sequence_length": 76, "num_contexts": 3, "step": 25170 }, { "grad_norm": 54.857337951660156, "learning_rate": 9.920098002148738e-05, "loss": 0.3624, "step": 25180 }, { "gate_value": 0.3604171574115753, "icl_sequence_length": 70, "num_contexts": 3, "step": 25180 }, { "grad_norm": 5679.38330078125, "learning_rate": 9.908431507612825e-05, "loss": 0.3712, "step": 25190 }, { "gate_value": 0.36049506068229675, "icl_sequence_length": 94, "num_contexts": 3, "step": 25190 }, { "grad_norm": 30.065919876098633, "learning_rate": 9.896768493115966e-05, "loss": 0.3559, "step": 25200 }, { "gate_value": 0.360569566488266, "icl_sequence_length": 78, "num_contexts": 3, "step": 25200 }, { "grad_norm": 34.60041809082031, "learning_rate": 9.885108966629721e-05, "loss": 0.3562, "step": 25210 }, { "gate_value": 0.3606504797935486, "icl_sequence_length": 80, "num_contexts": 3, "step": 25210 }, { "grad_norm": 291.844482421875, "learning_rate": 9.873452936123271e-05, "loss": 0.3677, "step": 25220 }, { "gate_value": 0.3607202172279358, "icl_sequence_length": 84, "num_contexts": 3, "step": 25220 }, { "grad_norm": 54.88591384887695, "learning_rate": 9.861800409563392e-05, "loss": 0.3562, "step": 25230 }, { "gate_value": 0.3607614040374756, "icl_sequence_length": 80, "num_contexts": 3, "step": 25230 }, { "grad_norm": 74439.96875, "learning_rate": 9.850151394914485e-05, "loss": 0.3719, "step": 25240 }, { "gate_value": 0.36080268025398254, "icl_sequence_length": 88, "num_contexts": 3, "step": 25240 }, { "grad_norm": 111.2549057006836, "learning_rate": 9.838505900138539e-05, "loss": 0.3777, "step": 25250 }, { "gate_value": 0.36084768176078796, "icl_sequence_length": 80, "num_contexts": 3, "step": 25250 }, { "grad_norm": 723.0596313476562, "learning_rate": 9.82686393319514e-05, "loss": 0.367, "step": 25260 }, { "gate_value": 0.3609139323234558, "icl_sequence_length": 88, "num_contexts": 3, "step": 25260 }, { "grad_norm": 111.521240234375, "learning_rate": 9.815225502041463e-05, "loss": 0.3632, "step": 25270 }, { "gate_value": 0.36096981167793274, "icl_sequence_length": 80, "num_contexts": 3, "step": 25270 }, { "grad_norm": 3092.884521484375, "learning_rate": 9.803590614632267e-05, "loss": 0.3654, "step": 25280 }, { "gate_value": 0.36098742485046387, "icl_sequence_length": 78, "num_contexts": 3, "step": 25280 }, { "grad_norm": 232.3154296875, "learning_rate": 9.791959278919887e-05, "loss": 0.35, "step": 25290 }, { "gate_value": 0.36098426580429077, "icl_sequence_length": 88, "num_contexts": 3, "step": 25290 }, { "grad_norm": 31.037199020385742, "learning_rate": 9.780331502854229e-05, "loss": 0.3785, "step": 25300 }, { "gate_value": 0.3610406219959259, "icl_sequence_length": 90, "num_contexts": 3, "step": 25300 }, { "grad_norm": 66.40868377685547, "learning_rate": 9.768707294382775e-05, "loss": 0.3707, "step": 25310 }, { "gate_value": 0.36106961965560913, "icl_sequence_length": 82, "num_contexts": 3, "step": 25310 }, { "grad_norm": 71.33839416503906, "learning_rate": 9.757086661450556e-05, "loss": 0.3671, "step": 25320 }, { "gate_value": 0.3610745370388031, "icl_sequence_length": 70, "num_contexts": 3, "step": 25320 }, { "grad_norm": 111.21866607666016, "learning_rate": 9.745469612000161e-05, "loss": 0.3707, "step": 25330 }, { "gate_value": 0.36109820008277893, "icl_sequence_length": 62, "num_contexts": 3, "step": 25330 }, { "grad_norm": 118.41226959228516, "learning_rate": 9.73385615397174e-05, "loss": 0.3745, "step": 25340 }, { "gate_value": 0.3611301779747009, "icl_sequence_length": 92, "num_contexts": 3, "step": 25340 }, { "grad_norm": 80.08364868164062, "learning_rate": 9.722246295302983e-05, "loss": 0.3817, "step": 25350 }, { "gate_value": 0.361128568649292, "icl_sequence_length": 66, "num_contexts": 3, "step": 25350 }, { "grad_norm": 28.752838134765625, "learning_rate": 9.710640043929116e-05, "loss": 0.3787, "step": 25360 }, { "gate_value": 0.36116471886634827, "icl_sequence_length": 86, "num_contexts": 3, "step": 25360 }, { "grad_norm": 29.80974769592285, "learning_rate": 9.699037407782905e-05, "loss": 0.3691, "step": 25370 }, { "gate_value": 0.3612363636493683, "icl_sequence_length": 84, "num_contexts": 3, "step": 25370 }, { "grad_norm": 23.709779739379883, "learning_rate": 9.687438394794637e-05, "loss": 0.353, "step": 25380 }, { "gate_value": 0.36146095395088196, "icl_sequence_length": 86, "num_contexts": 3, "step": 25380 }, { "grad_norm": 22.217329025268555, "learning_rate": 9.675843012892136e-05, "loss": 0.3595, "step": 25390 }, { "gate_value": 0.36164140701293945, "icl_sequence_length": 82, "num_contexts": 3, "step": 25390 }, { "grad_norm": 16.554452896118164, "learning_rate": 9.664251270000735e-05, "loss": 0.3723, "step": 25400 }, { "gate_value": 0.3616686165332794, "icl_sequence_length": 84, "num_contexts": 3, "step": 25400 }, { "grad_norm": 39.989200592041016, "learning_rate": 9.652663174043273e-05, "loss": 0.3671, "step": 25410 }, { "gate_value": 0.36170732975006104, "icl_sequence_length": 72, "num_contexts": 3, "step": 25410 }, { "grad_norm": 29.332035064697266, "learning_rate": 9.64107873294011e-05, "loss": 0.359, "step": 25420 }, { "gate_value": 0.36175501346588135, "icl_sequence_length": 88, "num_contexts": 3, "step": 25420 }, { "grad_norm": 19.313488006591797, "learning_rate": 9.629497954609098e-05, "loss": 0.3781, "step": 25430 }, { "gate_value": 0.36181315779685974, "icl_sequence_length": 78, "num_contexts": 3, "step": 25430 }, { "grad_norm": 26.557374954223633, "learning_rate": 9.617920846965595e-05, "loss": 0.3523, "step": 25440 }, { "gate_value": 0.3619093596935272, "icl_sequence_length": 80, "num_contexts": 3, "step": 25440 }, { "grad_norm": 29.910465240478516, "learning_rate": 9.606347417922444e-05, "loss": 0.3549, "step": 25450 }, { "gate_value": 0.36200904846191406, "icl_sequence_length": 72, "num_contexts": 3, "step": 25450 }, { "grad_norm": 37.020931243896484, "learning_rate": 9.594777675389973e-05, "loss": 0.3795, "step": 25460 }, { "gate_value": 0.3620182275772095, "icl_sequence_length": 78, "num_contexts": 3, "step": 25460 }, { "grad_norm": 24.904870986938477, "learning_rate": 9.583211627275995e-05, "loss": 0.3793, "step": 25470 }, { "gate_value": 0.36203330755233765, "icl_sequence_length": 82, "num_contexts": 3, "step": 25470 }, { "grad_norm": 22.63786506652832, "learning_rate": 9.571649281485788e-05, "loss": 0.3602, "step": 25480 }, { "gate_value": 0.362061470746994, "icl_sequence_length": 78, "num_contexts": 3, "step": 25480 }, { "grad_norm": 430.0863037109375, "learning_rate": 9.560090645922115e-05, "loss": 0.3583, "step": 25490 }, { "gate_value": 0.36207452416419983, "icl_sequence_length": 82, "num_contexts": 3, "step": 25490 }, { "grad_norm": 38.97439193725586, "learning_rate": 9.548535728485194e-05, "loss": 0.3654, "step": 25500 }, { "gate_value": 0.36213162541389465, "icl_sequence_length": 80, "num_contexts": 3, "step": 25500 }, { "grad_norm": 28.70587730407715, "learning_rate": 9.536984537072693e-05, "loss": 0.3579, "step": 25510 }, { "gate_value": 0.3622555732727051, "icl_sequence_length": 82, "num_contexts": 3, "step": 25510 }, { "grad_norm": 47.876365661621094, "learning_rate": 9.525437079579749e-05, "loss": 0.3678, "step": 25520 }, { "gate_value": 0.36234137415885925, "icl_sequence_length": 88, "num_contexts": 3, "step": 25520 }, { "grad_norm": 51.66835403442383, "learning_rate": 9.513893363898934e-05, "loss": 0.3694, "step": 25530 }, { "gate_value": 0.36236605048179626, "icl_sequence_length": 82, "num_contexts": 3, "step": 25530 }, { "grad_norm": 38.49001693725586, "learning_rate": 9.502353397920278e-05, "loss": 0.367, "step": 25540 }, { "gate_value": 0.36243191361427307, "icl_sequence_length": 74, "num_contexts": 3, "step": 25540 }, { "grad_norm": 26.23487663269043, "learning_rate": 9.490817189531236e-05, "loss": 0.3601, "step": 25550 }, { "gate_value": 0.3624683618545532, "icl_sequence_length": 68, "num_contexts": 3, "step": 25550 }, { "grad_norm": 74.08128356933594, "learning_rate": 9.479284746616693e-05, "loss": 0.3824, "step": 25560 }, { "gate_value": 0.3625290095806122, "icl_sequence_length": 74, "num_contexts": 3, "step": 25560 }, { "grad_norm": 134.24107360839844, "learning_rate": 9.467756077058973e-05, "loss": 0.3648, "step": 25570 }, { "gate_value": 0.3626077473163605, "icl_sequence_length": 90, "num_contexts": 3, "step": 25570 }, { "grad_norm": 49.08687210083008, "learning_rate": 9.456231188737805e-05, "loss": 0.3611, "step": 25580 }, { "gate_value": 0.36267179250717163, "icl_sequence_length": 84, "num_contexts": 3, "step": 25580 }, { "grad_norm": 76.60491180419922, "learning_rate": 9.444710089530349e-05, "loss": 0.3755, "step": 25590 }, { "gate_value": 0.36272481083869934, "icl_sequence_length": 92, "num_contexts": 3, "step": 25590 }, { "grad_norm": 38.12582778930664, "learning_rate": 9.433192787311161e-05, "loss": 0.3705, "step": 25600 }, { "gate_value": 0.36277905106544495, "icl_sequence_length": 82, "num_contexts": 3, "step": 25600 }, { "grad_norm": 74.66680908203125, "learning_rate": 9.42167928995221e-05, "loss": 0.3744, "step": 25610 }, { "gate_value": 0.36285629868507385, "icl_sequence_length": 80, "num_contexts": 3, "step": 25610 }, { "grad_norm": 92.35078430175781, "learning_rate": 9.410169605322864e-05, "loss": 0.3821, "step": 25620 }, { "gate_value": 0.36290407180786133, "icl_sequence_length": 78, "num_contexts": 3, "step": 25620 }, { "grad_norm": 352.0303649902344, "learning_rate": 9.398663741289883e-05, "loss": 0.3659, "step": 25630 }, { "gate_value": 0.36290207505226135, "icl_sequence_length": 90, "num_contexts": 3, "step": 25630 }, { "grad_norm": 266.07586669921875, "learning_rate": 9.387161705717418e-05, "loss": 0.3652, "step": 25640 }, { "gate_value": 0.36293622851371765, "icl_sequence_length": 82, "num_contexts": 3, "step": 25640 }, { "grad_norm": 235221.78125, "learning_rate": 9.375663506467004e-05, "loss": 0.353, "step": 25650 }, { "gate_value": 0.36297622323036194, "icl_sequence_length": 86, "num_contexts": 3, "step": 25650 }, { "grad_norm": 734.5687255859375, "learning_rate": 9.364169151397546e-05, "loss": 0.3647, "step": 25660 }, { "gate_value": 0.3630070984363556, "icl_sequence_length": 78, "num_contexts": 3, "step": 25660 }, { "grad_norm": 72.2925033569336, "learning_rate": 9.352678648365332e-05, "loss": 0.3741, "step": 25670 }, { "gate_value": 0.3630395531654358, "icl_sequence_length": 86, "num_contexts": 3, "step": 25670 }, { "grad_norm": 1104.2977294921875, "learning_rate": 9.341192005224013e-05, "loss": 0.356, "step": 25680 }, { "gate_value": 0.36307305097579956, "icl_sequence_length": 78, "num_contexts": 3, "step": 25680 }, { "grad_norm": 68.21739196777344, "learning_rate": 9.329709229824595e-05, "loss": 0.3667, "step": 25690 }, { "gate_value": 0.3630922734737396, "icl_sequence_length": 78, "num_contexts": 3, "step": 25690 }, { "grad_norm": 168.8650665283203, "learning_rate": 9.318230330015453e-05, "loss": 0.3719, "step": 25700 }, { "gate_value": 0.36309897899627686, "icl_sequence_length": 82, "num_contexts": 3, "step": 25700 }, { "grad_norm": 81.41837310791016, "learning_rate": 9.306755313642301e-05, "loss": 0.353, "step": 25710 }, { "gate_value": 0.3631479740142822, "icl_sequence_length": 94, "num_contexts": 3, "step": 25710 }, { "grad_norm": 167.31732177734375, "learning_rate": 9.295284188548212e-05, "loss": 0.3636, "step": 25720 }, { "gate_value": 0.3632040023803711, "icl_sequence_length": 82, "num_contexts": 3, "step": 25720 }, { "grad_norm": 62.20192337036133, "learning_rate": 9.283816962573586e-05, "loss": 0.3711, "step": 25730 }, { "gate_value": 0.36324024200439453, "icl_sequence_length": 92, "num_contexts": 3, "step": 25730 }, { "grad_norm": 167.32057189941406, "learning_rate": 9.272353643556163e-05, "loss": 0.3564, "step": 25740 }, { "gate_value": 0.3632681965827942, "icl_sequence_length": 84, "num_contexts": 3, "step": 25740 }, { "grad_norm": 121.6691665649414, "learning_rate": 9.260894239331023e-05, "loss": 0.3642, "step": 25750 }, { "gate_value": 0.36329635977745056, "icl_sequence_length": 76, "num_contexts": 3, "step": 25750 }, { "grad_norm": 71.03226470947266, "learning_rate": 9.249438757730547e-05, "loss": 0.3722, "step": 25760 }, { "gate_value": 0.36331915855407715, "icl_sequence_length": 92, "num_contexts": 3, "step": 25760 }, { "grad_norm": 73.70002746582031, "learning_rate": 9.237987206584462e-05, "loss": 0.3526, "step": 25770 }, { "gate_value": 0.363363116979599, "icl_sequence_length": 74, "num_contexts": 3, "step": 25770 }, { "grad_norm": 119.9651870727539, "learning_rate": 9.226539593719789e-05, "loss": 0.3611, "step": 25780 }, { "gate_value": 0.36337050795555115, "icl_sequence_length": 68, "num_contexts": 3, "step": 25780 }, { "grad_norm": 136.77578735351562, "learning_rate": 9.215095926960856e-05, "loss": 0.3763, "step": 25790 }, { "gate_value": 0.3633963465690613, "icl_sequence_length": 68, "num_contexts": 3, "step": 25790 }, { "grad_norm": 77.38294982910156, "learning_rate": 9.203656214129313e-05, "loss": 0.3457, "step": 25800 }, { "gate_value": 0.36343178153038025, "icl_sequence_length": 78, "num_contexts": 3, "step": 25800 }, { "grad_norm": 70.8493423461914, "learning_rate": 9.192220463044089e-05, "loss": 0.3741, "step": 25810 }, { "gate_value": 0.3634765148162842, "icl_sequence_length": 84, "num_contexts": 3, "step": 25810 }, { "grad_norm": 103.03206634521484, "learning_rate": 9.180788681521418e-05, "loss": 0.3752, "step": 25820 }, { "gate_value": 0.3635123074054718, "icl_sequence_length": 72, "num_contexts": 3, "step": 25820 }, { "grad_norm": 109.7791519165039, "learning_rate": 9.169360877374808e-05, "loss": 0.361, "step": 25830 }, { "gate_value": 0.3635486662387848, "icl_sequence_length": 70, "num_contexts": 3, "step": 25830 }, { "grad_norm": 38.19389724731445, "learning_rate": 9.157937058415058e-05, "loss": 0.3738, "step": 25840 }, { "gate_value": 0.3635652959346771, "icl_sequence_length": 76, "num_contexts": 3, "step": 25840 }, { "grad_norm": 112.97542572021484, "learning_rate": 9.146517232450244e-05, "loss": 0.3551, "step": 25850 }, { "gate_value": 0.3635725975036621, "icl_sequence_length": 92, "num_contexts": 3, "step": 25850 }, { "grad_norm": 108.82026672363281, "learning_rate": 9.135101407285704e-05, "loss": 0.3769, "step": 25860 }, { "gate_value": 0.3635868430137634, "icl_sequence_length": 86, "num_contexts": 3, "step": 25860 }, { "grad_norm": 1242.8558349609375, "learning_rate": 9.123689590724056e-05, "loss": 0.381, "step": 25870 }, { "gate_value": 0.36360442638397217, "icl_sequence_length": 76, "num_contexts": 3, "step": 25870 }, { "grad_norm": 216.93637084960938, "learning_rate": 9.112281790565159e-05, "loss": 0.3533, "step": 25880 }, { "gate_value": 0.3636187016963959, "icl_sequence_length": 74, "num_contexts": 3, "step": 25880 }, { "grad_norm": 205.6892547607422, "learning_rate": 9.100878014606137e-05, "loss": 0.3564, "step": 25890 }, { "gate_value": 0.36363720893859863, "icl_sequence_length": 72, "num_contexts": 3, "step": 25890 }, { "grad_norm": 3794.68994140625, "learning_rate": 9.089478270641368e-05, "loss": 0.3688, "step": 25900 }, { "gate_value": 0.36364829540252686, "icl_sequence_length": 72, "num_contexts": 3, "step": 25900 }, { "grad_norm": 2179.69482421875, "learning_rate": 9.078082566462469e-05, "loss": 0.3596, "step": 25910 }, { "gate_value": 0.3636602759361267, "icl_sequence_length": 86, "num_contexts": 3, "step": 25910 }, { "grad_norm": 2942.58740234375, "learning_rate": 9.066690909858296e-05, "loss": 0.3491, "step": 25920 }, { "gate_value": 0.3636675775051117, "icl_sequence_length": 74, "num_contexts": 3, "step": 25920 }, { "grad_norm": 31502.810546875, "learning_rate": 9.055303308614935e-05, "loss": 0.3668, "step": 25930 }, { "gate_value": 0.36367008090019226, "icl_sequence_length": 84, "num_contexts": 3, "step": 25930 }, { "grad_norm": 12904.177734375, "learning_rate": 9.04391977051571e-05, "loss": 0.3676, "step": 25940 }, { "gate_value": 0.36367350816726685, "icl_sequence_length": 68, "num_contexts": 3, "step": 25940 }, { "grad_norm": 1463.4390869140625, "learning_rate": 9.032540303341158e-05, "loss": 0.3664, "step": 25950 }, { "gate_value": 0.36367329955101013, "icl_sequence_length": 68, "num_contexts": 3, "step": 25950 }, { "grad_norm": 100.67179870605469, "learning_rate": 9.021164914869046e-05, "loss": 0.3667, "step": 25960 }, { "gate_value": 0.36367326974868774, "icl_sequence_length": 62, "num_contexts": 3, "step": 25960 }, { "grad_norm": 25260.05078125, "learning_rate": 9.00979361287433e-05, "loss": 0.3707, "step": 25970 }, { "gate_value": 0.36368072032928467, "icl_sequence_length": 96, "num_contexts": 3, "step": 25970 }, { "grad_norm": 126501.125, "learning_rate": 8.998426405129198e-05, "loss": 0.3662, "step": 25980 }, { "gate_value": 0.3636826276779175, "icl_sequence_length": 80, "num_contexts": 3, "step": 25980 }, { "grad_norm": 3729.87744140625, "learning_rate": 8.987063299403024e-05, "loss": 0.36, "step": 25990 }, { "gate_value": 0.36368346214294434, "icl_sequence_length": 82, "num_contexts": 3, "step": 25990 }, { "grad_norm": 408.446533203125, "learning_rate": 8.97570430346239e-05, "loss": 0.3716, "step": 26000 }, { "gate_value": 0.36368486285209656, "icl_sequence_length": 82, "num_contexts": 3, "step": 26000 }, { "grad_norm": 819.3681640625, "learning_rate": 8.964349425071056e-05, "loss": 0.3739, "step": 26010 }, { "gate_value": 0.36369675397872925, "icl_sequence_length": 78, "num_contexts": 3, "step": 26010 }, { "grad_norm": 33350.34375, "learning_rate": 8.952998671989977e-05, "loss": 0.3791, "step": 26020 }, { "gate_value": 0.3637213706970215, "icl_sequence_length": 76, "num_contexts": 3, "step": 26020 }, { "grad_norm": 1744.698486328125, "learning_rate": 8.941652051977286e-05, "loss": 0.3665, "step": 26030 }, { "gate_value": 0.36372846364974976, "icl_sequence_length": 78, "num_contexts": 3, "step": 26030 }, { "grad_norm": 151.7212371826172, "learning_rate": 8.930309572788289e-05, "loss": 0.3709, "step": 26040 }, { "gate_value": 0.36373579502105713, "icl_sequence_length": 76, "num_contexts": 3, "step": 26040 }, { "grad_norm": 119.85762023925781, "learning_rate": 8.918971242175473e-05, "loss": 0.3653, "step": 26050 }, { "gate_value": 0.36377817392349243, "icl_sequence_length": 76, "num_contexts": 3, "step": 26050 }, { "grad_norm": 741.9052734375, "learning_rate": 8.907637067888468e-05, "loss": 0.3534, "step": 26060 }, { "gate_value": 0.363788366317749, "icl_sequence_length": 70, "num_contexts": 3, "step": 26060 }, { "grad_norm": 1784.32470703125, "learning_rate": 8.896307057674078e-05, "loss": 0.3607, "step": 26070 }, { "gate_value": 0.363790899515152, "icl_sequence_length": 82, "num_contexts": 3, "step": 26070 }, { "grad_norm": 98.61390686035156, "learning_rate": 8.88498121927626e-05, "loss": 0.3609, "step": 26080 }, { "gate_value": 0.36381369829177856, "icl_sequence_length": 80, "num_contexts": 3, "step": 26080 }, { "grad_norm": 67.09032440185547, "learning_rate": 8.873659560436119e-05, "loss": 0.3603, "step": 26090 }, { "gate_value": 0.3638378381729126, "icl_sequence_length": 74, "num_contexts": 3, "step": 26090 }, { "grad_norm": 454.86260986328125, "learning_rate": 8.8623420888919e-05, "loss": 0.3503, "step": 26100 }, { "gate_value": 0.36386924982070923, "icl_sequence_length": 88, "num_contexts": 3, "step": 26100 }, { "grad_norm": 12161.5546875, "learning_rate": 8.851028812378986e-05, "loss": 0.3569, "step": 26110 }, { "gate_value": 0.36391758918762207, "icl_sequence_length": 82, "num_contexts": 3, "step": 26110 }, { "grad_norm": 117.7813949584961, "learning_rate": 8.8397197386299e-05, "loss": 0.3522, "step": 26120 }, { "gate_value": 0.36397236585617065, "icl_sequence_length": 64, "num_contexts": 3, "step": 26120 }, { "grad_norm": 106.986083984375, "learning_rate": 8.828414875374281e-05, "loss": 0.3651, "step": 26130 }, { "gate_value": 0.3640117347240448, "icl_sequence_length": 80, "num_contexts": 3, "step": 26130 }, { "grad_norm": 74.00639343261719, "learning_rate": 8.817114230338902e-05, "loss": 0.3615, "step": 26140 }, { "gate_value": 0.3640309274196625, "icl_sequence_length": 74, "num_contexts": 3, "step": 26140 }, { "grad_norm": 2831.8740234375, "learning_rate": 8.805817811247651e-05, "loss": 0.3649, "step": 26150 }, { "gate_value": 0.36402857303619385, "icl_sequence_length": 86, "num_contexts": 3, "step": 26150 }, { "grad_norm": 287.6517028808594, "learning_rate": 8.794525625821514e-05, "loss": 0.3715, "step": 26160 }, { "gate_value": 0.36404845118522644, "icl_sequence_length": 88, "num_contexts": 3, "step": 26160 }, { "grad_norm": 98.52863311767578, "learning_rate": 8.783237681778597e-05, "loss": 0.3434, "step": 26170 }, { "gate_value": 0.3640698492527008, "icl_sequence_length": 90, "num_contexts": 3, "step": 26170 }, { "grad_norm": 109.10649108886719, "learning_rate": 8.771953986834106e-05, "loss": 0.37, "step": 26180 }, { "gate_value": 0.3640887439250946, "icl_sequence_length": 70, "num_contexts": 3, "step": 26180 }, { "grad_norm": 763.0786743164062, "learning_rate": 8.760674548700336e-05, "loss": 0.3443, "step": 26190 }, { "gate_value": 0.36416587233543396, "icl_sequence_length": 84, "num_contexts": 3, "step": 26190 }, { "grad_norm": 243.73666381835938, "learning_rate": 8.74939937508668e-05, "loss": 0.3742, "step": 26200 }, { "gate_value": 0.3642013967037201, "icl_sequence_length": 80, "num_contexts": 3, "step": 26200 }, { "grad_norm": 77.64506530761719, "learning_rate": 8.738128473699609e-05, "loss": 0.3718, "step": 26210 }, { "gate_value": 0.364237517118454, "icl_sequence_length": 94, "num_contexts": 3, "step": 26210 }, { "grad_norm": 80.43402099609375, "learning_rate": 8.72686185224268e-05, "loss": 0.3706, "step": 26220 }, { "gate_value": 0.3642655909061432, "icl_sequence_length": 88, "num_contexts": 3, "step": 26220 }, { "grad_norm": 717.6832275390625, "learning_rate": 8.715599518416523e-05, "loss": 0.3657, "step": 26230 }, { "gate_value": 0.3643067181110382, "icl_sequence_length": 76, "num_contexts": 3, "step": 26230 }, { "grad_norm": 155.02511596679688, "learning_rate": 8.704341479918843e-05, "loss": 0.3736, "step": 26240 }, { "gate_value": 0.3643386662006378, "icl_sequence_length": 88, "num_contexts": 3, "step": 26240 }, { "grad_norm": 333.3760681152344, "learning_rate": 8.693087744444398e-05, "loss": 0.3678, "step": 26250 }, { "gate_value": 0.3643483519554138, "icl_sequence_length": 72, "num_contexts": 3, "step": 26250 }, { "grad_norm": 112.97294616699219, "learning_rate": 8.681838319685e-05, "loss": 0.3718, "step": 26260 }, { "gate_value": 0.36436960101127625, "icl_sequence_length": 76, "num_contexts": 3, "step": 26260 }, { "grad_norm": 129.95945739746094, "learning_rate": 8.670593213329537e-05, "loss": 0.3698, "step": 26270 }, { "gate_value": 0.3643975853919983, "icl_sequence_length": 82, "num_contexts": 3, "step": 26270 }, { "grad_norm": 58.365657806396484, "learning_rate": 8.659352433063928e-05, "loss": 0.3561, "step": 26280 }, { "gate_value": 0.3644281029701233, "icl_sequence_length": 84, "num_contexts": 3, "step": 26280 }, { "grad_norm": 114.4413833618164, "learning_rate": 8.64811598657115e-05, "loss": 0.3587, "step": 26290 }, { "gate_value": 0.3644927740097046, "icl_sequence_length": 70, "num_contexts": 3, "step": 26290 }, { "grad_norm": 58.796695709228516, "learning_rate": 8.636883881531194e-05, "loss": 0.3701, "step": 26300 }, { "gate_value": 0.3645520508289337, "icl_sequence_length": 82, "num_contexts": 3, "step": 26300 }, { "grad_norm": 121.88422393798828, "learning_rate": 8.625656125621103e-05, "loss": 0.3662, "step": 26310 }, { "gate_value": 0.36459311842918396, "icl_sequence_length": 84, "num_contexts": 3, "step": 26310 }, { "grad_norm": 76.7275619506836, "learning_rate": 8.614432726514944e-05, "loss": 0.3492, "step": 26320 }, { "gate_value": 0.3646358847618103, "icl_sequence_length": 84, "num_contexts": 3, "step": 26320 }, { "grad_norm": 76.93498229980469, "learning_rate": 8.60321369188381e-05, "loss": 0.3613, "step": 26330 }, { "gate_value": 0.3646531105041504, "icl_sequence_length": 82, "num_contexts": 3, "step": 26330 }, { "grad_norm": 42.61845779418945, "learning_rate": 8.591999029395795e-05, "loss": 0.3684, "step": 26340 }, { "gate_value": 0.3646797239780426, "icl_sequence_length": 68, "num_contexts": 3, "step": 26340 }, { "grad_norm": 109.5250015258789, "learning_rate": 8.580788746716024e-05, "loss": 0.3689, "step": 26350 }, { "gate_value": 0.3647087514400482, "icl_sequence_length": 80, "num_contexts": 3, "step": 26350 }, { "grad_norm": 99.25482940673828, "learning_rate": 8.56958285150661e-05, "loss": 0.3449, "step": 26360 }, { "gate_value": 0.36472317576408386, "icl_sequence_length": 76, "num_contexts": 3, "step": 26360 }, { "grad_norm": 54.96424102783203, "learning_rate": 8.558381351426681e-05, "loss": 0.3416, "step": 26370 }, { "gate_value": 0.3647831678390503, "icl_sequence_length": 70, "num_contexts": 3, "step": 26370 }, { "grad_norm": 72.82083129882812, "learning_rate": 8.547184254132358e-05, "loss": 0.3728, "step": 26380 }, { "gate_value": 0.36487022042274475, "icl_sequence_length": 82, "num_contexts": 3, "step": 26380 }, { "grad_norm": 103.92755889892578, "learning_rate": 8.535991567276758e-05, "loss": 0.3583, "step": 26390 }, { "gate_value": 0.3649507462978363, "icl_sequence_length": 80, "num_contexts": 3, "step": 26390 }, { "grad_norm": 129.11090087890625, "learning_rate": 8.524803298509963e-05, "loss": 0.371, "step": 26400 }, { "gate_value": 0.364972323179245, "icl_sequence_length": 64, "num_contexts": 3, "step": 26400 }, { "grad_norm": 61.047943115234375, "learning_rate": 8.513619455479056e-05, "loss": 0.3592, "step": 26410 }, { "gate_value": 0.3650073707103729, "icl_sequence_length": 86, "num_contexts": 3, "step": 26410 }, { "grad_norm": 808.7735595703125, "learning_rate": 8.502440045828087e-05, "loss": 0.3536, "step": 26420 }, { "gate_value": 0.36508437991142273, "icl_sequence_length": 92, "num_contexts": 3, "step": 26420 }, { "grad_norm": 65.29513549804688, "learning_rate": 8.491265077198085e-05, "loss": 0.3723, "step": 26430 }, { "gate_value": 0.3651318848133087, "icl_sequence_length": 88, "num_contexts": 3, "step": 26430 }, { "grad_norm": 45.95816421508789, "learning_rate": 8.480094557227022e-05, "loss": 0.3473, "step": 26440 }, { "gate_value": 0.3651773929595947, "icl_sequence_length": 88, "num_contexts": 3, "step": 26440 }, { "grad_norm": 62.62614822387695, "learning_rate": 8.468928493549858e-05, "loss": 0.3681, "step": 26450 }, { "gate_value": 0.3652338981628418, "icl_sequence_length": 84, "num_contexts": 3, "step": 26450 }, { "grad_norm": 57.45429229736328, "learning_rate": 8.457766893798478e-05, "loss": 0.3672, "step": 26460 }, { "gate_value": 0.3652498722076416, "icl_sequence_length": 84, "num_contexts": 3, "step": 26460 }, { "grad_norm": 89.05564880371094, "learning_rate": 8.446609765601736e-05, "loss": 0.3666, "step": 26470 }, { "gate_value": 0.3652762174606323, "icl_sequence_length": 70, "num_contexts": 3, "step": 26470 }, { "grad_norm": 177.4752655029297, "learning_rate": 8.435457116585426e-05, "loss": 0.3681, "step": 26480 }, { "gate_value": 0.365335613489151, "icl_sequence_length": 90, "num_contexts": 3, "step": 26480 }, { "grad_norm": 618.664306640625, "learning_rate": 8.424308954372282e-05, "loss": 0.3676, "step": 26490 }, { "gate_value": 0.3653985857963562, "icl_sequence_length": 70, "num_contexts": 3, "step": 26490 }, { "grad_norm": 649.4828491210938, "learning_rate": 8.413165286581956e-05, "loss": 0.3701, "step": 26500 }, { "gate_value": 0.3654293417930603, "icl_sequence_length": 90, "num_contexts": 3, "step": 26500 }, { "grad_norm": 119.72760772705078, "learning_rate": 8.402026120831047e-05, "loss": 0.3764, "step": 26510 }, { "gate_value": 0.36545631289482117, "icl_sequence_length": 70, "num_contexts": 3, "step": 26510 }, { "grad_norm": 58.20319747924805, "learning_rate": 8.390891464733074e-05, "loss": 0.3595, "step": 26520 }, { "gate_value": 0.365516722202301, "icl_sequence_length": 88, "num_contexts": 3, "step": 26520 }, { "grad_norm": 68.46391296386719, "learning_rate": 8.37976132589846e-05, "loss": 0.3445, "step": 26530 }, { "gate_value": 0.36553603410720825, "icl_sequence_length": 78, "num_contexts": 3, "step": 26530 }, { "grad_norm": 48.550350189208984, "learning_rate": 8.368635711934554e-05, "loss": 0.3589, "step": 26540 }, { "gate_value": 0.3655546009540558, "icl_sequence_length": 72, "num_contexts": 3, "step": 26540 }, { "grad_norm": 99.2035140991211, "learning_rate": 8.357514630445617e-05, "loss": 0.342, "step": 26550 }, { "gate_value": 0.36558255553245544, "icl_sequence_length": 74, "num_contexts": 3, "step": 26550 }, { "grad_norm": 108.59980773925781, "learning_rate": 8.346398089032788e-05, "loss": 0.3757, "step": 26560 }, { "gate_value": 0.36561352014541626, "icl_sequence_length": 66, "num_contexts": 3, "step": 26560 }, { "grad_norm": 74.10098266601562, "learning_rate": 8.335286095294122e-05, "loss": 0.3563, "step": 26570 }, { "gate_value": 0.3656443953514099, "icl_sequence_length": 82, "num_contexts": 3, "step": 26570 }, { "grad_norm": 107.44097137451172, "learning_rate": 8.324178656824569e-05, "loss": 0.3707, "step": 26580 }, { "gate_value": 0.3657311499118805, "icl_sequence_length": 78, "num_contexts": 3, "step": 26580 }, { "grad_norm": 43.990760803222656, "learning_rate": 8.313075781215961e-05, "loss": 0.3525, "step": 26590 }, { "gate_value": 0.36581742763519287, "icl_sequence_length": 90, "num_contexts": 3, "step": 26590 }, { "grad_norm": 93.85136413574219, "learning_rate": 8.301977476056995e-05, "loss": 0.3456, "step": 26600 }, { "gate_value": 0.36589711904525757, "icl_sequence_length": 76, "num_contexts": 3, "step": 26600 }, { "grad_norm": 79.03585052490234, "learning_rate": 8.290883748933273e-05, "loss": 0.3726, "step": 26610 }, { "gate_value": 0.3659816086292267, "icl_sequence_length": 92, "num_contexts": 3, "step": 26610 }, { "grad_norm": 112.67575073242188, "learning_rate": 8.279794607427236e-05, "loss": 0.3512, "step": 26620 }, { "gate_value": 0.3660266697406769, "icl_sequence_length": 84, "num_contexts": 3, "step": 26620 }, { "grad_norm": 66.10786437988281, "learning_rate": 8.268710059118221e-05, "loss": 0.3603, "step": 26630 }, { "gate_value": 0.3660682737827301, "icl_sequence_length": 74, "num_contexts": 3, "step": 26630 }, { "grad_norm": 112.40705108642578, "learning_rate": 8.257630111582408e-05, "loss": 0.3679, "step": 26640 }, { "gate_value": 0.36610719561576843, "icl_sequence_length": 86, "num_contexts": 3, "step": 26640 }, { "grad_norm": 102.23544311523438, "learning_rate": 8.246554772392842e-05, "loss": 0.3632, "step": 26650 }, { "gate_value": 0.36619168519973755, "icl_sequence_length": 86, "num_contexts": 3, "step": 26650 }, { "grad_norm": 90.14778900146484, "learning_rate": 8.235484049119402e-05, "loss": 0.3548, "step": 26660 }, { "gate_value": 0.3662635087966919, "icl_sequence_length": 80, "num_contexts": 3, "step": 26660 }, { "grad_norm": 88.61628723144531, "learning_rate": 8.224417949328828e-05, "loss": 0.354, "step": 26670 }, { "gate_value": 0.36625897884368896, "icl_sequence_length": 74, "num_contexts": 3, "step": 26670 }, { "grad_norm": 97.69658660888672, "learning_rate": 8.213356480584696e-05, "loss": 0.3518, "step": 26680 }, { "gate_value": 0.3662429451942444, "icl_sequence_length": 86, "num_contexts": 3, "step": 26680 }, { "grad_norm": 215.29074096679688, "learning_rate": 8.202299650447422e-05, "loss": 0.3617, "step": 26690 }, { "gate_value": 0.3662889301776886, "icl_sequence_length": 86, "num_contexts": 3, "step": 26690 }, { "grad_norm": 3471.608642578125, "learning_rate": 8.191247466474232e-05, "loss": 0.3551, "step": 26700 }, { "gate_value": 0.3663380444049835, "icl_sequence_length": 86, "num_contexts": 3, "step": 26700 }, { "grad_norm": 131.4551544189453, "learning_rate": 8.180199936219201e-05, "loss": 0.3416, "step": 26710 }, { "gate_value": 0.36638250946998596, "icl_sequence_length": 74, "num_contexts": 3, "step": 26710 }, { "grad_norm": 116.79961395263672, "learning_rate": 8.169157067233204e-05, "loss": 0.3812, "step": 26720 }, { "gate_value": 0.36638423800468445, "icl_sequence_length": 92, "num_contexts": 3, "step": 26720 }, { "grad_norm": 115.24409484863281, "learning_rate": 8.158118867063939e-05, "loss": 0.3785, "step": 26730 }, { "gate_value": 0.36644190549850464, "icl_sequence_length": 66, "num_contexts": 3, "step": 26730 }, { "grad_norm": 103.70458221435547, "learning_rate": 8.14708534325591e-05, "loss": 0.3611, "step": 26740 }, { "gate_value": 0.36655059456825256, "icl_sequence_length": 86, "num_contexts": 3, "step": 26740 }, { "grad_norm": 101.19654083251953, "learning_rate": 8.136056503350441e-05, "loss": 0.3733, "step": 26750 }, { "gate_value": 0.36665982007980347, "icl_sequence_length": 74, "num_contexts": 3, "step": 26750 }, { "grad_norm": 134.09664916992188, "learning_rate": 8.12503235488562e-05, "loss": 0.3731, "step": 26760 }, { "gate_value": 0.3667478561401367, "icl_sequence_length": 76, "num_contexts": 3, "step": 26760 }, { "grad_norm": 113.2645492553711, "learning_rate": 8.114012905396356e-05, "loss": 0.3596, "step": 26770 }, { "gate_value": 0.3667839467525482, "icl_sequence_length": 88, "num_contexts": 3, "step": 26770 }, { "grad_norm": 400.9145202636719, "learning_rate": 8.102998162414342e-05, "loss": 0.3667, "step": 26780 }, { "gate_value": 0.3667834401130676, "icl_sequence_length": 74, "num_contexts": 3, "step": 26780 }, { "grad_norm": 60948.7890625, "learning_rate": 8.091988133468056e-05, "loss": 0.3567, "step": 26790 }, { "gate_value": 0.3667881488800049, "icl_sequence_length": 80, "num_contexts": 3, "step": 26790 }, { "grad_norm": 91.94977569580078, "learning_rate": 8.08098282608274e-05, "loss": 0.3669, "step": 26800 }, { "gate_value": 0.36683157086372375, "icl_sequence_length": 78, "num_contexts": 3, "step": 26800 }, { "grad_norm": 136.93983459472656, "learning_rate": 8.069982247780416e-05, "loss": 0.3668, "step": 26810 }, { "gate_value": 0.366877943277359, "icl_sequence_length": 78, "num_contexts": 3, "step": 26810 }, { "grad_norm": 3072.138671875, "learning_rate": 8.058986406079878e-05, "loss": 0.3674, "step": 26820 }, { "gate_value": 0.36689361929893494, "icl_sequence_length": 62, "num_contexts": 3, "step": 26820 }, { "grad_norm": 4344.18505859375, "learning_rate": 8.047995308496684e-05, "loss": 0.3614, "step": 26830 }, { "gate_value": 0.3669084906578064, "icl_sequence_length": 94, "num_contexts": 3, "step": 26830 }, { "grad_norm": 2586.574462890625, "learning_rate": 8.037008962543139e-05, "loss": 0.3793, "step": 26840 }, { "gate_value": 0.3669230043888092, "icl_sequence_length": 78, "num_contexts": 3, "step": 26840 }, { "grad_norm": 254.2022705078125, "learning_rate": 8.02602737572832e-05, "loss": 0.3756, "step": 26850 }, { "gate_value": 0.3669438064098358, "icl_sequence_length": 88, "num_contexts": 3, "step": 26850 }, { "grad_norm": 5953.83935546875, "learning_rate": 8.015050555558022e-05, "loss": 0.3569, "step": 26860 }, { "gate_value": 0.3669627010822296, "icl_sequence_length": 68, "num_contexts": 3, "step": 26860 }, { "grad_norm": 683.7999267578125, "learning_rate": 8.004078509534807e-05, "loss": 0.3645, "step": 26870 }, { "gate_value": 0.3669717013835907, "icl_sequence_length": 82, "num_contexts": 3, "step": 26870 }, { "grad_norm": 13406.55859375, "learning_rate": 7.99311124515796e-05, "loss": 0.3823, "step": 26880 }, { "gate_value": 0.3669739067554474, "icl_sequence_length": 88, "num_contexts": 3, "step": 26880 }, { "grad_norm": 28604.58203125, "learning_rate": 7.982148769923513e-05, "loss": 0.358, "step": 26890 }, { "gate_value": 0.366974800825119, "icl_sequence_length": 78, "num_contexts": 3, "step": 26890 }, { "grad_norm": 23105.599609375, "learning_rate": 7.971191091324209e-05, "loss": 0.3573, "step": 26900 }, { "gate_value": 0.3669755160808563, "icl_sequence_length": 84, "num_contexts": 3, "step": 26900 }, { "grad_norm": 8306.990234375, "learning_rate": 7.960238216849508e-05, "loss": 0.3597, "step": 26910 }, { "gate_value": 0.3669755756855011, "icl_sequence_length": 84, "num_contexts": 3, "step": 26910 }, { "grad_norm": 2806.118408203125, "learning_rate": 7.949290153985608e-05, "loss": 0.363, "step": 26920 }, { "gate_value": 0.36697500944137573, "icl_sequence_length": 76, "num_contexts": 3, "step": 26920 }, { "grad_norm": 1918.9229736328125, "learning_rate": 7.938346910215402e-05, "loss": 0.3462, "step": 26930 }, { "gate_value": 0.36697521805763245, "icl_sequence_length": 82, "num_contexts": 3, "step": 26930 }, { "grad_norm": 6161.009765625, "learning_rate": 7.927408493018493e-05, "loss": 0.3719, "step": 26940 }, { "gate_value": 0.36697572469711304, "icl_sequence_length": 72, "num_contexts": 3, "step": 26940 }, { "grad_norm": 3241.89599609375, "learning_rate": 7.916474909871199e-05, "loss": 0.3612, "step": 26950 }, { "gate_value": 0.36697638034820557, "icl_sequence_length": 76, "num_contexts": 3, "step": 26950 }, { "grad_norm": 18069.666015625, "learning_rate": 7.9055461682465e-05, "loss": 0.3615, "step": 26960 }, { "gate_value": 0.36697715520858765, "icl_sequence_length": 94, "num_contexts": 3, "step": 26960 }, { "grad_norm": 32112.46484375, "learning_rate": 7.894622275614102e-05, "loss": 0.3598, "step": 26970 }, { "gate_value": 0.36697834730148315, "icl_sequence_length": 76, "num_contexts": 3, "step": 26970 }, { "grad_norm": 9550.240234375, "learning_rate": 7.883703239440377e-05, "loss": 0.3606, "step": 26980 }, { "gate_value": 0.36697885394096375, "icl_sequence_length": 56, "num_contexts": 3, "step": 26980 }, { "grad_norm": 19501.4921875, "learning_rate": 7.872789067188391e-05, "loss": 0.3556, "step": 26990 }, { "gate_value": 0.36697813868522644, "icl_sequence_length": 88, "num_contexts": 3, "step": 26990 }, { "grad_norm": 8118.97607421875, "learning_rate": 7.861879766317873e-05, "loss": 0.3732, "step": 27000 }, { "gate_value": 0.36697742342948914, "icl_sequence_length": 90, "num_contexts": 3, "step": 27000 }, { "grad_norm": 17374.55859375, "learning_rate": 7.850975344285219e-05, "loss": 0.3678, "step": 27010 }, { "gate_value": 0.3669770359992981, "icl_sequence_length": 70, "num_contexts": 3, "step": 27010 }, { "grad_norm": 69148.9453125, "learning_rate": 7.840075808543508e-05, "loss": 0.3632, "step": 27020 }, { "gate_value": 0.3669770061969757, "icl_sequence_length": 94, "num_contexts": 3, "step": 27020 }, { "grad_norm": 14847.521484375, "learning_rate": 7.829181166542464e-05, "loss": 0.366, "step": 27030 }, { "gate_value": 0.36697685718536377, "icl_sequence_length": 74, "num_contexts": 3, "step": 27030 }, { "grad_norm": 24615.54296875, "learning_rate": 7.81829142572848e-05, "loss": 0.3668, "step": 27040 }, { "gate_value": 0.36697646975517273, "icl_sequence_length": 76, "num_contexts": 3, "step": 27040 }, { "grad_norm": 36081.73828125, "learning_rate": 7.807406593544592e-05, "loss": 0.3775, "step": 27050 }, { "gate_value": 0.3669762909412384, "icl_sequence_length": 90, "num_contexts": 3, "step": 27050 }, { "grad_norm": 19997.75, "learning_rate": 7.796526677430468e-05, "loss": 0.3613, "step": 27060 }, { "gate_value": 0.3669760525226593, "icl_sequence_length": 90, "num_contexts": 3, "step": 27060 }, { "grad_norm": 18589.345703125, "learning_rate": 7.785651684822436e-05, "loss": 0.377, "step": 27070 }, { "gate_value": 0.3669760227203369, "icl_sequence_length": 72, "num_contexts": 3, "step": 27070 }, { "grad_norm": 30868.748046875, "learning_rate": 7.774781623153455e-05, "loss": 0.348, "step": 27080 }, { "gate_value": 0.36697590351104736, "icl_sequence_length": 92, "num_contexts": 3, "step": 27080 }, { "grad_norm": 16647.330078125, "learning_rate": 7.7639164998531e-05, "loss": 0.3709, "step": 27090 }, { "gate_value": 0.366975873708725, "icl_sequence_length": 88, "num_contexts": 3, "step": 27090 }, { "grad_norm": 35517.39453125, "learning_rate": 7.75305632234759e-05, "loss": 0.3702, "step": 27100 }, { "gate_value": 0.3669756054878235, "icl_sequence_length": 86, "num_contexts": 3, "step": 27100 }, { "grad_norm": 31658.29296875, "learning_rate": 7.742201098059746e-05, "loss": 0.3734, "step": 27110 }, { "gate_value": 0.36697515845298767, "icl_sequence_length": 74, "num_contexts": 3, "step": 27110 }, { "grad_norm": 13528.32421875, "learning_rate": 7.731350834409011e-05, "loss": 0.3628, "step": 27120 }, { "gate_value": 0.36697474122047424, "icl_sequence_length": 82, "num_contexts": 3, "step": 27120 }, { "grad_norm": 17795.712890625, "learning_rate": 7.720505538811444e-05, "loss": 0.3656, "step": 27130 }, { "gate_value": 0.366974413394928, "icl_sequence_length": 68, "num_contexts": 3, "step": 27130 }, { "grad_norm": 13977.12109375, "learning_rate": 7.709665218679698e-05, "loss": 0.3694, "step": 27140 }, { "gate_value": 0.36697402596473694, "icl_sequence_length": 72, "num_contexts": 3, "step": 27140 }, { "grad_norm": 26547.1484375, "learning_rate": 7.698829881423039e-05, "loss": 0.3581, "step": 27150 }, { "gate_value": 0.3669734299182892, "icl_sequence_length": 74, "num_contexts": 3, "step": 27150 }, { "grad_norm": 9316.2724609375, "learning_rate": 7.687999534447303e-05, "loss": 0.3642, "step": 27160 }, { "gate_value": 0.366972953081131, "icl_sequence_length": 78, "num_contexts": 3, "step": 27160 }, { "grad_norm": 12746.78515625, "learning_rate": 7.677174185154943e-05, "loss": 0.3871, "step": 27170 }, { "gate_value": 0.36697226762771606, "icl_sequence_length": 82, "num_contexts": 3, "step": 27170 }, { "grad_norm": 11482.865234375, "learning_rate": 7.666353840944972e-05, "loss": 0.3691, "step": 27180 }, { "gate_value": 0.36697155237197876, "icl_sequence_length": 76, "num_contexts": 3, "step": 27180 }, { "grad_norm": 14710.9580078125, "learning_rate": 7.655538509212998e-05, "loss": 0.3585, "step": 27190 }, { "gate_value": 0.36697089672088623, "icl_sequence_length": 86, "num_contexts": 3, "step": 27190 }, { "grad_norm": 8220.4599609375, "learning_rate": 7.644728197351205e-05, "loss": 0.363, "step": 27200 }, { "gate_value": 0.36697012186050415, "icl_sequence_length": 78, "num_contexts": 3, "step": 27200 }, { "grad_norm": 2625.1962890625, "learning_rate": 7.633922912748328e-05, "loss": 0.3613, "step": 27210 }, { "gate_value": 0.36697015166282654, "icl_sequence_length": 78, "num_contexts": 3, "step": 27210 }, { "grad_norm": 662.7630004882812, "learning_rate": 7.623122662789681e-05, "loss": 0.3677, "step": 27220 }, { "gate_value": 0.3669702708721161, "icl_sequence_length": 62, "num_contexts": 3, "step": 27220 }, { "grad_norm": 1302.579345703125, "learning_rate": 7.612327454857134e-05, "loss": 0.3699, "step": 27230 }, { "gate_value": 0.3669745922088623, "icl_sequence_length": 66, "num_contexts": 3, "step": 27230 }, { "grad_norm": 712.6267700195312, "learning_rate": 7.601537296329109e-05, "loss": 0.3561, "step": 27240 }, { "gate_value": 0.36698177456855774, "icl_sequence_length": 80, "num_contexts": 3, "step": 27240 }, { "grad_norm": 1077.0098876953125, "learning_rate": 7.590752194580589e-05, "loss": 0.3671, "step": 27250 }, { "gate_value": 0.3669895827770233, "icl_sequence_length": 82, "num_contexts": 3, "step": 27250 }, { "grad_norm": 1208.8837890625, "learning_rate": 7.579972156983075e-05, "loss": 0.3763, "step": 27260 }, { "gate_value": 0.36700811982154846, "icl_sequence_length": 70, "num_contexts": 3, "step": 27260 }, { "grad_norm": 390.91094970703125, "learning_rate": 7.56919719090462e-05, "loss": 0.3694, "step": 27270 }, { "gate_value": 0.3670542538166046, "icl_sequence_length": 76, "num_contexts": 3, "step": 27270 }, { "grad_norm": 269.8956604003906, "learning_rate": 7.558427303709817e-05, "loss": 0.3803, "step": 27280 }, { "gate_value": 0.36706268787384033, "icl_sequence_length": 64, "num_contexts": 3, "step": 27280 }, { "grad_norm": 39.60105895996094, "learning_rate": 7.547662502759783e-05, "loss": 0.3695, "step": 27290 }, { "gate_value": 0.36706313490867615, "icl_sequence_length": 70, "num_contexts": 3, "step": 27290 }, { "grad_norm": 1831.5201416015625, "learning_rate": 7.536902795412159e-05, "loss": 0.3556, "step": 27300 }, { "gate_value": 0.366983562707901, "icl_sequence_length": 70, "num_contexts": 3, "step": 27300 }, { "grad_norm": 1605.810302734375, "learning_rate": 7.5261481890211e-05, "loss": 0.3733, "step": 27310 }, { "gate_value": 0.3669770359992981, "icl_sequence_length": 92, "num_contexts": 3, "step": 27310 }, { "grad_norm": 68.8116683959961, "learning_rate": 7.515398690937279e-05, "loss": 0.3808, "step": 27320 }, { "gate_value": 0.36701732873916626, "icl_sequence_length": 78, "num_contexts": 3, "step": 27320 }, { "grad_norm": 18.84232521057129, "learning_rate": 7.504654308507875e-05, "loss": 0.3664, "step": 27330 }, { "gate_value": 0.36712032556533813, "icl_sequence_length": 74, "num_contexts": 3, "step": 27330 }, { "grad_norm": 41.52174377441406, "learning_rate": 7.493915049076576e-05, "loss": 0.3641, "step": 27340 }, { "gate_value": 0.36718326807022095, "icl_sequence_length": 70, "num_contexts": 3, "step": 27340 }, { "grad_norm": 24.148374557495117, "learning_rate": 7.48318091998357e-05, "loss": 0.3496, "step": 27350 }, { "gate_value": 0.3673146963119507, "icl_sequence_length": 88, "num_contexts": 3, "step": 27350 }, { "grad_norm": 14.730865478515625, "learning_rate": 7.472451928565523e-05, "loss": 0.3738, "step": 27360 }, { "gate_value": 0.3674127161502838, "icl_sequence_length": 86, "num_contexts": 3, "step": 27360 }, { "grad_norm": 63.99334716796875, "learning_rate": 7.461728082155597e-05, "loss": 0.3546, "step": 27370 }, { "gate_value": 0.36764848232269287, "icl_sequence_length": 86, "num_contexts": 3, "step": 27370 }, { "grad_norm": 42.657657623291016, "learning_rate": 7.451009388083445e-05, "loss": 0.3724, "step": 27380 }, { "gate_value": 0.3677850663661957, "icl_sequence_length": 64, "num_contexts": 3, "step": 27380 }, { "grad_norm": 61.866371154785156, "learning_rate": 7.440295853675195e-05, "loss": 0.3768, "step": 27390 }, { "gate_value": 0.3676343560218811, "icl_sequence_length": 86, "num_contexts": 3, "step": 27390 }, { "grad_norm": 7950.65771484375, "learning_rate": 7.42958748625345e-05, "loss": 0.3605, "step": 27400 }, { "gate_value": 0.3675735890865326, "icl_sequence_length": 90, "num_contexts": 3, "step": 27400 }, { "grad_norm": 22.300975799560547, "learning_rate": 7.418884293137267e-05, "loss": 0.363, "step": 27410 }, { "gate_value": 0.3675788938999176, "icl_sequence_length": 76, "num_contexts": 3, "step": 27410 }, { "grad_norm": 2110.2626953125, "learning_rate": 7.408186281642186e-05, "loss": 0.3656, "step": 27420 }, { "gate_value": 0.3675514757633209, "icl_sequence_length": 80, "num_contexts": 3, "step": 27420 }, { "grad_norm": 32832.37890625, "learning_rate": 7.397493459080193e-05, "loss": 0.3811, "step": 27430 }, { "gate_value": 0.36745667457580566, "icl_sequence_length": 76, "num_contexts": 3, "step": 27430 }, { "grad_norm": 111.27005004882812, "learning_rate": 7.386805832759735e-05, "loss": 0.3723, "step": 27440 }, { "gate_value": 0.36754679679870605, "icl_sequence_length": 78, "num_contexts": 3, "step": 27440 }, { "grad_norm": 67.27931213378906, "learning_rate": 7.376123409985707e-05, "loss": 0.3629, "step": 27450 }, { "gate_value": 0.3676038086414337, "icl_sequence_length": 80, "num_contexts": 3, "step": 27450 }, { "grad_norm": 74.29624938964844, "learning_rate": 7.36544619805944e-05, "loss": 0.3606, "step": 27460 }, { "gate_value": 0.36755993962287903, "icl_sequence_length": 94, "num_contexts": 3, "step": 27460 }, { "grad_norm": 14.337113380432129, "learning_rate": 7.3547742042787e-05, "loss": 0.364, "step": 27470 }, { "gate_value": 0.36757075786590576, "icl_sequence_length": 80, "num_contexts": 3, "step": 27470 }, { "grad_norm": 26.758407592773438, "learning_rate": 7.344107435937703e-05, "loss": 0.3781, "step": 27480 }, { "gate_value": 0.3676011860370636, "icl_sequence_length": 88, "num_contexts": 3, "step": 27480 }, { "grad_norm": 250.94358825683594, "learning_rate": 7.333445900327082e-05, "loss": 0.3616, "step": 27490 }, { "gate_value": 0.3674643933773041, "icl_sequence_length": 88, "num_contexts": 3, "step": 27490 }, { "grad_norm": 16.786155700683594, "learning_rate": 7.322789604733902e-05, "loss": 0.3666, "step": 27500 }, { "gate_value": 0.36760422587394714, "icl_sequence_length": 90, "num_contexts": 3, "step": 27500 }, { "grad_norm": 35.89059066772461, "learning_rate": 7.31213855644163e-05, "loss": 0.3469, "step": 27510 }, { "gate_value": 0.3677339553833008, "icl_sequence_length": 88, "num_contexts": 3, "step": 27510 }, { "grad_norm": 19.958139419555664, "learning_rate": 7.301492762730162e-05, "loss": 0.3761, "step": 27520 }, { "gate_value": 0.36780139803886414, "icl_sequence_length": 70, "num_contexts": 3, "step": 27520 }, { "grad_norm": 78.30398559570312, "learning_rate": 7.2908522308758e-05, "loss": 0.3981, "step": 27530 }, { "gate_value": 0.36783716082572937, "icl_sequence_length": 70, "num_contexts": 3, "step": 27530 }, { "grad_norm": 608.5565795898438, "learning_rate": 7.280216968151249e-05, "loss": 0.3739, "step": 27540 }, { "gate_value": 0.3678564429283142, "icl_sequence_length": 82, "num_contexts": 3, "step": 27540 }, { "grad_norm": 15003.1396484375, "learning_rate": 7.269586981825602e-05, "loss": 0.3671, "step": 27550 }, { "gate_value": 0.36783695220947266, "icl_sequence_length": 92, "num_contexts": 3, "step": 27550 }, { "grad_norm": 92.22562408447266, "learning_rate": 7.258962279164366e-05, "loss": 0.3814, "step": 27560 }, { "gate_value": 0.3677425980567932, "icl_sequence_length": 76, "num_contexts": 3, "step": 27560 }, { "grad_norm": 19.08238410949707, "learning_rate": 7.248342867429412e-05, "loss": 0.3819, "step": 27570 }, { "gate_value": 0.3676200211048126, "icl_sequence_length": 70, "num_contexts": 3, "step": 27570 }, { "grad_norm": 33.2855224609375, "learning_rate": 7.237728753879014e-05, "loss": 0.3657, "step": 27580 }, { "gate_value": 0.36787912249565125, "icl_sequence_length": 76, "num_contexts": 3, "step": 27580 }, { "grad_norm": 50.328094482421875, "learning_rate": 7.22711994576782e-05, "loss": 0.3521, "step": 27590 }, { "gate_value": 0.3680151700973511, "icl_sequence_length": 74, "num_contexts": 3, "step": 27590 }, { "grad_norm": 24.794818878173828, "learning_rate": 7.216516450346853e-05, "loss": 0.3669, "step": 27600 }, { "gate_value": 0.36789053678512573, "icl_sequence_length": 80, "num_contexts": 3, "step": 27600 }, { "grad_norm": 58.86216735839844, "learning_rate": 7.205918274863495e-05, "loss": 0.371, "step": 27610 }, { "gate_value": 0.3678717315196991, "icl_sequence_length": 70, "num_contexts": 3, "step": 27610 }, { "grad_norm": 21.35430145263672, "learning_rate": 7.195325426561501e-05, "loss": 0.353, "step": 27620 }, { "gate_value": 0.3678174912929535, "icl_sequence_length": 70, "num_contexts": 3, "step": 27620 }, { "grad_norm": 18.842321395874023, "learning_rate": 7.184737912680985e-05, "loss": 0.3591, "step": 27630 }, { "gate_value": 0.3679717481136322, "icl_sequence_length": 84, "num_contexts": 3, "step": 27630 }, { "grad_norm": 13.276396751403809, "learning_rate": 7.174155740458416e-05, "loss": 0.3645, "step": 27640 }, { "gate_value": 0.36811956763267517, "icl_sequence_length": 78, "num_contexts": 3, "step": 27640 }, { "grad_norm": 7.989704608917236, "learning_rate": 7.163578917126602e-05, "loss": 0.3766, "step": 27650 }, { "gate_value": 0.3683876693248749, "icl_sequence_length": 84, "num_contexts": 3, "step": 27650 }, { "grad_norm": 12.756558418273926, "learning_rate": 7.15300744991471e-05, "loss": 0.35, "step": 27660 }, { "gate_value": 0.3684402108192444, "icl_sequence_length": 84, "num_contexts": 3, "step": 27660 }, { "grad_norm": 20.83132553100586, "learning_rate": 7.142441346048227e-05, "loss": 0.3865, "step": 27670 }, { "gate_value": 0.36845216155052185, "icl_sequence_length": 82, "num_contexts": 3, "step": 27670 }, { "grad_norm": 19.569744110107422, "learning_rate": 7.131880612748991e-05, "loss": 0.3727, "step": 27680 }, { "gate_value": 0.3685762584209442, "icl_sequence_length": 78, "num_contexts": 3, "step": 27680 }, { "grad_norm": 185.0944366455078, "learning_rate": 7.121325257235165e-05, "loss": 0.366, "step": 27690 }, { "gate_value": 0.3684345483779907, "icl_sequence_length": 92, "num_contexts": 3, "step": 27690 }, { "grad_norm": 700.8203125, "learning_rate": 7.110775286721242e-05, "loss": 0.362, "step": 27700 }, { "gate_value": 0.3685183525085449, "icl_sequence_length": 84, "num_contexts": 3, "step": 27700 }, { "grad_norm": 26.56828498840332, "learning_rate": 7.10023070841801e-05, "loss": 0.3669, "step": 27710 }, { "gate_value": 0.3687787652015686, "icl_sequence_length": 74, "num_contexts": 3, "step": 27710 }, { "grad_norm": 26.16849136352539, "learning_rate": 7.0896915295326e-05, "loss": 0.3551, "step": 27720 }, { "gate_value": 0.36871036887168884, "icl_sequence_length": 72, "num_contexts": 3, "step": 27720 }, { "grad_norm": 28.85477066040039, "learning_rate": 7.079157757268446e-05, "loss": 0.3546, "step": 27730 }, { "gate_value": 0.3683936297893524, "icl_sequence_length": 88, "num_contexts": 3, "step": 27730 }, { "grad_norm": 54.32163619995117, "learning_rate": 7.06862939882527e-05, "loss": 0.3639, "step": 27740 }, { "gate_value": 0.3683428466320038, "icl_sequence_length": 80, "num_contexts": 3, "step": 27740 }, { "grad_norm": 954.1820068359375, "learning_rate": 7.058106461399111e-05, "loss": 0.3498, "step": 27750 }, { "gate_value": 0.3682975769042969, "icl_sequence_length": 76, "num_contexts": 3, "step": 27750 }, { "grad_norm": 58.116432189941406, "learning_rate": 7.047588952182304e-05, "loss": 0.3796, "step": 27760 }, { "gate_value": 0.36843323707580566, "icl_sequence_length": 84, "num_contexts": 3, "step": 27760 }, { "grad_norm": 24.86961555480957, "learning_rate": 7.037076878363458e-05, "loss": 0.3611, "step": 27770 }, { "gate_value": 0.36853909492492676, "icl_sequence_length": 76, "num_contexts": 3, "step": 27770 }, { "grad_norm": 23.842926025390625, "learning_rate": 7.026570247127476e-05, "loss": 0.3666, "step": 27780 }, { "gate_value": 0.3685752749443054, "icl_sequence_length": 86, "num_contexts": 3, "step": 27780 }, { "grad_norm": 81.14399719238281, "learning_rate": 7.016069065655547e-05, "loss": 0.3641, "step": 27790 }, { "gate_value": 0.3687325716018677, "icl_sequence_length": 76, "num_contexts": 3, "step": 27790 }, { "grad_norm": 14.452661514282227, "learning_rate": 7.005573341125133e-05, "loss": 0.3661, "step": 27800 }, { "gate_value": 0.36890509724617004, "icl_sequence_length": 76, "num_contexts": 3, "step": 27800 }, { "grad_norm": 12.318946838378906, "learning_rate": 6.995083080709951e-05, "loss": 0.3793, "step": 27810 }, { "gate_value": 0.36927101016044617, "icl_sequence_length": 74, "num_contexts": 3, "step": 27810 }, { "grad_norm": 9.148600578308105, "learning_rate": 6.98459829158001e-05, "loss": 0.3592, "step": 27820 }, { "gate_value": 0.36944547295570374, "icl_sequence_length": 74, "num_contexts": 3, "step": 27820 }, { "grad_norm": 11.932467460632324, "learning_rate": 6.974118980901546e-05, "loss": 0.3548, "step": 27830 }, { "gate_value": 0.3691197633743286, "icl_sequence_length": 82, "num_contexts": 3, "step": 27830 }, { "grad_norm": 13.40058422088623, "learning_rate": 6.963645155837084e-05, "loss": 0.3606, "step": 27840 }, { "gate_value": 0.3690016567707062, "icl_sequence_length": 72, "num_contexts": 3, "step": 27840 }, { "grad_norm": 9.346571922302246, "learning_rate": 6.953176823545375e-05, "loss": 0.381, "step": 27850 }, { "gate_value": 0.3696131706237793, "icl_sequence_length": 66, "num_contexts": 3, "step": 27850 }, { "grad_norm": 16.809316635131836, "learning_rate": 6.942713991181439e-05, "loss": 0.3705, "step": 27860 }, { "gate_value": 0.36983686685562134, "icl_sequence_length": 82, "num_contexts": 3, "step": 27860 }, { "grad_norm": 28.75189971923828, "learning_rate": 6.932256665896507e-05, "loss": 0.358, "step": 27870 }, { "gate_value": 0.3699440658092499, "icl_sequence_length": 88, "num_contexts": 3, "step": 27870 }, { "grad_norm": 6.663064002990723, "learning_rate": 6.92180485483807e-05, "loss": 0.3628, "step": 27880 }, { "gate_value": 0.3699015974998474, "icl_sequence_length": 86, "num_contexts": 3, "step": 27880 }, { "grad_norm": 16.0593204498291, "learning_rate": 6.911358565149842e-05, "loss": 0.3758, "step": 27890 }, { "gate_value": 0.36944106221199036, "icl_sequence_length": 68, "num_contexts": 3, "step": 27890 }, { "grad_norm": 8.876472473144531, "learning_rate": 6.90091780397177e-05, "loss": 0.373, "step": 27900 }, { "gate_value": 0.3694072961807251, "icl_sequence_length": 76, "num_contexts": 3, "step": 27900 }, { "grad_norm": 38.90917205810547, "learning_rate": 6.890482578440002e-05, "loss": 0.3733, "step": 27910 }, { "gate_value": 0.3693521320819855, "icl_sequence_length": 90, "num_contexts": 3, "step": 27910 }, { "grad_norm": 16.983854293823242, "learning_rate": 6.88005289568693e-05, "loss": 0.3763, "step": 27920 }, { "gate_value": 0.36918890476226807, "icl_sequence_length": 84, "num_contexts": 3, "step": 27920 }, { "grad_norm": 11.926179885864258, "learning_rate": 6.869628762841132e-05, "loss": 0.3716, "step": 27930 }, { "gate_value": 0.3690800368785858, "icl_sequence_length": 72, "num_contexts": 3, "step": 27930 }, { "grad_norm": 47.29314041137695, "learning_rate": 6.859210187027408e-05, "loss": 0.3642, "step": 27940 }, { "gate_value": 0.36900752782821655, "icl_sequence_length": 86, "num_contexts": 3, "step": 27940 }, { "grad_norm": 13.692339897155762, "learning_rate": 6.848797175366759e-05, "loss": 0.365, "step": 27950 }, { "gate_value": 0.36898988485336304, "icl_sequence_length": 74, "num_contexts": 3, "step": 27950 }, { "grad_norm": 15.379136085510254, "learning_rate": 6.838389734976386e-05, "loss": 0.3793, "step": 27960 }, { "gate_value": 0.36915260553359985, "icl_sequence_length": 66, "num_contexts": 3, "step": 27960 }, { "grad_norm": 34.57461929321289, "learning_rate": 6.827987872969663e-05, "loss": 0.3821, "step": 27970 }, { "gate_value": 0.3692905008792877, "icl_sequence_length": 88, "num_contexts": 3, "step": 27970 }, { "grad_norm": 23.75289535522461, "learning_rate": 6.817591596456173e-05, "loss": 0.3673, "step": 27980 }, { "gate_value": 0.36970794200897217, "icl_sequence_length": 84, "num_contexts": 3, "step": 27980 }, { "grad_norm": 43.34590148925781, "learning_rate": 6.80720091254167e-05, "loss": 0.3745, "step": 27990 }, { "gate_value": 0.36998069286346436, "icl_sequence_length": 88, "num_contexts": 3, "step": 27990 }, { "grad_norm": 9.07591438293457, "learning_rate": 6.796815828328096e-05, "loss": 0.3683, "step": 28000 }, { "gate_value": 0.37007784843444824, "icl_sequence_length": 80, "num_contexts": 3, "step": 28000 }, { "grad_norm": 15.812060356140137, "learning_rate": 6.78643635091355e-05, "loss": 0.368, "step": 28010 }, { "gate_value": 0.3701871633529663, "icl_sequence_length": 88, "num_contexts": 3, "step": 28010 }, { "grad_norm": 753.4508056640625, "learning_rate": 6.776062487392305e-05, "loss": 0.3807, "step": 28020 }, { "gate_value": 0.3702496886253357, "icl_sequence_length": 88, "num_contexts": 3, "step": 28020 }, { "grad_norm": 16.918176651000977, "learning_rate": 6.765694244854803e-05, "loss": 0.3642, "step": 28030 }, { "gate_value": 0.3702527582645416, "icl_sequence_length": 88, "num_contexts": 3, "step": 28030 }, { "grad_norm": 28.58259391784668, "learning_rate": 6.75533163038764e-05, "loss": 0.3822, "step": 28040 }, { "gate_value": 0.3701658546924591, "icl_sequence_length": 82, "num_contexts": 3, "step": 28040 }, { "grad_norm": 16.40875244140625, "learning_rate": 6.744974651073563e-05, "loss": 0.362, "step": 28050 }, { "gate_value": 0.37029552459716797, "icl_sequence_length": 84, "num_contexts": 3, "step": 28050 }, { "grad_norm": 6.374593734741211, "learning_rate": 6.734623313991478e-05, "loss": 0.384, "step": 28060 }, { "gate_value": 0.3704751133918762, "icl_sequence_length": 84, "num_contexts": 3, "step": 28060 }, { "grad_norm": 18.405296325683594, "learning_rate": 6.724277626216416e-05, "loss": 0.3776, "step": 28070 }, { "gate_value": 0.37067288160324097, "icl_sequence_length": 82, "num_contexts": 3, "step": 28070 }, { "grad_norm": 12.541691780090332, "learning_rate": 6.71393759481956e-05, "loss": 0.3754, "step": 28080 }, { "gate_value": 0.37075117230415344, "icl_sequence_length": 68, "num_contexts": 3, "step": 28080 }, { "grad_norm": 29.952152252197266, "learning_rate": 6.703603226868226e-05, "loss": 0.3633, "step": 28090 }, { "gate_value": 0.3708937466144562, "icl_sequence_length": 72, "num_contexts": 3, "step": 28090 }, { "grad_norm": 13.219239234924316, "learning_rate": 6.69327452942586e-05, "loss": 0.3807, "step": 28100 }, { "gate_value": 0.3709869384765625, "icl_sequence_length": 94, "num_contexts": 3, "step": 28100 }, { "grad_norm": 28.397701263427734, "learning_rate": 6.682951509552025e-05, "loss": 0.3772, "step": 28110 }, { "gate_value": 0.3711105287075043, "icl_sequence_length": 86, "num_contexts": 3, "step": 28110 }, { "grad_norm": 40.45359420776367, "learning_rate": 6.672634174302405e-05, "loss": 0.3859, "step": 28120 }, { "gate_value": 0.3712112307548523, "icl_sequence_length": 76, "num_contexts": 3, "step": 28120 }, { "grad_norm": 24.582040786743164, "learning_rate": 6.662322530728805e-05, "loss": 0.3726, "step": 28130 }, { "gate_value": 0.3713975250720978, "icl_sequence_length": 84, "num_contexts": 3, "step": 28130 }, { "grad_norm": 39473.47265625, "learning_rate": 6.652016585879133e-05, "loss": 0.376, "step": 28140 }, { "gate_value": 0.37156111001968384, "icl_sequence_length": 76, "num_contexts": 3, "step": 28140 }, { "grad_norm": 38.606075286865234, "learning_rate": 6.64171634679741e-05, "loss": 0.3557, "step": 28150 }, { "gate_value": 0.37173065543174744, "icl_sequence_length": 94, "num_contexts": 3, "step": 28150 }, { "grad_norm": 25.573253631591797, "learning_rate": 6.631421820523755e-05, "loss": 0.3831, "step": 28160 }, { "gate_value": 0.3719416856765747, "icl_sequence_length": 86, "num_contexts": 3, "step": 28160 }, { "grad_norm": 10.938640594482422, "learning_rate": 6.621133014094367e-05, "loss": 0.3608, "step": 28170 }, { "gate_value": 0.3720950484275818, "icl_sequence_length": 62, "num_contexts": 3, "step": 28170 }, { "grad_norm": 7.006185054779053, "learning_rate": 6.610849934541557e-05, "loss": 0.3519, "step": 28180 }, { "gate_value": 0.3721492886543274, "icl_sequence_length": 72, "num_contexts": 3, "step": 28180 }, { "grad_norm": 10.840231895446777, "learning_rate": 6.600572588893712e-05, "loss": 0.3721, "step": 28190 }, { "gate_value": 0.3722116947174072, "icl_sequence_length": 90, "num_contexts": 3, "step": 28190 }, { "grad_norm": 11.135544776916504, "learning_rate": 6.590300984175306e-05, "loss": 0.3454, "step": 28200 }, { "gate_value": 0.37256065011024475, "icl_sequence_length": 90, "num_contexts": 3, "step": 28200 }, { "grad_norm": 116.5069580078125, "learning_rate": 6.580035127406874e-05, "loss": 0.3552, "step": 28210 }, { "gate_value": 0.37304627895355225, "icl_sequence_length": 66, "num_contexts": 3, "step": 28210 }, { "grad_norm": 11.667895317077637, "learning_rate": 6.569775025605042e-05, "loss": 0.3636, "step": 28220 }, { "gate_value": 0.3732167184352875, "icl_sequence_length": 92, "num_contexts": 3, "step": 28220 }, { "grad_norm": 8.537809371948242, "learning_rate": 6.559520685782481e-05, "loss": 0.3553, "step": 28230 }, { "gate_value": 0.373494029045105, "icl_sequence_length": 64, "num_contexts": 3, "step": 28230 }, { "grad_norm": 13.881553649902344, "learning_rate": 6.549272114947945e-05, "loss": 0.3553, "step": 28240 }, { "gate_value": 0.37379199266433716, "icl_sequence_length": 76, "num_contexts": 3, "step": 28240 }, { "grad_norm": 16.013992309570312, "learning_rate": 6.539029320106232e-05, "loss": 0.3729, "step": 28250 }, { "gate_value": 0.3742316961288452, "icl_sequence_length": 80, "num_contexts": 3, "step": 28250 }, { "grad_norm": 831.845703125, "learning_rate": 6.5287923082582e-05, "loss": 0.3863, "step": 28260 }, { "gate_value": 0.3742944896221161, "icl_sequence_length": 66, "num_contexts": 3, "step": 28260 }, { "grad_norm": 21.637969970703125, "learning_rate": 6.518561086400742e-05, "loss": 0.3844, "step": 28270 }, { "gate_value": 0.3739367127418518, "icl_sequence_length": 86, "num_contexts": 3, "step": 28270 }, { "grad_norm": 46.8300895690918, "learning_rate": 6.508335661526808e-05, "loss": 0.3752, "step": 28280 }, { "gate_value": 0.3740937113761902, "icl_sequence_length": 76, "num_contexts": 3, "step": 28280 }, { "grad_norm": 19.60430145263672, "learning_rate": 6.498116040625382e-05, "loss": 0.3773, "step": 28290 }, { "gate_value": 0.3742987811565399, "icl_sequence_length": 70, "num_contexts": 3, "step": 28290 }, { "grad_norm": 13.661314010620117, "learning_rate": 6.487902230681468e-05, "loss": 0.3614, "step": 28300 }, { "gate_value": 0.3744341731071472, "icl_sequence_length": 70, "num_contexts": 3, "step": 28300 }, { "grad_norm": 103.14254760742188, "learning_rate": 6.477694238676116e-05, "loss": 0.3723, "step": 28310 }, { "gate_value": 0.37441954016685486, "icl_sequence_length": 76, "num_contexts": 3, "step": 28310 }, { "grad_norm": 2326.118896484375, "learning_rate": 6.467492071586395e-05, "loss": 0.3766, "step": 28320 }, { "gate_value": 0.37444210052490234, "icl_sequence_length": 68, "num_contexts": 3, "step": 28320 }, { "grad_norm": 409.30987548828125, "learning_rate": 6.457295736385383e-05, "loss": 0.3519, "step": 28330 }, { "gate_value": 0.37455645203590393, "icl_sequence_length": 74, "num_contexts": 3, "step": 28330 }, { "grad_norm": 36.271575927734375, "learning_rate": 6.447105240042181e-05, "loss": 0.3723, "step": 28340 }, { "gate_value": 0.3745886981487274, "icl_sequence_length": 90, "num_contexts": 3, "step": 28340 }, { "grad_norm": 9.298629760742188, "learning_rate": 6.4369205895219e-05, "loss": 0.3602, "step": 28350 }, { "gate_value": 0.37470346689224243, "icl_sequence_length": 82, "num_contexts": 3, "step": 28350 }, { "grad_norm": 19.39296531677246, "learning_rate": 6.426741791785656e-05, "loss": 0.3644, "step": 28360 }, { "gate_value": 0.37486281991004944, "icl_sequence_length": 80, "num_contexts": 3, "step": 28360 }, { "grad_norm": 15.315899848937988, "learning_rate": 6.416568853790549e-05, "loss": 0.3484, "step": 28370 }, { "gate_value": 0.3749953508377075, "icl_sequence_length": 86, "num_contexts": 3, "step": 28370 }, { "grad_norm": 19.064598083496094, "learning_rate": 6.406401782489702e-05, "loss": 0.3667, "step": 28380 }, { "gate_value": 0.37507161498069763, "icl_sequence_length": 96, "num_contexts": 3, "step": 28380 }, { "grad_norm": 31.075756072998047, "learning_rate": 6.396240584832196e-05, "loss": 0.3682, "step": 28390 }, { "gate_value": 0.37511423230171204, "icl_sequence_length": 78, "num_contexts": 3, "step": 28390 }, { "grad_norm": 23.362079620361328, "learning_rate": 6.386085267763122e-05, "loss": 0.3665, "step": 28400 }, { "gate_value": 0.3751569092273712, "icl_sequence_length": 66, "num_contexts": 3, "step": 28400 }, { "grad_norm": 125.1460189819336, "learning_rate": 6.375935838223545e-05, "loss": 0.3623, "step": 28410 }, { "gate_value": 0.37524327635765076, "icl_sequence_length": 86, "num_contexts": 3, "step": 28410 }, { "grad_norm": 112.93570709228516, "learning_rate": 6.365792303150505e-05, "loss": 0.3568, "step": 28420 }, { "gate_value": 0.3753010928630829, "icl_sequence_length": 66, "num_contexts": 3, "step": 28420 }, { "grad_norm": 41.79950714111328, "learning_rate": 6.355654669477006e-05, "loss": 0.3566, "step": 28430 }, { "gate_value": 0.37531712651252747, "icl_sequence_length": 76, "num_contexts": 3, "step": 28430 }, { "grad_norm": 49.83927917480469, "learning_rate": 6.345522944132031e-05, "loss": 0.3622, "step": 28440 }, { "gate_value": 0.37537771463394165, "icl_sequence_length": 66, "num_contexts": 3, "step": 28440 }, { "grad_norm": 21.510791778564453, "learning_rate": 6.335397134040515e-05, "loss": 0.3845, "step": 28450 }, { "gate_value": 0.3755156397819519, "icl_sequence_length": 86, "num_contexts": 3, "step": 28450 }, { "grad_norm": 161.8980255126953, "learning_rate": 6.325277246123362e-05, "loss": 0.3502, "step": 28460 }, { "gate_value": 0.3758998513221741, "icl_sequence_length": 80, "num_contexts": 3, "step": 28460 }, { "grad_norm": 12.291893005371094, "learning_rate": 6.315163287297407e-05, "loss": 0.3496, "step": 28470 }, { "gate_value": 0.3761850595474243, "icl_sequence_length": 76, "num_contexts": 3, "step": 28470 }, { "grad_norm": 58.854331970214844, "learning_rate": 6.305055264475457e-05, "loss": 0.3675, "step": 28480 }, { "gate_value": 0.3762570023536682, "icl_sequence_length": 94, "num_contexts": 3, "step": 28480 }, { "grad_norm": 75.17032623291016, "learning_rate": 6.294953184566241e-05, "loss": 0.3631, "step": 28490 }, { "gate_value": 0.3762133717536926, "icl_sequence_length": 90, "num_contexts": 3, "step": 28490 }, { "grad_norm": 26.945207595825195, "learning_rate": 6.284857054474439e-05, "loss": 0.3608, "step": 28500 }, { "gate_value": 0.3762674033641815, "icl_sequence_length": 92, "num_contexts": 3, "step": 28500 }, { "grad_norm": 4600.4619140625, "learning_rate": 6.274766881100662e-05, "loss": 0.3755, "step": 28510 }, { "gate_value": 0.37631040811538696, "icl_sequence_length": 70, "num_contexts": 3, "step": 28510 }, { "grad_norm": 8.499515533447266, "learning_rate": 6.264682671341452e-05, "loss": 0.3622, "step": 28520 }, { "gate_value": 0.3764779567718506, "icl_sequence_length": 68, "num_contexts": 3, "step": 28520 }, { "grad_norm": 24.589887619018555, "learning_rate": 6.254604432089263e-05, "loss": 0.3621, "step": 28530 }, { "gate_value": 0.3765389025211334, "icl_sequence_length": 88, "num_contexts": 3, "step": 28530 }, { "grad_norm": 1568.8673095703125, "learning_rate": 6.24453217023248e-05, "loss": 0.3611, "step": 28540 }, { "gate_value": 0.37654218077659607, "icl_sequence_length": 72, "num_contexts": 3, "step": 28540 }, { "grad_norm": 22.204059600830078, "learning_rate": 6.2344658926554e-05, "loss": 0.3413, "step": 28550 }, { "gate_value": 0.3768579065799713, "icl_sequence_length": 86, "num_contexts": 3, "step": 28550 }, { "grad_norm": 27.134193420410156, "learning_rate": 6.224405606238233e-05, "loss": 0.3641, "step": 28560 }, { "gate_value": 0.3769153356552124, "icl_sequence_length": 88, "num_contexts": 3, "step": 28560 }, { "grad_norm": 23.41484260559082, "learning_rate": 6.214351317857085e-05, "loss": 0.3559, "step": 28570 }, { "gate_value": 0.3770342767238617, "icl_sequence_length": 78, "num_contexts": 3, "step": 28570 }, { "grad_norm": 16.700103759765625, "learning_rate": 6.204303034383964e-05, "loss": 0.3532, "step": 28580 }, { "gate_value": 0.3770424425601959, "icl_sequence_length": 80, "num_contexts": 3, "step": 28580 }, { "grad_norm": 25.210561752319336, "learning_rate": 6.194260762686779e-05, "loss": 0.3631, "step": 28590 }, { "gate_value": 0.37705928087234497, "icl_sequence_length": 90, "num_contexts": 3, "step": 28590 }, { "grad_norm": 7.759881019592285, "learning_rate": 6.184224509629329e-05, "loss": 0.3506, "step": 28600 }, { "gate_value": 0.3770466148853302, "icl_sequence_length": 90, "num_contexts": 3, "step": 28600 }, { "grad_norm": 23.259002685546875, "learning_rate": 6.1741942820713e-05, "loss": 0.3729, "step": 28610 }, { "gate_value": 0.37719234824180603, "icl_sequence_length": 78, "num_contexts": 3, "step": 28610 }, { "grad_norm": 8.094666481018066, "learning_rate": 6.164170086868262e-05, "loss": 0.3661, "step": 28620 }, { "gate_value": 0.37746044993400574, "icl_sequence_length": 80, "num_contexts": 3, "step": 28620 }, { "grad_norm": 12.716617584228516, "learning_rate": 6.154151930871646e-05, "loss": 0.3748, "step": 28630 }, { "gate_value": 0.3775232434272766, "icl_sequence_length": 72, "num_contexts": 3, "step": 28630 }, { "grad_norm": 28.23755645751953, "learning_rate": 6.144139820928774e-05, "loss": 0.3596, "step": 28640 }, { "gate_value": 0.37761175632476807, "icl_sequence_length": 84, "num_contexts": 3, "step": 28640 }, { "grad_norm": 20.320539474487305, "learning_rate": 6.134133763882831e-05, "loss": 0.3725, "step": 28650 }, { "gate_value": 0.37779858708381653, "icl_sequence_length": 94, "num_contexts": 3, "step": 28650 }, { "grad_norm": 13.410603523254395, "learning_rate": 6.124133766572864e-05, "loss": 0.3821, "step": 28660 }, { "gate_value": 0.37793204188346863, "icl_sequence_length": 80, "num_contexts": 3, "step": 28660 }, { "grad_norm": 19.16880989074707, "learning_rate": 6.114139835833773e-05, "loss": 0.377, "step": 28670 }, { "gate_value": 0.3780597150325775, "icl_sequence_length": 76, "num_contexts": 3, "step": 28670 }, { "grad_norm": 23.84910774230957, "learning_rate": 6.10415197849631e-05, "loss": 0.3646, "step": 28680 }, { "gate_value": 0.37815070152282715, "icl_sequence_length": 74, "num_contexts": 3, "step": 28680 }, { "grad_norm": 13.212060928344727, "learning_rate": 6.094170201387089e-05, "loss": 0.3655, "step": 28690 }, { "gate_value": 0.3782776892185211, "icl_sequence_length": 76, "num_contexts": 3, "step": 28690 }, { "grad_norm": 45.62289810180664, "learning_rate": 6.084194511328556e-05, "loss": 0.3452, "step": 28700 }, { "gate_value": 0.3785243332386017, "icl_sequence_length": 90, "num_contexts": 3, "step": 28700 }, { "grad_norm": 36.22829818725586, "learning_rate": 6.0742249151390046e-05, "loss": 0.3604, "step": 28710 }, { "gate_value": 0.3787309229373932, "icl_sequence_length": 90, "num_contexts": 3, "step": 28710 }, { "grad_norm": 35.166202545166016, "learning_rate": 6.064261419632564e-05, "loss": 0.3781, "step": 28720 }, { "gate_value": 0.3788740932941437, "icl_sequence_length": 82, "num_contexts": 3, "step": 28720 }, { "grad_norm": 20.661985397338867, "learning_rate": 6.054304031619178e-05, "loss": 0.3627, "step": 28730 }, { "gate_value": 0.3791327178478241, "icl_sequence_length": 74, "num_contexts": 3, "step": 28730 }, { "grad_norm": 23.819217681884766, "learning_rate": 6.044352757904634e-05, "loss": 0.3651, "step": 28740 }, { "gate_value": 0.379306435585022, "icl_sequence_length": 72, "num_contexts": 3, "step": 28740 }, { "grad_norm": 276.656982421875, "learning_rate": 6.0344076052905324e-05, "loss": 0.3534, "step": 28750 }, { "gate_value": 0.37942585349082947, "icl_sequence_length": 86, "num_contexts": 3, "step": 28750 }, { "grad_norm": 19.64164924621582, "learning_rate": 6.024468580574299e-05, "loss": 0.3464, "step": 28760 }, { "gate_value": 0.3795641362667084, "icl_sequence_length": 92, "num_contexts": 3, "step": 28760 }, { "grad_norm": 38.871768951416016, "learning_rate": 6.014535690549156e-05, "loss": 0.3599, "step": 28770 }, { "gate_value": 0.3797716200351715, "icl_sequence_length": 84, "num_contexts": 3, "step": 28770 }, { "grad_norm": 26.767852783203125, "learning_rate": 6.004608942004135e-05, "loss": 0.3468, "step": 28780 }, { "gate_value": 0.37991610169410706, "icl_sequence_length": 82, "num_contexts": 3, "step": 28780 }, { "grad_norm": 17.371288299560547, "learning_rate": 5.994688341724081e-05, "loss": 0.3654, "step": 28790 }, { "gate_value": 0.3800429701805115, "icl_sequence_length": 84, "num_contexts": 3, "step": 28790 }, { "grad_norm": 44.015480041503906, "learning_rate": 5.9847738964896305e-05, "loss": 0.3749, "step": 28800 }, { "gate_value": 0.3801386058330536, "icl_sequence_length": 80, "num_contexts": 3, "step": 28800 }, { "grad_norm": 20.374393463134766, "learning_rate": 5.974865613077213e-05, "loss": 0.3558, "step": 28810 }, { "gate_value": 0.38034605979919434, "icl_sequence_length": 80, "num_contexts": 3, "step": 28810 }, { "grad_norm": 56.06901168823242, "learning_rate": 5.964963498259052e-05, "loss": 0.3534, "step": 28820 }, { "gate_value": 0.3804914653301239, "icl_sequence_length": 74, "num_contexts": 3, "step": 28820 }, { "grad_norm": 38.3012580871582, "learning_rate": 5.95506755880314e-05, "loss": 0.3617, "step": 28830 }, { "gate_value": 0.38055482506752014, "icl_sequence_length": 80, "num_contexts": 3, "step": 28830 }, { "grad_norm": 13.337339401245117, "learning_rate": 5.945177801473262e-05, "loss": 0.3702, "step": 28840 }, { "gate_value": 0.38060054183006287, "icl_sequence_length": 88, "num_contexts": 3, "step": 28840 }, { "grad_norm": 10.94632339477539, "learning_rate": 5.935294233028982e-05, "loss": 0.3725, "step": 28850 }, { "gate_value": 0.3807608187198639, "icl_sequence_length": 68, "num_contexts": 3, "step": 28850 }, { "grad_norm": 130.21417236328125, "learning_rate": 5.925416860225611e-05, "loss": 0.3602, "step": 28860 }, { "gate_value": 0.38078853487968445, "icl_sequence_length": 64, "num_contexts": 3, "step": 28860 }, { "grad_norm": 18.017894744873047, "learning_rate": 5.915545689814254e-05, "loss": 0.3474, "step": 28870 }, { "gate_value": 0.38082894682884216, "icl_sequence_length": 76, "num_contexts": 3, "step": 28870 }, { "grad_norm": 547.781494140625, "learning_rate": 5.905680728541752e-05, "loss": 0.3553, "step": 28880 }, { "gate_value": 0.3808595836162567, "icl_sequence_length": 90, "num_contexts": 3, "step": 28880 }, { "grad_norm": 23.99994468688965, "learning_rate": 5.895821983150718e-05, "loss": 0.3673, "step": 28890 }, { "gate_value": 0.38086777925491333, "icl_sequence_length": 80, "num_contexts": 3, "step": 28890 }, { "grad_norm": 29.337862014770508, "learning_rate": 5.8859694603795116e-05, "loss": 0.3717, "step": 28900 }, { "gate_value": 0.3809985816478729, "icl_sequence_length": 80, "num_contexts": 3, "step": 28900 }, { "grad_norm": 36.2806510925293, "learning_rate": 5.876123166962238e-05, "loss": 0.3684, "step": 28910 }, { "gate_value": 0.38116392493247986, "icl_sequence_length": 70, "num_contexts": 3, "step": 28910 }, { "grad_norm": 11.751944541931152, "learning_rate": 5.8662831096287515e-05, "loss": 0.3695, "step": 28920 }, { "gate_value": 0.38121894001960754, "icl_sequence_length": 80, "num_contexts": 3, "step": 28920 }, { "grad_norm": 16.78483772277832, "learning_rate": 5.8564492951046285e-05, "loss": 0.365, "step": 28930 }, { "gate_value": 0.38121622800827026, "icl_sequence_length": 82, "num_contexts": 3, "step": 28930 }, { "grad_norm": 11.2169828414917, "learning_rate": 5.846621730111199e-05, "loss": 0.366, "step": 28940 }, { "gate_value": 0.3812275826931, "icl_sequence_length": 88, "num_contexts": 3, "step": 28940 }, { "grad_norm": 14.724105834960938, "learning_rate": 5.836800421365502e-05, "loss": 0.3633, "step": 28950 }, { "gate_value": 0.3812483251094818, "icl_sequence_length": 72, "num_contexts": 3, "step": 28950 }, { "grad_norm": 21.191669464111328, "learning_rate": 5.826985375580312e-05, "loss": 0.3573, "step": 28960 }, { "gate_value": 0.38138577342033386, "icl_sequence_length": 82, "num_contexts": 3, "step": 28960 }, { "grad_norm": 20.08879852294922, "learning_rate": 5.8171765994641274e-05, "loss": 0.3494, "step": 28970 }, { "gate_value": 0.38147062063217163, "icl_sequence_length": 78, "num_contexts": 3, "step": 28970 }, { "grad_norm": 3683.35107421875, "learning_rate": 5.807374099721142e-05, "loss": 0.3737, "step": 28980 }, { "gate_value": 0.38168245553970337, "icl_sequence_length": 86, "num_contexts": 3, "step": 28980 }, { "grad_norm": 42.52603530883789, "learning_rate": 5.7975778830512784e-05, "loss": 0.3509, "step": 28990 }, { "gate_value": 0.38184642791748047, "icl_sequence_length": 94, "num_contexts": 3, "step": 28990 }, { "grad_norm": 34.126991271972656, "learning_rate": 5.7877879561501596e-05, "loss": 0.3557, "step": 29000 }, { "gate_value": 0.38198453187942505, "icl_sequence_length": 78, "num_contexts": 3, "step": 29000 }, { "grad_norm": 45.88672637939453, "learning_rate": 5.778004325709105e-05, "loss": 0.3735, "step": 29010 }, { "gate_value": 0.38210225105285645, "icl_sequence_length": 88, "num_contexts": 3, "step": 29010 }, { "grad_norm": 358.89947509765625, "learning_rate": 5.768226998415142e-05, "loss": 0.3599, "step": 29020 }, { "gate_value": 0.3822075128555298, "icl_sequence_length": 82, "num_contexts": 3, "step": 29020 }, { "grad_norm": 31.956314086914062, "learning_rate": 5.758455980950974e-05, "loss": 0.3661, "step": 29030 }, { "gate_value": 0.38228172063827515, "icl_sequence_length": 78, "num_contexts": 3, "step": 29030 }, { "grad_norm": 35.25785446166992, "learning_rate": 5.7486912799949956e-05, "loss": 0.3525, "step": 29040 }, { "gate_value": 0.3823954463005066, "icl_sequence_length": 86, "num_contexts": 3, "step": 29040 }, { "grad_norm": 57.509822845458984, "learning_rate": 5.738932902221294e-05, "loss": 0.3745, "step": 29050 }, { "gate_value": 0.38250795006752014, "icl_sequence_length": 94, "num_contexts": 3, "step": 29050 }, { "grad_norm": 16.257240295410156, "learning_rate": 5.7291808542996245e-05, "loss": 0.3593, "step": 29060 }, { "gate_value": 0.38265126943588257, "icl_sequence_length": 86, "num_contexts": 3, "step": 29060 }, { "grad_norm": 556.2877197265625, "learning_rate": 5.719435142895429e-05, "loss": 0.3626, "step": 29070 }, { "gate_value": 0.3827889859676361, "icl_sequence_length": 82, "num_contexts": 3, "step": 29070 }, { "grad_norm": 618.673583984375, "learning_rate": 5.709695774669799e-05, "loss": 0.3487, "step": 29080 }, { "gate_value": 0.3828449547290802, "icl_sequence_length": 84, "num_contexts": 3, "step": 29080 }, { "grad_norm": 479.97174072265625, "learning_rate": 5.699962756279504e-05, "loss": 0.3614, "step": 29090 }, { "gate_value": 0.38286563754081726, "icl_sequence_length": 78, "num_contexts": 3, "step": 29090 }, { "grad_norm": 36.629093170166016, "learning_rate": 5.690236094376969e-05, "loss": 0.3593, "step": 29100 }, { "gate_value": 0.38298630714416504, "icl_sequence_length": 60, "num_contexts": 3, "step": 29100 }, { "grad_norm": 30.256851196289062, "learning_rate": 5.68051579561028e-05, "loss": 0.3752, "step": 29110 }, { "gate_value": 0.38303476572036743, "icl_sequence_length": 68, "num_contexts": 3, "step": 29110 }, { "grad_norm": 31.483182907104492, "learning_rate": 5.670801866623171e-05, "loss": 0.3649, "step": 29120 }, { "gate_value": 0.38311877846717834, "icl_sequence_length": 58, "num_contexts": 3, "step": 29120 }, { "grad_norm": 9.485666275024414, "learning_rate": 5.661094314055018e-05, "loss": 0.3615, "step": 29130 }, { "gate_value": 0.3831852078437805, "icl_sequence_length": 84, "num_contexts": 3, "step": 29130 }, { "grad_norm": 12.52168083190918, "learning_rate": 5.651393144540834e-05, "loss": 0.3737, "step": 29140 }, { "gate_value": 0.3834601640701294, "icl_sequence_length": 58, "num_contexts": 3, "step": 29140 }, { "grad_norm": 245.93165588378906, "learning_rate": 5.641698364711286e-05, "loss": 0.3661, "step": 29150 }, { "gate_value": 0.3837195336818695, "icl_sequence_length": 78, "num_contexts": 3, "step": 29150 }, { "grad_norm": 12.20414924621582, "learning_rate": 5.632009981192661e-05, "loss": 0.375, "step": 29160 }, { "gate_value": 0.3836742043495178, "icl_sequence_length": 76, "num_contexts": 3, "step": 29160 }, { "grad_norm": 117.4782485961914, "learning_rate": 5.6223280006068835e-05, "loss": 0.3521, "step": 29170 }, { "gate_value": 0.3838871717453003, "icl_sequence_length": 78, "num_contexts": 3, "step": 29170 }, { "grad_norm": 23.043319702148438, "learning_rate": 5.612652429571487e-05, "loss": 0.3451, "step": 29180 }, { "gate_value": 0.38392624258995056, "icl_sequence_length": 82, "num_contexts": 3, "step": 29180 }, { "grad_norm": 22.326448440551758, "learning_rate": 5.6029832746996375e-05, "loss": 0.3823, "step": 29190 }, { "gate_value": 0.3842223286628723, "icl_sequence_length": 90, "num_contexts": 3, "step": 29190 }, { "grad_norm": 42.24995040893555, "learning_rate": 5.593320542600111e-05, "loss": 0.3669, "step": 29200 }, { "gate_value": 0.38453155755996704, "icl_sequence_length": 92, "num_contexts": 3, "step": 29200 }, { "grad_norm": 14.570834159851074, "learning_rate": 5.583664239877294e-05, "loss": 0.3474, "step": 29210 }, { "gate_value": 0.3847702741622925, "icl_sequence_length": 76, "num_contexts": 3, "step": 29210 }, { "grad_norm": 14.977248191833496, "learning_rate": 5.574014373131184e-05, "loss": 0.3551, "step": 29220 }, { "gate_value": 0.3847298324108124, "icl_sequence_length": 78, "num_contexts": 3, "step": 29220 }, { "grad_norm": 18.472740173339844, "learning_rate": 5.5643709489573675e-05, "loss": 0.3746, "step": 29230 }, { "gate_value": 0.3847012519836426, "icl_sequence_length": 74, "num_contexts": 3, "step": 29230 }, { "grad_norm": 23.11821937561035, "learning_rate": 5.554733973947029e-05, "loss": 0.374, "step": 29240 }, { "gate_value": 0.38457468152046204, "icl_sequence_length": 70, "num_contexts": 3, "step": 29240 }, { "grad_norm": 19.69847869873047, "learning_rate": 5.545103454686957e-05, "loss": 0.3472, "step": 29250 }, { "gate_value": 0.38453060388565063, "icl_sequence_length": 86, "num_contexts": 3, "step": 29250 }, { "grad_norm": 11.556638717651367, "learning_rate": 5.535479397759519e-05, "loss": 0.3721, "step": 29260 }, { "gate_value": 0.38458558917045593, "icl_sequence_length": 96, "num_contexts": 3, "step": 29260 }, { "grad_norm": 15350.7255859375, "learning_rate": 5.5258618097426735e-05, "loss": 0.3611, "step": 29270 }, { "gate_value": 0.3847283720970154, "icl_sequence_length": 70, "num_contexts": 3, "step": 29270 }, { "grad_norm": 26.339685440063477, "learning_rate": 5.516250697209938e-05, "loss": 0.3536, "step": 29280 }, { "gate_value": 0.38488730788230896, "icl_sequence_length": 64, "num_contexts": 3, "step": 29280 }, { "grad_norm": 34.276817321777344, "learning_rate": 5.5066460667304254e-05, "loss": 0.3833, "step": 29290 }, { "gate_value": 0.3849720358848572, "icl_sequence_length": 90, "num_contexts": 3, "step": 29290 }, { "grad_norm": 21.733354568481445, "learning_rate": 5.49704792486881e-05, "loss": 0.3801, "step": 29300 }, { "gate_value": 0.3850296437740326, "icl_sequence_length": 82, "num_contexts": 3, "step": 29300 }, { "grad_norm": 13.726507186889648, "learning_rate": 5.4874562781853356e-05, "loss": 0.3572, "step": 29310 }, { "gate_value": 0.3850591778755188, "icl_sequence_length": 74, "num_contexts": 3, "step": 29310 }, { "grad_norm": 5749.76025390625, "learning_rate": 5.477871133235791e-05, "loss": 0.3633, "step": 29320 }, { "gate_value": 0.38509833812713623, "icl_sequence_length": 72, "num_contexts": 3, "step": 29320 }, { "grad_norm": 772.3057861328125, "learning_rate": 5.468292496571545e-05, "loss": 0.3667, "step": 29330 }, { "gate_value": 0.38516709208488464, "icl_sequence_length": 92, "num_contexts": 3, "step": 29330 }, { "grad_norm": 20.018020629882812, "learning_rate": 5.458720374739493e-05, "loss": 0.3831, "step": 29340 }, { "gate_value": 0.38528576493263245, "icl_sequence_length": 66, "num_contexts": 3, "step": 29340 }, { "grad_norm": 13.0118989944458, "learning_rate": 5.449154774282096e-05, "loss": 0.3817, "step": 29350 }, { "gate_value": 0.38539865612983704, "icl_sequence_length": 82, "num_contexts": 3, "step": 29350 }, { "grad_norm": 18.307941436767578, "learning_rate": 5.4395957017373514e-05, "loss": 0.3649, "step": 29360 }, { "gate_value": 0.38549789786338806, "icl_sequence_length": 82, "num_contexts": 3, "step": 29360 }, { "grad_norm": 3333.202880859375, "learning_rate": 5.430043163638801e-05, "loss": 0.3654, "step": 29370 }, { "gate_value": 0.3855592608451843, "icl_sequence_length": 84, "num_contexts": 3, "step": 29370 }, { "grad_norm": 35.40414047241211, "learning_rate": 5.420497166515503e-05, "loss": 0.3538, "step": 29380 }, { "gate_value": 0.38564735651016235, "icl_sequence_length": 78, "num_contexts": 3, "step": 29380 }, { "grad_norm": 2308.856201171875, "learning_rate": 5.410957716892065e-05, "loss": 0.3699, "step": 29390 }, { "gate_value": 0.3856613337993622, "icl_sequence_length": 82, "num_contexts": 3, "step": 29390 }, { "grad_norm": 15.066496849060059, "learning_rate": 5.4014248212886044e-05, "loss": 0.3832, "step": 29400 }, { "gate_value": 0.3857213258743286, "icl_sequence_length": 88, "num_contexts": 3, "step": 29400 }, { "grad_norm": 244.73097229003906, "learning_rate": 5.391898486220778e-05, "loss": 0.3729, "step": 29410 }, { "gate_value": 0.38576260209083557, "icl_sequence_length": 78, "num_contexts": 3, "step": 29410 }, { "grad_norm": 16.249357223510742, "learning_rate": 5.38237871819973e-05, "loss": 0.3749, "step": 29420 }, { "gate_value": 0.38585683703422546, "icl_sequence_length": 76, "num_contexts": 3, "step": 29420 }, { "grad_norm": 82.86548614501953, "learning_rate": 5.3728655237321443e-05, "loss": 0.3643, "step": 29430 }, { "gate_value": 0.38592880964279175, "icl_sequence_length": 82, "num_contexts": 3, "step": 29430 }, { "grad_norm": 18.706005096435547, "learning_rate": 5.3633589093201906e-05, "loss": 0.3716, "step": 29440 }, { "gate_value": 0.3860245943069458, "icl_sequence_length": 80, "num_contexts": 3, "step": 29440 }, { "grad_norm": 14.105803489685059, "learning_rate": 5.353858881461555e-05, "loss": 0.3493, "step": 29450 }, { "gate_value": 0.3861307203769684, "icl_sequence_length": 80, "num_contexts": 3, "step": 29450 }, { "grad_norm": 53.31535720825195, "learning_rate": 5.344365446649414e-05, "loss": 0.383, "step": 29460 }, { "gate_value": 0.38630276918411255, "icl_sequence_length": 76, "num_contexts": 3, "step": 29460 }, { "grad_norm": 17.413074493408203, "learning_rate": 5.33487861137245e-05, "loss": 0.359, "step": 29470 }, { "gate_value": 0.3865601420402527, "icl_sequence_length": 88, "num_contexts": 3, "step": 29470 }, { "grad_norm": 262.77484130859375, "learning_rate": 5.3253983821148124e-05, "loss": 0.3752, "step": 29480 }, { "gate_value": 0.3866910934448242, "icl_sequence_length": 84, "num_contexts": 3, "step": 29480 }, { "grad_norm": 16.89266586303711, "learning_rate": 5.3159247653561555e-05, "loss": 0.3593, "step": 29490 }, { "gate_value": 0.3867685794830322, "icl_sequence_length": 84, "num_contexts": 3, "step": 29490 }, { "grad_norm": 14.136067390441895, "learning_rate": 5.30645776757161e-05, "loss": 0.3647, "step": 29500 }, { "gate_value": 0.3868660032749176, "icl_sequence_length": 76, "num_contexts": 3, "step": 29500 }, { "grad_norm": 14.746859550476074, "learning_rate": 5.2969973952317715e-05, "loss": 0.3463, "step": 29510 }, { "gate_value": 0.38711073994636536, "icl_sequence_length": 88, "num_contexts": 3, "step": 29510 }, { "grad_norm": 1038.1717529296875, "learning_rate": 5.28754365480272e-05, "loss": 0.3662, "step": 29520 }, { "gate_value": 0.3872504234313965, "icl_sequence_length": 78, "num_contexts": 3, "step": 29520 }, { "grad_norm": 6.718716621398926, "learning_rate": 5.278096552746001e-05, "loss": 0.3723, "step": 29530 }, { "gate_value": 0.3872128129005432, "icl_sequence_length": 88, "num_contexts": 3, "step": 29530 }, { "grad_norm": 27.14375877380371, "learning_rate": 5.268656095518613e-05, "loss": 0.3776, "step": 29540 }, { "gate_value": 0.3871647119522095, "icl_sequence_length": 86, "num_contexts": 3, "step": 29540 }, { "grad_norm": 24.749032974243164, "learning_rate": 5.25922228957302e-05, "loss": 0.3672, "step": 29550 }, { "gate_value": 0.3872618079185486, "icl_sequence_length": 88, "num_contexts": 3, "step": 29550 }, { "grad_norm": 21.76228141784668, "learning_rate": 5.249795141357145e-05, "loss": 0.371, "step": 29560 }, { "gate_value": 0.3873235583305359, "icl_sequence_length": 88, "num_contexts": 3, "step": 29560 }, { "grad_norm": 45.05788803100586, "learning_rate": 5.240374657314354e-05, "loss": 0.3514, "step": 29570 }, { "gate_value": 0.387417197227478, "icl_sequence_length": 84, "num_contexts": 3, "step": 29570 }, { "grad_norm": 19.219005584716797, "learning_rate": 5.2309608438834536e-05, "loss": 0.3775, "step": 29580 }, { "gate_value": 0.3874436318874359, "icl_sequence_length": 76, "num_contexts": 3, "step": 29580 }, { "grad_norm": 12.430703163146973, "learning_rate": 5.221553707498706e-05, "loss": 0.3608, "step": 29590 }, { "gate_value": 0.38745900988578796, "icl_sequence_length": 92, "num_contexts": 3, "step": 29590 }, { "grad_norm": 3910.81787109375, "learning_rate": 5.212153254589787e-05, "loss": 0.3574, "step": 29600 }, { "gate_value": 0.38752108812332153, "icl_sequence_length": 78, "num_contexts": 3, "step": 29600 }, { "grad_norm": 63.541481018066406, "learning_rate": 5.2027594915818263e-05, "loss": 0.3794, "step": 29610 }, { "gate_value": 0.3875715732574463, "icl_sequence_length": 76, "num_contexts": 3, "step": 29610 }, { "grad_norm": 47.07277297973633, "learning_rate": 5.193372424895368e-05, "loss": 0.367, "step": 29620 }, { "gate_value": 0.38757985830307007, "icl_sequence_length": 66, "num_contexts": 3, "step": 29620 }, { "grad_norm": 12.926732063293457, "learning_rate": 5.1839920609463936e-05, "loss": 0.3679, "step": 29630 }, { "gate_value": 0.38763460516929626, "icl_sequence_length": 80, "num_contexts": 3, "step": 29630 }, { "grad_norm": 28.26099395751953, "learning_rate": 5.174618406146282e-05, "loss": 0.3602, "step": 29640 }, { "gate_value": 0.387708455324173, "icl_sequence_length": 92, "num_contexts": 3, "step": 29640 }, { "grad_norm": 43.7656364440918, "learning_rate": 5.16525146690184e-05, "loss": 0.3784, "step": 29650 }, { "gate_value": 0.38778313994407654, "icl_sequence_length": 60, "num_contexts": 3, "step": 29650 }, { "grad_norm": 60.07194900512695, "learning_rate": 5.1558912496152854e-05, "loss": 0.349, "step": 29660 }, { "gate_value": 0.3878571689128876, "icl_sequence_length": 90, "num_contexts": 3, "step": 29660 }, { "grad_norm": 122.97605895996094, "learning_rate": 5.146537760684242e-05, "loss": 0.3726, "step": 29670 }, { "gate_value": 0.387951523065567, "icl_sequence_length": 78, "num_contexts": 3, "step": 29670 }, { "grad_norm": 20.661191940307617, "learning_rate": 5.13719100650172e-05, "loss": 0.3778, "step": 29680 }, { "gate_value": 0.388042151927948, "icl_sequence_length": 70, "num_contexts": 3, "step": 29680 }, { "grad_norm": 25.058385848999023, "learning_rate": 5.127850993456151e-05, "loss": 0.3575, "step": 29690 }, { "gate_value": 0.38809892535209656, "icl_sequence_length": 92, "num_contexts": 3, "step": 29690 }, { "grad_norm": 23.43122100830078, "learning_rate": 5.118517727931333e-05, "loss": 0.3552, "step": 29700 }, { "gate_value": 0.388154000043869, "icl_sequence_length": 76, "num_contexts": 3, "step": 29700 }, { "grad_norm": 387.5428161621094, "learning_rate": 5.1091912163064736e-05, "loss": 0.3781, "step": 29710 }, { "gate_value": 0.3881308436393738, "icl_sequence_length": 86, "num_contexts": 3, "step": 29710 }, { "grad_norm": 105.68052673339844, "learning_rate": 5.099871464956151e-05, "loss": 0.3556, "step": 29720 }, { "gate_value": 0.3881755471229553, "icl_sequence_length": 88, "num_contexts": 3, "step": 29720 }, { "grad_norm": 24.62873077392578, "learning_rate": 5.090558480250336e-05, "loss": 0.3699, "step": 29730 }, { "gate_value": 0.3882880210876465, "icl_sequence_length": 80, "num_contexts": 3, "step": 29730 }, { "grad_norm": 454.452880859375, "learning_rate": 5.081252268554352e-05, "loss": 0.3692, "step": 29740 }, { "gate_value": 0.38832375407218933, "icl_sequence_length": 76, "num_contexts": 3, "step": 29740 }, { "grad_norm": 42.048065185546875, "learning_rate": 5.0719528362289156e-05, "loss": 0.3536, "step": 29750 }, { "gate_value": 0.38841527700424194, "icl_sequence_length": 86, "num_contexts": 3, "step": 29750 }, { "grad_norm": 140.90931701660156, "learning_rate": 5.062660189630101e-05, "loss": 0.3695, "step": 29760 }, { "gate_value": 0.3884676992893219, "icl_sequence_length": 86, "num_contexts": 3, "step": 29760 }, { "grad_norm": 65.93880462646484, "learning_rate": 5.053374335109346e-05, "loss": 0.3732, "step": 29770 }, { "gate_value": 0.38848721981048584, "icl_sequence_length": 86, "num_contexts": 3, "step": 29770 }, { "grad_norm": 823.0439453125, "learning_rate": 5.0440952790134426e-05, "loss": 0.3649, "step": 29780 }, { "gate_value": 0.3884487748146057, "icl_sequence_length": 76, "num_contexts": 3, "step": 29780 }, { "grad_norm": 339.5392761230469, "learning_rate": 5.034823027684533e-05, "loss": 0.346, "step": 29790 }, { "gate_value": 0.388444185256958, "icl_sequence_length": 72, "num_contexts": 3, "step": 29790 }, { "grad_norm": 92.40780639648438, "learning_rate": 5.025557587460118e-05, "loss": 0.3621, "step": 29800 }, { "gate_value": 0.38856759667396545, "icl_sequence_length": 68, "num_contexts": 3, "step": 29800 }, { "grad_norm": 23.564212799072266, "learning_rate": 5.016298964673038e-05, "loss": 0.366, "step": 29810 }, { "gate_value": 0.38881248235702515, "icl_sequence_length": 88, "num_contexts": 3, "step": 29810 }, { "grad_norm": 30.643938064575195, "learning_rate": 5.007047165651474e-05, "loss": 0.3718, "step": 29820 }, { "gate_value": 0.38913866877555847, "icl_sequence_length": 82, "num_contexts": 3, "step": 29820 }, { "grad_norm": 2424.154052734375, "learning_rate": 4.997802196718951e-05, "loss": 0.3672, "step": 29830 }, { "gate_value": 0.3892226219177246, "icl_sequence_length": 78, "num_contexts": 3, "step": 29830 }, { "grad_norm": 80.18303680419922, "learning_rate": 4.988564064194306e-05, "loss": 0.3458, "step": 29840 }, { "gate_value": 0.3894360065460205, "icl_sequence_length": 74, "num_contexts": 3, "step": 29840 }, { "grad_norm": 21.752769470214844, "learning_rate": 4.979332774391721e-05, "loss": 0.367, "step": 29850 }, { "gate_value": 0.3895873725414276, "icl_sequence_length": 88, "num_contexts": 3, "step": 29850 }, { "grad_norm": 19.119279861450195, "learning_rate": 4.970108333620696e-05, "loss": 0.37, "step": 29860 }, { "gate_value": 0.3896397650241852, "icl_sequence_length": 84, "num_contexts": 3, "step": 29860 }, { "grad_norm": 20.726579666137695, "learning_rate": 4.960890748186052e-05, "loss": 0.3599, "step": 29870 }, { "gate_value": 0.3897121250629425, "icl_sequence_length": 90, "num_contexts": 3, "step": 29870 }, { "grad_norm": 24.870521545410156, "learning_rate": 4.95168002438792e-05, "loss": 0.3601, "step": 29880 }, { "gate_value": 0.3897797763347626, "icl_sequence_length": 84, "num_contexts": 3, "step": 29880 }, { "grad_norm": 73.91206359863281, "learning_rate": 4.9424761685217353e-05, "loss": 0.3679, "step": 29890 }, { "gate_value": 0.38991779088974, "icl_sequence_length": 72, "num_contexts": 3, "step": 29890 }, { "grad_norm": 52.439510345458984, "learning_rate": 4.933279186878255e-05, "loss": 0.3735, "step": 29900 }, { "gate_value": 0.3900202810764313, "icl_sequence_length": 80, "num_contexts": 3, "step": 29900 }, { "grad_norm": 47.01316833496094, "learning_rate": 4.924089085743524e-05, "loss": 0.3725, "step": 29910 }, { "gate_value": 0.3900734782218933, "icl_sequence_length": 90, "num_contexts": 3, "step": 29910 }, { "grad_norm": 90.96566009521484, "learning_rate": 4.9149058713988945e-05, "loss": 0.3616, "step": 29920 }, { "gate_value": 0.3901205062866211, "icl_sequence_length": 90, "num_contexts": 3, "step": 29920 }, { "grad_norm": 8.128710746765137, "learning_rate": 4.9057295501210105e-05, "loss": 0.3827, "step": 29930 }, { "gate_value": 0.39021652936935425, "icl_sequence_length": 94, "num_contexts": 3, "step": 29930 }, { "grad_norm": 208.60455322265625, "learning_rate": 4.8965601281817884e-05, "loss": 0.3523, "step": 29940 }, { "gate_value": 0.39037322998046875, "icl_sequence_length": 74, "num_contexts": 3, "step": 29940 }, { "grad_norm": 47.41590118408203, "learning_rate": 4.88739761184845e-05, "loss": 0.3634, "step": 29950 }, { "gate_value": 0.39055919647216797, "icl_sequence_length": 84, "num_contexts": 3, "step": 29950 }, { "grad_norm": 32.718605041503906, "learning_rate": 4.87824200738349e-05, "loss": 0.3642, "step": 29960 }, { "gate_value": 0.39067938923835754, "icl_sequence_length": 82, "num_contexts": 3, "step": 29960 }, { "grad_norm": 32.24725341796875, "learning_rate": 4.869093321044678e-05, "loss": 0.3687, "step": 29970 }, { "gate_value": 0.3905954658985138, "icl_sequence_length": 70, "num_contexts": 3, "step": 29970 }, { "grad_norm": 27.35053062438965, "learning_rate": 4.859951559085053e-05, "loss": 0.3506, "step": 29980 }, { "gate_value": 0.3906969130039215, "icl_sequence_length": 78, "num_contexts": 3, "step": 29980 }, { "grad_norm": 18.87845230102539, "learning_rate": 4.850816727752917e-05, "loss": 0.3724, "step": 29990 }, { "gate_value": 0.3908305764198303, "icl_sequence_length": 76, "num_contexts": 3, "step": 29990 }, { "grad_norm": 13.033040046691895, "learning_rate": 4.8416888332918474e-05, "loss": 0.3616, "step": 30000 }, { "gate_value": 0.3909852206707001, "icl_sequence_length": 74, "num_contexts": 3, "step": 30000 }, { "grad_norm": 10490.21875, "learning_rate": 4.832567881940672e-05, "loss": 0.3711, "step": 30010 }, { "gate_value": 0.3910848796367645, "icl_sequence_length": 90, "num_contexts": 3, "step": 30010 }, { "grad_norm": 153.7618408203125, "learning_rate": 4.823453879933477e-05, "loss": 0.3687, "step": 30020 }, { "gate_value": 0.3911765217781067, "icl_sequence_length": 68, "num_contexts": 3, "step": 30020 }, { "grad_norm": 28.18154525756836, "learning_rate": 4.814346833499601e-05, "loss": 0.3604, "step": 30030 }, { "gate_value": 0.39127495884895325, "icl_sequence_length": 90, "num_contexts": 3, "step": 30030 }, { "grad_norm": 16.653223037719727, "learning_rate": 4.8052467488636134e-05, "loss": 0.3747, "step": 30040 }, { "gate_value": 0.3913731276988983, "icl_sequence_length": 82, "num_contexts": 3, "step": 30040 }, { "grad_norm": 17.24098777770996, "learning_rate": 4.796153632245343e-05, "loss": 0.3581, "step": 30050 }, { "gate_value": 0.39150476455688477, "icl_sequence_length": 80, "num_contexts": 3, "step": 30050 }, { "grad_norm": 16.153825759887695, "learning_rate": 4.787067489859854e-05, "loss": 0.3816, "step": 30060 }, { "gate_value": 0.39168140292167664, "icl_sequence_length": 70, "num_contexts": 3, "step": 30060 }, { "grad_norm": 22.805763244628906, "learning_rate": 4.777988327917427e-05, "loss": 0.3484, "step": 30070 }, { "gate_value": 0.39185601472854614, "icl_sequence_length": 86, "num_contexts": 3, "step": 30070 }, { "grad_norm": 14.978759765625, "learning_rate": 4.768916152623595e-05, "loss": 0.3715, "step": 30080 }, { "gate_value": 0.39189791679382324, "icl_sequence_length": 88, "num_contexts": 3, "step": 30080 }, { "grad_norm": 40.283058166503906, "learning_rate": 4.759850970179096e-05, "loss": 0.3555, "step": 30090 }, { "gate_value": 0.3919513523578644, "icl_sequence_length": 50, "num_contexts": 3, "step": 30090 }, { "grad_norm": 63.80585861206055, "learning_rate": 4.7507927867799004e-05, "loss": 0.3506, "step": 30100 }, { "gate_value": 0.3920625150203705, "icl_sequence_length": 88, "num_contexts": 3, "step": 30100 }, { "grad_norm": 11.960375785827637, "learning_rate": 4.741741608617188e-05, "loss": 0.3532, "step": 30110 }, { "gate_value": 0.3922727406024933, "icl_sequence_length": 72, "num_contexts": 3, "step": 30110 }, { "grad_norm": 56.46166229248047, "learning_rate": 4.732697441877359e-05, "loss": 0.3511, "step": 30120 }, { "gate_value": 0.39238861203193665, "icl_sequence_length": 80, "num_contexts": 3, "step": 30120 }, { "grad_norm": 179.3865203857422, "learning_rate": 4.723660292742017e-05, "loss": 0.3617, "step": 30130 }, { "gate_value": 0.3924447000026703, "icl_sequence_length": 92, "num_contexts": 3, "step": 30130 }, { "grad_norm": 44.34315490722656, "learning_rate": 4.7146301673879615e-05, "loss": 0.3425, "step": 30140 }, { "gate_value": 0.3926564157009125, "icl_sequence_length": 76, "num_contexts": 3, "step": 30140 }, { "grad_norm": 237.02188110351562, "learning_rate": 4.705607071987204e-05, "loss": 0.3739, "step": 30150 }, { "gate_value": 0.392910361289978, "icl_sequence_length": 86, "num_contexts": 3, "step": 30150 }, { "grad_norm": 21.830909729003906, "learning_rate": 4.6965910127069394e-05, "loss": 0.3441, "step": 30160 }, { "gate_value": 0.39306795597076416, "icl_sequence_length": 74, "num_contexts": 3, "step": 30160 }, { "grad_norm": 32.624427795410156, "learning_rate": 4.687581995709562e-05, "loss": 0.3646, "step": 30170 }, { "gate_value": 0.3931129276752472, "icl_sequence_length": 90, "num_contexts": 3, "step": 30170 }, { "grad_norm": 17.580211639404297, "learning_rate": 4.678580027152655e-05, "loss": 0.3533, "step": 30180 }, { "gate_value": 0.3931795060634613, "icl_sequence_length": 92, "num_contexts": 3, "step": 30180 }, { "grad_norm": 84.48542022705078, "learning_rate": 4.66958511318897e-05, "loss": 0.3736, "step": 30190 }, { "gate_value": 0.3933153450489044, "icl_sequence_length": 86, "num_contexts": 3, "step": 30190 }, { "grad_norm": 23.2664794921875, "learning_rate": 4.660597259966448e-05, "loss": 0.3454, "step": 30200 }, { "gate_value": 0.3933710753917694, "icl_sequence_length": 72, "num_contexts": 3, "step": 30200 }, { "grad_norm": 28.477397918701172, "learning_rate": 4.6516164736282056e-05, "loss": 0.3743, "step": 30210 }, { "gate_value": 0.3934413492679596, "icl_sequence_length": 84, "num_contexts": 3, "step": 30210 }, { "grad_norm": 128.1862335205078, "learning_rate": 4.642642760312524e-05, "loss": 0.3719, "step": 30220 }, { "gate_value": 0.39349788427352905, "icl_sequence_length": 86, "num_contexts": 3, "step": 30220 }, { "grad_norm": 14.590642929077148, "learning_rate": 4.633676126152858e-05, "loss": 0.3627, "step": 30230 }, { "gate_value": 0.39354342222213745, "icl_sequence_length": 82, "num_contexts": 3, "step": 30230 }, { "grad_norm": 18.493053436279297, "learning_rate": 4.624716577277803e-05, "loss": 0.3571, "step": 30240 }, { "gate_value": 0.3936373293399811, "icl_sequence_length": 70, "num_contexts": 3, "step": 30240 }, { "grad_norm": 24.06255531311035, "learning_rate": 4.615764119811141e-05, "loss": 0.3757, "step": 30250 }, { "gate_value": 0.3937382102012634, "icl_sequence_length": 88, "num_contexts": 3, "step": 30250 }, { "grad_norm": 30.321077346801758, "learning_rate": 4.606818759871782e-05, "loss": 0.3617, "step": 30260 }, { "gate_value": 0.3939821124076843, "icl_sequence_length": 88, "num_contexts": 3, "step": 30260 }, { "grad_norm": 24.308691024780273, "learning_rate": 4.597880503573797e-05, "loss": 0.3588, "step": 30270 }, { "gate_value": 0.3941650092601776, "icl_sequence_length": 82, "num_contexts": 3, "step": 30270 }, { "grad_norm": 12.186562538146973, "learning_rate": 4.5889493570264074e-05, "loss": 0.3683, "step": 30280 }, { "gate_value": 0.39424392580986023, "icl_sequence_length": 84, "num_contexts": 3, "step": 30280 }, { "grad_norm": 18.442607879638672, "learning_rate": 4.580025326333956e-05, "loss": 0.3561, "step": 30290 }, { "gate_value": 0.3942907452583313, "icl_sequence_length": 92, "num_contexts": 3, "step": 30290 }, { "grad_norm": 28.805355072021484, "learning_rate": 4.571108417595942e-05, "loss": 0.3622, "step": 30300 }, { "gate_value": 0.39430415630340576, "icl_sequence_length": 74, "num_contexts": 3, "step": 30300 }, { "grad_norm": 12.717275619506836, "learning_rate": 4.562198636906983e-05, "loss": 0.3833, "step": 30310 }, { "gate_value": 0.3942856788635254, "icl_sequence_length": 74, "num_contexts": 3, "step": 30310 }, { "grad_norm": 15.916489601135254, "learning_rate": 4.553295990356836e-05, "loss": 0.3707, "step": 30320 }, { "gate_value": 0.39437082409858704, "icl_sequence_length": 74, "num_contexts": 3, "step": 30320 }, { "grad_norm": 889.87060546875, "learning_rate": 4.5444004840303757e-05, "loss": 0.3611, "step": 30330 }, { "gate_value": 0.3944534361362457, "icl_sequence_length": 84, "num_contexts": 3, "step": 30330 }, { "grad_norm": 38.44389343261719, "learning_rate": 4.5355121240075944e-05, "loss": 0.3608, "step": 30340 }, { "gate_value": 0.3944818675518036, "icl_sequence_length": 78, "num_contexts": 3, "step": 30340 }, { "grad_norm": 26.342105865478516, "learning_rate": 4.526630916363597e-05, "loss": 0.3827, "step": 30350 }, { "gate_value": 0.3946109116077423, "icl_sequence_length": 84, "num_contexts": 3, "step": 30350 }, { "grad_norm": 99.67547607421875, "learning_rate": 4.517756867168612e-05, "loss": 0.3616, "step": 30360 }, { "gate_value": 0.3946753144264221, "icl_sequence_length": 64, "num_contexts": 3, "step": 30360 }, { "grad_norm": 134.35885620117188, "learning_rate": 4.508889982487965e-05, "loss": 0.3555, "step": 30370 }, { "gate_value": 0.3948347568511963, "icl_sequence_length": 78, "num_contexts": 3, "step": 30370 }, { "grad_norm": 25.946739196777344, "learning_rate": 4.500030268382096e-05, "loss": 0.3771, "step": 30380 }, { "gate_value": 0.3949762284755707, "icl_sequence_length": 74, "num_contexts": 3, "step": 30380 }, { "grad_norm": 23.79796028137207, "learning_rate": 4.4911777309065236e-05, "loss": 0.3443, "step": 30390 }, { "gate_value": 0.3950321674346924, "icl_sequence_length": 82, "num_contexts": 3, "step": 30390 }, { "grad_norm": 26.93970489501953, "learning_rate": 4.4823323761118807e-05, "loss": 0.3567, "step": 30400 }, { "gate_value": 0.39504531025886536, "icl_sequence_length": 82, "num_contexts": 3, "step": 30400 }, { "grad_norm": 33.7324104309082, "learning_rate": 4.4734942100438835e-05, "loss": 0.347, "step": 30410 }, { "gate_value": 0.39515039324760437, "icl_sequence_length": 74, "num_contexts": 3, "step": 30410 }, { "grad_norm": 35.20787811279297, "learning_rate": 4.464663238743333e-05, "loss": 0.3602, "step": 30420 }, { "gate_value": 0.3952362537384033, "icl_sequence_length": 90, "num_contexts": 3, "step": 30420 }, { "grad_norm": 18.598276138305664, "learning_rate": 4.4558394682461236e-05, "loss": 0.3505, "step": 30430 }, { "gate_value": 0.39534491300582886, "icl_sequence_length": 82, "num_contexts": 3, "step": 30430 }, { "grad_norm": 17.984878540039062, "learning_rate": 4.44702290458321e-05, "loss": 0.3656, "step": 30440 }, { "gate_value": 0.3954671025276184, "icl_sequence_length": 62, "num_contexts": 3, "step": 30440 }, { "grad_norm": 18.157207489013672, "learning_rate": 4.438213553780628e-05, "loss": 0.3556, "step": 30450 }, { "gate_value": 0.3956170380115509, "icl_sequence_length": 88, "num_contexts": 3, "step": 30450 }, { "grad_norm": 2112.7578125, "learning_rate": 4.429411421859492e-05, "loss": 0.384, "step": 30460 }, { "gate_value": 0.39561453461647034, "icl_sequence_length": 94, "num_contexts": 3, "step": 30460 }, { "grad_norm": 30.692434310913086, "learning_rate": 4.420616514835973e-05, "loss": 0.3607, "step": 30470 }, { "gate_value": 0.39567017555236816, "icl_sequence_length": 88, "num_contexts": 3, "step": 30470 }, { "grad_norm": 115.94886016845703, "learning_rate": 4.411828838721313e-05, "loss": 0.3506, "step": 30480 }, { "gate_value": 0.3958011567592621, "icl_sequence_length": 84, "num_contexts": 3, "step": 30480 }, { "grad_norm": 60.980934143066406, "learning_rate": 4.403048399521798e-05, "loss": 0.3457, "step": 30490 }, { "gate_value": 0.3959410488605499, "icl_sequence_length": 90, "num_contexts": 3, "step": 30490 }, { "grad_norm": 22.16244125366211, "learning_rate": 4.394275203238778e-05, "loss": 0.3448, "step": 30500 }, { "gate_value": 0.39600443840026855, "icl_sequence_length": 66, "num_contexts": 3, "step": 30500 }, { "grad_norm": 32.711326599121094, "learning_rate": 4.38550925586865e-05, "loss": 0.3605, "step": 30510 }, { "gate_value": 0.3960520625114441, "icl_sequence_length": 84, "num_contexts": 3, "step": 30510 }, { "grad_norm": 28.291837692260742, "learning_rate": 4.3767505634028614e-05, "loss": 0.3721, "step": 30520 }, { "gate_value": 0.39630264043807983, "icl_sequence_length": 86, "num_contexts": 3, "step": 30520 }, { "grad_norm": 199.38694763183594, "learning_rate": 4.3679991318278875e-05, "loss": 0.3589, "step": 30530 }, { "gate_value": 0.39648500084877014, "icl_sequence_length": 74, "num_contexts": 3, "step": 30530 }, { "grad_norm": 33.74460220336914, "learning_rate": 4.3592549671252584e-05, "loss": 0.3695, "step": 30540 }, { "gate_value": 0.3966231048107147, "icl_sequence_length": 74, "num_contexts": 3, "step": 30540 }, { "grad_norm": 24.554481506347656, "learning_rate": 4.350518075271518e-05, "loss": 0.3546, "step": 30550 }, { "gate_value": 0.39659935235977173, "icl_sequence_length": 78, "num_contexts": 3, "step": 30550 }, { "grad_norm": 100.24298095703125, "learning_rate": 4.3417884622382536e-05, "loss": 0.355, "step": 30560 }, { "gate_value": 0.3965737521648407, "icl_sequence_length": 74, "num_contexts": 3, "step": 30560 }, { "grad_norm": 68.57526397705078, "learning_rate": 4.333066133992075e-05, "loss": 0.3643, "step": 30570 }, { "gate_value": 0.3965604901313782, "icl_sequence_length": 80, "num_contexts": 3, "step": 30570 }, { "grad_norm": 14.827048301696777, "learning_rate": 4.32435109649461e-05, "loss": 0.3357, "step": 30580 }, { "gate_value": 0.39664265513420105, "icl_sequence_length": 82, "num_contexts": 3, "step": 30580 }, { "grad_norm": 42.24051284790039, "learning_rate": 4.315643355702511e-05, "loss": 0.3607, "step": 30590 }, { "gate_value": 0.39675700664520264, "icl_sequence_length": 66, "num_contexts": 3, "step": 30590 }, { "grad_norm": 38.62068176269531, "learning_rate": 4.306942917567426e-05, "loss": 0.3586, "step": 30600 }, { "gate_value": 0.39687106013298035, "icl_sequence_length": 64, "num_contexts": 3, "step": 30600 }, { "grad_norm": 38.699825286865234, "learning_rate": 4.298249788036026e-05, "loss": 0.3761, "step": 30610 }, { "gate_value": 0.3969728648662567, "icl_sequence_length": 64, "num_contexts": 3, "step": 30610 }, { "grad_norm": 17.548847198486328, "learning_rate": 4.2895639730499906e-05, "loss": 0.3703, "step": 30620 }, { "gate_value": 0.3970944881439209, "icl_sequence_length": 78, "num_contexts": 3, "step": 30620 }, { "grad_norm": 17.83329200744629, "learning_rate": 4.2808854785459815e-05, "loss": 0.363, "step": 30630 }, { "gate_value": 0.3971506357192993, "icl_sequence_length": 86, "num_contexts": 3, "step": 30630 }, { "grad_norm": 18.071189880371094, "learning_rate": 4.272214310455677e-05, "loss": 0.3632, "step": 30640 }, { "gate_value": 0.39719945192337036, "icl_sequence_length": 90, "num_contexts": 3, "step": 30640 }, { "grad_norm": 34.34341812133789, "learning_rate": 4.2635504747057296e-05, "loss": 0.3485, "step": 30650 }, { "gate_value": 0.3972983956336975, "icl_sequence_length": 80, "num_contexts": 3, "step": 30650 }, { "grad_norm": 25.537384033203125, "learning_rate": 4.254893977217794e-05, "loss": 0.3661, "step": 30660 }, { "gate_value": 0.39736828207969666, "icl_sequence_length": 78, "num_contexts": 3, "step": 30660 }, { "grad_norm": 17.409698486328125, "learning_rate": 4.2462448239085044e-05, "loss": 0.3478, "step": 30670 }, { "gate_value": 0.39749452471733093, "icl_sequence_length": 84, "num_contexts": 3, "step": 30670 }, { "grad_norm": 37.236331939697266, "learning_rate": 4.237603020689477e-05, "loss": 0.362, "step": 30680 }, { "gate_value": 0.39765825867652893, "icl_sequence_length": 82, "num_contexts": 3, "step": 30680 }, { "grad_norm": 286.671875, "learning_rate": 4.228968573467306e-05, "loss": 0.356, "step": 30690 }, { "gate_value": 0.39775484800338745, "icl_sequence_length": 80, "num_contexts": 3, "step": 30690 }, { "grad_norm": 39.26069259643555, "learning_rate": 4.2203414881435436e-05, "loss": 0.3715, "step": 30700 }, { "gate_value": 0.3978050947189331, "icl_sequence_length": 94, "num_contexts": 3, "step": 30700 }, { "grad_norm": 3089.90478515625, "learning_rate": 4.211721770614734e-05, "loss": 0.3565, "step": 30710 }, { "gate_value": 0.3978253901004791, "icl_sequence_length": 88, "num_contexts": 3, "step": 30710 }, { "grad_norm": 5112.45068359375, "learning_rate": 4.203109426772363e-05, "loss": 0.3661, "step": 30720 }, { "gate_value": 0.3978503346443176, "icl_sequence_length": 80, "num_contexts": 3, "step": 30720 }, { "grad_norm": 84.64717102050781, "learning_rate": 4.19450446250289e-05, "loss": 0.3648, "step": 30730 }, { "gate_value": 0.3978942334651947, "icl_sequence_length": 66, "num_contexts": 3, "step": 30730 }, { "grad_norm": 39.20880889892578, "learning_rate": 4.1859068836877306e-05, "loss": 0.3401, "step": 30740 }, { "gate_value": 0.39794957637786865, "icl_sequence_length": 92, "num_contexts": 3, "step": 30740 }, { "grad_norm": 184.52806091308594, "learning_rate": 4.177316696203241e-05, "loss": 0.345, "step": 30750 }, { "gate_value": 0.3979901671409607, "icl_sequence_length": 86, "num_contexts": 3, "step": 30750 }, { "grad_norm": 58.26884841918945, "learning_rate": 4.168733905920739e-05, "loss": 0.3537, "step": 30760 }, { "gate_value": 0.3980367183685303, "icl_sequence_length": 82, "num_contexts": 3, "step": 30760 }, { "grad_norm": 43.49728775024414, "learning_rate": 4.160158518706479e-05, "loss": 0.35, "step": 30770 }, { "gate_value": 0.3981196880340576, "icl_sequence_length": 82, "num_contexts": 3, "step": 30770 }, { "grad_norm": 3105.703857421875, "learning_rate": 4.151590540421657e-05, "loss": 0.3687, "step": 30780 }, { "gate_value": 0.39822056889533997, "icl_sequence_length": 94, "num_contexts": 3, "step": 30780 }, { "grad_norm": 43.1259765625, "learning_rate": 4.143029976922411e-05, "loss": 0.3579, "step": 30790 }, { "gate_value": 0.39836224913597107, "icl_sequence_length": 90, "num_contexts": 3, "step": 30790 }, { "grad_norm": 136.4414520263672, "learning_rate": 4.134476834059801e-05, "loss": 0.3663, "step": 30800 }, { "gate_value": 0.39839082956314087, "icl_sequence_length": 78, "num_contexts": 3, "step": 30800 }, { "grad_norm": 31.794607162475586, "learning_rate": 4.1259311176798155e-05, "loss": 0.3516, "step": 30810 }, { "gate_value": 0.39832615852355957, "icl_sequence_length": 84, "num_contexts": 3, "step": 30810 }, { "grad_norm": 15.481108665466309, "learning_rate": 4.117392833623373e-05, "loss": 0.3661, "step": 30820 }, { "gate_value": 0.39843201637268066, "icl_sequence_length": 68, "num_contexts": 3, "step": 30820 }, { "grad_norm": 28.874069213867188, "learning_rate": 4.108861987726312e-05, "loss": 0.3641, "step": 30830 }, { "gate_value": 0.39849528670310974, "icl_sequence_length": 92, "num_contexts": 3, "step": 30830 }, { "grad_norm": 7868.61328125, "learning_rate": 4.100338585819391e-05, "loss": 0.3589, "step": 30840 }, { "gate_value": 0.3985292613506317, "icl_sequence_length": 80, "num_contexts": 3, "step": 30840 }, { "grad_norm": 92.6623306274414, "learning_rate": 4.091822633728264e-05, "loss": 0.3513, "step": 30850 }, { "gate_value": 0.39864015579223633, "icl_sequence_length": 72, "num_contexts": 3, "step": 30850 }, { "grad_norm": 68.68574523925781, "learning_rate": 4.0833141372735086e-05, "loss": 0.3591, "step": 30860 }, { "gate_value": 0.3987644910812378, "icl_sequence_length": 92, "num_contexts": 3, "step": 30860 }, { "grad_norm": 8631.802734375, "learning_rate": 4.074813102270603e-05, "loss": 0.3598, "step": 30870 }, { "gate_value": 0.3988719880580902, "icl_sequence_length": 92, "num_contexts": 3, "step": 30870 }, { "grad_norm": 22.09979248046875, "learning_rate": 4.066319534529922e-05, "loss": 0.348, "step": 30880 }, { "gate_value": 0.39896559715270996, "icl_sequence_length": 94, "num_contexts": 3, "step": 30880 }, { "grad_norm": 28.57960319519043, "learning_rate": 4.057833439856746e-05, "loss": 0.3653, "step": 30890 }, { "gate_value": 0.3991009294986725, "icl_sequence_length": 74, "num_contexts": 3, "step": 30890 }, { "grad_norm": 281.9774475097656, "learning_rate": 4.0493548240512355e-05, "loss": 0.3667, "step": 30900 }, { "gate_value": 0.39916902780532837, "icl_sequence_length": 76, "num_contexts": 3, "step": 30900 }, { "grad_norm": 19.42442512512207, "learning_rate": 4.0408836929084396e-05, "loss": 0.3551, "step": 30910 }, { "gate_value": 0.39917629957199097, "icl_sequence_length": 86, "num_contexts": 3, "step": 30910 }, { "grad_norm": 25.34986114501953, "learning_rate": 4.032420052218302e-05, "loss": 0.3536, "step": 30920 }, { "gate_value": 0.39924490451812744, "icl_sequence_length": 84, "num_contexts": 3, "step": 30920 }, { "grad_norm": 66.68396759033203, "learning_rate": 4.02396390776564e-05, "loss": 0.3603, "step": 30930 }, { "gate_value": 0.39930960536003113, "icl_sequence_length": 76, "num_contexts": 3, "step": 30930 }, { "grad_norm": 28.13355255126953, "learning_rate": 4.015515265330155e-05, "loss": 0.3505, "step": 30940 }, { "gate_value": 0.39939790964126587, "icl_sequence_length": 78, "num_contexts": 3, "step": 30940 }, { "grad_norm": 36.40007019042969, "learning_rate": 4.0070741306864026e-05, "loss": 0.3728, "step": 30950 }, { "gate_value": 0.399484783411026, "icl_sequence_length": 82, "num_contexts": 3, "step": 30950 }, { "grad_norm": 15.841800689697266, "learning_rate": 3.998640509603824e-05, "loss": 0.3561, "step": 30960 }, { "gate_value": 0.39953693747520447, "icl_sequence_length": 90, "num_contexts": 3, "step": 30960 }, { "grad_norm": 78.29736328125, "learning_rate": 3.9902144078467234e-05, "loss": 0.3626, "step": 30970 }, { "gate_value": 0.39955389499664307, "icl_sequence_length": 84, "num_contexts": 3, "step": 30970 }, { "grad_norm": 44.35389709472656, "learning_rate": 3.9817958311742564e-05, "loss": 0.3639, "step": 30980 }, { "gate_value": 0.39958593249320984, "icl_sequence_length": 82, "num_contexts": 3, "step": 30980 }, { "grad_norm": 54.38600158691406, "learning_rate": 3.973384785340449e-05, "loss": 0.3697, "step": 30990 }, { "gate_value": 0.3996659517288208, "icl_sequence_length": 86, "num_contexts": 3, "step": 30990 }, { "grad_norm": 23.38623046875, "learning_rate": 3.964981276094165e-05, "loss": 0.3511, "step": 31000 }, { "gate_value": 0.3997676968574524, "icl_sequence_length": 84, "num_contexts": 3, "step": 31000 }, { "grad_norm": 22.269174575805664, "learning_rate": 3.956585309179121e-05, "loss": 0.3786, "step": 31010 }, { "gate_value": 0.3999699652194977, "icl_sequence_length": 74, "num_contexts": 3, "step": 31010 }, { "grad_norm": 22.649744033813477, "learning_rate": 3.9481968903338864e-05, "loss": 0.3428, "step": 31020 }, { "gate_value": 0.40010973811149597, "icl_sequence_length": 80, "num_contexts": 3, "step": 31020 }, { "grad_norm": 210.9306640625, "learning_rate": 3.9398160252918626e-05, "loss": 0.3639, "step": 31030 }, { "gate_value": 0.40025028586387634, "icl_sequence_length": 70, "num_contexts": 3, "step": 31030 }, { "grad_norm": 59.51335525512695, "learning_rate": 3.9314427197812996e-05, "loss": 0.3542, "step": 31040 }, { "gate_value": 0.4003356099128723, "icl_sequence_length": 92, "num_contexts": 3, "step": 31040 }, { "grad_norm": 1145.820068359375, "learning_rate": 3.923076979525263e-05, "loss": 0.372, "step": 31050 }, { "gate_value": 0.400397390127182, "icl_sequence_length": 82, "num_contexts": 3, "step": 31050 }, { "grad_norm": 227.70965576171875, "learning_rate": 3.914718810241662e-05, "loss": 0.3483, "step": 31060 }, { "gate_value": 0.4004197418689728, "icl_sequence_length": 82, "num_contexts": 3, "step": 31060 }, { "grad_norm": 246.35362243652344, "learning_rate": 3.906368217643227e-05, "loss": 0.3558, "step": 31070 }, { "gate_value": 0.4004781246185303, "icl_sequence_length": 86, "num_contexts": 3, "step": 31070 }, { "grad_norm": 75.53336334228516, "learning_rate": 3.898025207437511e-05, "loss": 0.3559, "step": 31080 }, { "gate_value": 0.4005269706249237, "icl_sequence_length": 88, "num_contexts": 3, "step": 31080 }, { "grad_norm": 45.185482025146484, "learning_rate": 3.8896897853268765e-05, "loss": 0.3696, "step": 31090 }, { "gate_value": 0.4005460739135742, "icl_sequence_length": 72, "num_contexts": 3, "step": 31090 }, { "grad_norm": 61.928192138671875, "learning_rate": 3.881361957008516e-05, "loss": 0.3681, "step": 31100 }, { "gate_value": 0.40058523416519165, "icl_sequence_length": 86, "num_contexts": 3, "step": 31100 }, { "grad_norm": 1029.142333984375, "learning_rate": 3.873041728174409e-05, "loss": 0.3501, "step": 31110 }, { "gate_value": 0.40063005685806274, "icl_sequence_length": 68, "num_contexts": 3, "step": 31110 }, { "grad_norm": 40.88732147216797, "learning_rate": 3.864729104511361e-05, "loss": 0.3459, "step": 31120 }, { "gate_value": 0.4006367623806, "icl_sequence_length": 84, "num_contexts": 3, "step": 31120 }, { "grad_norm": 395.06768798828125, "learning_rate": 3.8564240917009695e-05, "loss": 0.3655, "step": 31130 }, { "gate_value": 0.4006264805793762, "icl_sequence_length": 82, "num_contexts": 3, "step": 31130 }, { "grad_norm": 76.37919616699219, "learning_rate": 3.848126695419639e-05, "loss": 0.3652, "step": 31140 }, { "gate_value": 0.40067169070243835, "icl_sequence_length": 62, "num_contexts": 3, "step": 31140 }, { "grad_norm": 153.97393798828125, "learning_rate": 3.839836921338551e-05, "loss": 0.3446, "step": 31150 }, { "gate_value": 0.4007301330566406, "icl_sequence_length": 74, "num_contexts": 3, "step": 31150 }, { "grad_norm": 96.89092254638672, "learning_rate": 3.831554775123694e-05, "loss": 0.3672, "step": 31160 }, { "gate_value": 0.4007788598537445, "icl_sequence_length": 82, "num_contexts": 3, "step": 31160 }, { "grad_norm": 60.73162078857422, "learning_rate": 3.823280262435837e-05, "loss": 0.36, "step": 31170 }, { "gate_value": 0.4008052945137024, "icl_sequence_length": 78, "num_contexts": 3, "step": 31170 }, { "grad_norm": 160.602294921875, "learning_rate": 3.8150133889305336e-05, "loss": 0.3401, "step": 31180 }, { "gate_value": 0.4008178412914276, "icl_sequence_length": 86, "num_contexts": 3, "step": 31180 }, { "grad_norm": 91.08993530273438, "learning_rate": 3.806754160258106e-05, "loss": 0.3578, "step": 31190 }, { "gate_value": 0.40085065364837646, "icl_sequence_length": 70, "num_contexts": 3, "step": 31190 }, { "grad_norm": 229.18531799316406, "learning_rate": 3.798502582063669e-05, "loss": 0.3692, "step": 31200 }, { "gate_value": 0.4008884131908417, "icl_sequence_length": 70, "num_contexts": 3, "step": 31200 }, { "grad_norm": 37.98585510253906, "learning_rate": 3.7902586599870895e-05, "loss": 0.3507, "step": 31210 }, { "gate_value": 0.4009264409542084, "icl_sequence_length": 82, "num_contexts": 3, "step": 31210 }, { "grad_norm": 19.111576080322266, "learning_rate": 3.782022399663014e-05, "loss": 0.3529, "step": 31220 }, { "gate_value": 0.40098142623901367, "icl_sequence_length": 84, "num_contexts": 3, "step": 31220 }, { "grad_norm": 1276.112548828125, "learning_rate": 3.773793806720848e-05, "loss": 0.3756, "step": 31230 }, { "gate_value": 0.40107500553131104, "icl_sequence_length": 70, "num_contexts": 3, "step": 31230 }, { "grad_norm": 62.54690933227539, "learning_rate": 3.765572886784764e-05, "loss": 0.3571, "step": 31240 }, { "gate_value": 0.40113914012908936, "icl_sequence_length": 94, "num_contexts": 3, "step": 31240 }, { "grad_norm": 255.1707000732422, "learning_rate": 3.7573596454736724e-05, "loss": 0.3786, "step": 31250 }, { "gate_value": 0.4011474549770355, "icl_sequence_length": 94, "num_contexts": 3, "step": 31250 }, { "grad_norm": 80.06964874267578, "learning_rate": 3.7491540884012516e-05, "loss": 0.3608, "step": 31260 }, { "gate_value": 0.40119272470474243, "icl_sequence_length": 80, "num_contexts": 3, "step": 31260 }, { "grad_norm": 60.66999435424805, "learning_rate": 3.7409562211759265e-05, "loss": 0.3532, "step": 31270 }, { "gate_value": 0.40127280354499817, "icl_sequence_length": 82, "num_contexts": 3, "step": 31270 }, { "grad_norm": 487.518310546875, "learning_rate": 3.732766049400853e-05, "loss": 0.3606, "step": 31280 }, { "gate_value": 0.4013500511646271, "icl_sequence_length": 92, "num_contexts": 3, "step": 31280 }, { "grad_norm": 9397.779296875, "learning_rate": 3.7245835786739425e-05, "loss": 0.3582, "step": 31290 }, { "gate_value": 0.4013952314853668, "icl_sequence_length": 68, "num_contexts": 3, "step": 31290 }, { "grad_norm": 50.94612121582031, "learning_rate": 3.716408814587837e-05, "loss": 0.3622, "step": 31300 }, { "gate_value": 0.4014360010623932, "icl_sequence_length": 86, "num_contexts": 3, "step": 31300 }, { "grad_norm": 87.3506851196289, "learning_rate": 3.7082417627299064e-05, "loss": 0.3476, "step": 31310 }, { "gate_value": 0.40153583884239197, "icl_sequence_length": 78, "num_contexts": 3, "step": 31310 }, { "grad_norm": 659.7249145507812, "learning_rate": 3.7000824286822566e-05, "loss": 0.3805, "step": 31320 }, { "gate_value": 0.4015822410583496, "icl_sequence_length": 76, "num_contexts": 3, "step": 31320 }, { "grad_norm": 25.28474235534668, "learning_rate": 3.6919308180217135e-05, "loss": 0.3714, "step": 31330 }, { "gate_value": 0.4015807509422302, "icl_sequence_length": 78, "num_contexts": 3, "step": 31330 }, { "grad_norm": 86.76029968261719, "learning_rate": 3.683786936319833e-05, "loss": 0.3685, "step": 31340 }, { "gate_value": 0.40166762471199036, "icl_sequence_length": 92, "num_contexts": 3, "step": 31340 }, { "grad_norm": 92.85562133789062, "learning_rate": 3.6756507891428714e-05, "loss": 0.3576, "step": 31350 }, { "gate_value": 0.40173211693763733, "icl_sequence_length": 72, "num_contexts": 3, "step": 31350 }, { "grad_norm": 158.87136840820312, "learning_rate": 3.6675223820518174e-05, "loss": 0.3606, "step": 31360 }, { "gate_value": 0.4017789661884308, "icl_sequence_length": 88, "num_contexts": 3, "step": 31360 }, { "grad_norm": 39.90798568725586, "learning_rate": 3.6594017206023514e-05, "loss": 0.3681, "step": 31370 }, { "gate_value": 0.4018018841743469, "icl_sequence_length": 84, "num_contexts": 3, "step": 31370 }, { "grad_norm": 33.672882080078125, "learning_rate": 3.651288810344875e-05, "loss": 0.3712, "step": 31380 }, { "gate_value": 0.40186256170272827, "icl_sequence_length": 80, "num_contexts": 3, "step": 31380 }, { "grad_norm": 40.18476104736328, "learning_rate": 3.643183656824485e-05, "loss": 0.3515, "step": 31390 }, { "gate_value": 0.40187448263168335, "icl_sequence_length": 86, "num_contexts": 3, "step": 31390 }, { "grad_norm": 150.2606658935547, "learning_rate": 3.635086265580979e-05, "loss": 0.3495, "step": 31400 }, { "gate_value": 0.401906818151474, "icl_sequence_length": 80, "num_contexts": 3, "step": 31400 }, { "grad_norm": 40.321495056152344, "learning_rate": 3.626996642148844e-05, "loss": 0.3561, "step": 31410 }, { "gate_value": 0.40194520354270935, "icl_sequence_length": 90, "num_contexts": 3, "step": 31410 }, { "grad_norm": 51.097068786621094, "learning_rate": 3.618914792057262e-05, "loss": 0.3419, "step": 31420 }, { "gate_value": 0.4019700586795807, "icl_sequence_length": 70, "num_contexts": 3, "step": 31420 }, { "grad_norm": 101.07646942138672, "learning_rate": 3.6108407208301035e-05, "loss": 0.3423, "step": 31430 }, { "gate_value": 0.4019973874092102, "icl_sequence_length": 68, "num_contexts": 3, "step": 31430 }, { "grad_norm": 1876.52587890625, "learning_rate": 3.602774433985922e-05, "loss": 0.3498, "step": 31440 }, { "gate_value": 0.4020547568798065, "icl_sequence_length": 84, "num_contexts": 3, "step": 31440 }, { "grad_norm": 708.6364135742188, "learning_rate": 3.594715937037942e-05, "loss": 0.3698, "step": 31450 }, { "gate_value": 0.4021237790584564, "icl_sequence_length": 76, "num_contexts": 3, "step": 31450 }, { "grad_norm": 1956.0980224609375, "learning_rate": 3.586665235494077e-05, "loss": 0.3627, "step": 31460 }, { "gate_value": 0.4021860957145691, "icl_sequence_length": 68, "num_contexts": 3, "step": 31460 }, { "grad_norm": 31.426523208618164, "learning_rate": 3.578622334856898e-05, "loss": 0.3758, "step": 31470 }, { "gate_value": 0.4022514224052429, "icl_sequence_length": 86, "num_contexts": 3, "step": 31470 }, { "grad_norm": 1016.490234375, "learning_rate": 3.570587240623658e-05, "loss": 0.358, "step": 31480 }, { "gate_value": 0.4023077189922333, "icl_sequence_length": 86, "num_contexts": 3, "step": 31480 }, { "grad_norm": 851.119873046875, "learning_rate": 3.5625599582862647e-05, "loss": 0.3458, "step": 31490 }, { "gate_value": 0.4023520350456238, "icl_sequence_length": 90, "num_contexts": 3, "step": 31490 }, { "grad_norm": 2855.093505859375, "learning_rate": 3.554540493331294e-05, "loss": 0.3623, "step": 31500 }, { "gate_value": 0.4023928940296173, "icl_sequence_length": 76, "num_contexts": 3, "step": 31500 }, { "grad_norm": 104.28507995605469, "learning_rate": 3.5465288512399694e-05, "loss": 0.3712, "step": 31510 }, { "gate_value": 0.40249326825141907, "icl_sequence_length": 64, "num_contexts": 3, "step": 31510 }, { "grad_norm": 888.3125, "learning_rate": 3.538525037488176e-05, "loss": 0.3659, "step": 31520 }, { "gate_value": 0.40257859230041504, "icl_sequence_length": 94, "num_contexts": 3, "step": 31520 }, { "grad_norm": 58.13582229614258, "learning_rate": 3.530529057546443e-05, "loss": 0.3424, "step": 31530 }, { "gate_value": 0.4026349186897278, "icl_sequence_length": 90, "num_contexts": 3, "step": 31530 }, { "grad_norm": 207.80984497070312, "learning_rate": 3.5225409168799526e-05, "loss": 0.3437, "step": 31540 }, { "gate_value": 0.40274718403816223, "icl_sequence_length": 66, "num_contexts": 3, "step": 31540 }, { "grad_norm": 82.31903076171875, "learning_rate": 3.51456062094852e-05, "loss": 0.3565, "step": 31550 }, { "gate_value": 0.40279620885849, "icl_sequence_length": 74, "num_contexts": 3, "step": 31550 }, { "grad_norm": 777.2052001953125, "learning_rate": 3.506588175206598e-05, "loss": 0.3518, "step": 31560 }, { "gate_value": 0.4028371274471283, "icl_sequence_length": 82, "num_contexts": 3, "step": 31560 }, { "grad_norm": 27.5644588470459, "learning_rate": 3.49862358510328e-05, "loss": 0.3613, "step": 31570 }, { "gate_value": 0.40281838178634644, "icl_sequence_length": 76, "num_contexts": 3, "step": 31570 }, { "grad_norm": 896.0078125, "learning_rate": 3.490666856082291e-05, "loss": 0.3719, "step": 31580 }, { "gate_value": 0.402811735868454, "icl_sequence_length": 74, "num_contexts": 3, "step": 31580 }, { "grad_norm": 36.032833099365234, "learning_rate": 3.48271799358198e-05, "loss": 0.3562, "step": 31590 }, { "gate_value": 0.4028680622577667, "icl_sequence_length": 76, "num_contexts": 3, "step": 31590 }, { "grad_norm": 383.2489318847656, "learning_rate": 3.474777003035323e-05, "loss": 0.3776, "step": 31600 }, { "gate_value": 0.4029342532157898, "icl_sequence_length": 74, "num_contexts": 3, "step": 31600 }, { "grad_norm": 76.16732788085938, "learning_rate": 3.466843889869903e-05, "loss": 0.3588, "step": 31610 }, { "gate_value": 0.40299057960510254, "icl_sequence_length": 74, "num_contexts": 3, "step": 31610 }, { "grad_norm": 37.22277069091797, "learning_rate": 3.458918659507935e-05, "loss": 0.3557, "step": 31620 }, { "gate_value": 0.4030371308326721, "icl_sequence_length": 76, "num_contexts": 3, "step": 31620 }, { "grad_norm": 50.39080047607422, "learning_rate": 3.4510013173662356e-05, "loss": 0.3446, "step": 31630 }, { "gate_value": 0.4031177759170532, "icl_sequence_length": 82, "num_contexts": 3, "step": 31630 }, { "grad_norm": 24.230958938598633, "learning_rate": 3.443091868856239e-05, "loss": 0.3653, "step": 31640 }, { "gate_value": 0.40322422981262207, "icl_sequence_length": 84, "num_contexts": 3, "step": 31640 }, { "grad_norm": 81.67859649658203, "learning_rate": 3.435190319383977e-05, "loss": 0.3573, "step": 31650 }, { "gate_value": 0.4033409655094147, "icl_sequence_length": 82, "num_contexts": 3, "step": 31650 }, { "grad_norm": 49.784027099609375, "learning_rate": 3.427296674350077e-05, "loss": 0.3486, "step": 31660 }, { "gate_value": 0.40337809920310974, "icl_sequence_length": 82, "num_contexts": 3, "step": 31660 }, { "grad_norm": 1278.3245849609375, "learning_rate": 3.419410939149775e-05, "loss": 0.3603, "step": 31670 }, { "gate_value": 0.4034092426300049, "icl_sequence_length": 74, "num_contexts": 3, "step": 31670 }, { "grad_norm": 78.81749725341797, "learning_rate": 3.411533119172898e-05, "loss": 0.3485, "step": 31680 }, { "gate_value": 0.4034452736377716, "icl_sequence_length": 66, "num_contexts": 3, "step": 31680 }, { "grad_norm": 101.64257049560547, "learning_rate": 3.403663219803862e-05, "loss": 0.348, "step": 31690 }, { "gate_value": 0.4035550653934479, "icl_sequence_length": 76, "num_contexts": 3, "step": 31690 }, { "grad_norm": 38.30369567871094, "learning_rate": 3.3958012464216714e-05, "loss": 0.368, "step": 31700 }, { "gate_value": 0.4036957025527954, "icl_sequence_length": 94, "num_contexts": 3, "step": 31700 }, { "grad_norm": 73.04335021972656, "learning_rate": 3.387947204399905e-05, "loss": 0.3643, "step": 31710 }, { "gate_value": 0.4038078486919403, "icl_sequence_length": 74, "num_contexts": 3, "step": 31710 }, { "grad_norm": 48.24745559692383, "learning_rate": 3.3801010991067286e-05, "loss": 0.3589, "step": 31720 }, { "gate_value": 0.40385109186172485, "icl_sequence_length": 86, "num_contexts": 3, "step": 31720 }, { "grad_norm": 18.187374114990234, "learning_rate": 3.372262935904882e-05, "loss": 0.3561, "step": 31730 }, { "gate_value": 0.40391314029693604, "icl_sequence_length": 76, "num_contexts": 3, "step": 31730 }, { "grad_norm": 14.42495346069336, "learning_rate": 3.3644327201516795e-05, "loss": 0.3498, "step": 31740 }, { "gate_value": 0.4040006101131439, "icl_sequence_length": 64, "num_contexts": 3, "step": 31740 }, { "grad_norm": 117.840087890625, "learning_rate": 3.356610457198997e-05, "loss": 0.3496, "step": 31750 }, { "gate_value": 0.4040774703025818, "icl_sequence_length": 86, "num_contexts": 3, "step": 31750 }, { "grad_norm": 32.17731475830078, "learning_rate": 3.348796152393271e-05, "loss": 0.3396, "step": 31760 }, { "gate_value": 0.4041006863117218, "icl_sequence_length": 76, "num_contexts": 3, "step": 31760 }, { "grad_norm": 99.77642059326172, "learning_rate": 3.340989811075512e-05, "loss": 0.3726, "step": 31770 }, { "gate_value": 0.40423011779785156, "icl_sequence_length": 62, "num_contexts": 3, "step": 31770 }, { "grad_norm": 34.55682373046875, "learning_rate": 3.333191438581278e-05, "loss": 0.3616, "step": 31780 }, { "gate_value": 0.40432289242744446, "icl_sequence_length": 70, "num_contexts": 3, "step": 31780 }, { "grad_norm": 8175.66162109375, "learning_rate": 3.3254010402406845e-05, "loss": 0.3582, "step": 31790 }, { "gate_value": 0.40442436933517456, "icl_sequence_length": 70, "num_contexts": 3, "step": 31790 }, { "grad_norm": 152.19418334960938, "learning_rate": 3.317618621378399e-05, "loss": 0.3625, "step": 31800 }, { "gate_value": 0.4044768214225769, "icl_sequence_length": 76, "num_contexts": 3, "step": 31800 }, { "grad_norm": 47.31957244873047, "learning_rate": 3.309844187313625e-05, "loss": 0.3767, "step": 31810 }, { "gate_value": 0.40453842282295227, "icl_sequence_length": 88, "num_contexts": 3, "step": 31810 }, { "grad_norm": 579.4370727539062, "learning_rate": 3.302077743360115e-05, "loss": 0.3608, "step": 31820 }, { "gate_value": 0.40464499592781067, "icl_sequence_length": 68, "num_contexts": 3, "step": 31820 }, { "grad_norm": 202.05191040039062, "learning_rate": 3.294319294826169e-05, "loss": 0.355, "step": 31830 }, { "gate_value": 0.40469077229499817, "icl_sequence_length": 68, "num_contexts": 3, "step": 31830 }, { "grad_norm": 48.13961410522461, "learning_rate": 3.286568847014602e-05, "loss": 0.3464, "step": 31840 }, { "gate_value": 0.4047562777996063, "icl_sequence_length": 68, "num_contexts": 3, "step": 31840 }, { "grad_norm": 85.94761657714844, "learning_rate": 3.27882640522278e-05, "loss": 0.3631, "step": 31850 }, { "gate_value": 0.4047923982143402, "icl_sequence_length": 90, "num_contexts": 3, "step": 31850 }, { "grad_norm": 59.6070556640625, "learning_rate": 3.271091974742583e-05, "loss": 0.3695, "step": 31860 }, { "gate_value": 0.40476715564727783, "icl_sequence_length": 94, "num_contexts": 3, "step": 31860 }, { "grad_norm": 33.71444320678711, "learning_rate": 3.263365560860424e-05, "loss": 0.358, "step": 31870 }, { "gate_value": 0.4048195779323578, "icl_sequence_length": 80, "num_contexts": 3, "step": 31870 }, { "grad_norm": 1352.75244140625, "learning_rate": 3.255647168857235e-05, "loss": 0.3632, "step": 31880 }, { "gate_value": 0.4048292934894562, "icl_sequence_length": 88, "num_contexts": 3, "step": 31880 }, { "grad_norm": 25.616077423095703, "learning_rate": 3.247936804008462e-05, "loss": 0.3667, "step": 31890 }, { "gate_value": 0.40487462282180786, "icl_sequence_length": 78, "num_contexts": 3, "step": 31890 }, { "grad_norm": 12.800342559814453, "learning_rate": 3.240234471584073e-05, "loss": 0.3556, "step": 31900 }, { "gate_value": 0.4049961566925049, "icl_sequence_length": 76, "num_contexts": 3, "step": 31900 }, { "grad_norm": 334.0349426269531, "learning_rate": 3.2325401768485315e-05, "loss": 0.3754, "step": 31910 }, { "gate_value": 0.4050622880458832, "icl_sequence_length": 70, "num_contexts": 3, "step": 31910 }, { "grad_norm": 39.117591857910156, "learning_rate": 3.224853925060821e-05, "loss": 0.3649, "step": 31920 }, { "gate_value": 0.40509703755378723, "icl_sequence_length": 78, "num_contexts": 3, "step": 31920 }, { "grad_norm": 101.0981674194336, "learning_rate": 3.217175721474416e-05, "loss": 0.3542, "step": 31930 }, { "gate_value": 0.40514880418777466, "icl_sequence_length": 64, "num_contexts": 3, "step": 31930 }, { "grad_norm": 21.74467658996582, "learning_rate": 3.2095055713373026e-05, "loss": 0.3583, "step": 31940 }, { "gate_value": 0.4052824079990387, "icl_sequence_length": 72, "num_contexts": 3, "step": 31940 }, { "grad_norm": 37.686012268066406, "learning_rate": 3.201843479891954e-05, "loss": 0.3629, "step": 31950 }, { "gate_value": 0.40541791915893555, "icl_sequence_length": 78, "num_contexts": 3, "step": 31950 }, { "grad_norm": 37.003089904785156, "learning_rate": 3.194189452375335e-05, "loss": 0.3502, "step": 31960 }, { "gate_value": 0.4054928123950958, "icl_sequence_length": 68, "num_contexts": 3, "step": 31960 }, { "grad_norm": 21.644052505493164, "learning_rate": 3.1865434940189015e-05, "loss": 0.3574, "step": 31970 }, { "gate_value": 0.40563321113586426, "icl_sequence_length": 84, "num_contexts": 3, "step": 31970 }, { "grad_norm": 10.192123413085938, "learning_rate": 3.1789056100485975e-05, "loss": 0.3514, "step": 31980 }, { "gate_value": 0.40571680665016174, "icl_sequence_length": 80, "num_contexts": 3, "step": 31980 }, { "grad_norm": 44.86257553100586, "learning_rate": 3.1712758056848424e-05, "loss": 0.3505, "step": 31990 }, { "gate_value": 0.4058789610862732, "icl_sequence_length": 70, "num_contexts": 3, "step": 31990 }, { "grad_norm": 363.2022399902344, "learning_rate": 3.1636540861425396e-05, "loss": 0.3619, "step": 32000 }, { "gate_value": 0.40589526295661926, "icl_sequence_length": 86, "num_contexts": 3, "step": 32000 }, { "grad_norm": 121.92304229736328, "learning_rate": 3.156040456631059e-05, "loss": 0.3694, "step": 32010 }, { "gate_value": 0.40586337447166443, "icl_sequence_length": 88, "num_contexts": 3, "step": 32010 }, { "grad_norm": 39.731449127197266, "learning_rate": 3.148434922354239e-05, "loss": 0.369, "step": 32020 }, { "gate_value": 0.40581846237182617, "icl_sequence_length": 82, "num_contexts": 3, "step": 32020 }, { "grad_norm": 23.1926212310791, "learning_rate": 3.1408374885103966e-05, "loss": 0.3698, "step": 32030 }, { "gate_value": 0.40583550930023193, "icl_sequence_length": 84, "num_contexts": 3, "step": 32030 }, { "grad_norm": 30.028512954711914, "learning_rate": 3.1332481602923066e-05, "loss": 0.3523, "step": 32040 }, { "gate_value": 0.4058579206466675, "icl_sequence_length": 78, "num_contexts": 3, "step": 32040 }, { "grad_norm": 61.59327697753906, "learning_rate": 3.125666942887206e-05, "loss": 0.3493, "step": 32050 }, { "gate_value": 0.40589094161987305, "icl_sequence_length": 88, "num_contexts": 3, "step": 32050 }, { "grad_norm": 521.9461059570312, "learning_rate": 3.118093841476777e-05, "loss": 0.3646, "step": 32060 }, { "gate_value": 0.4059288203716278, "icl_sequence_length": 72, "num_contexts": 3, "step": 32060 }, { "grad_norm": 1710.82080078125, "learning_rate": 3.110528861237169e-05, "loss": 0.3636, "step": 32070 }, { "gate_value": 0.40597227215766907, "icl_sequence_length": 68, "num_contexts": 3, "step": 32070 }, { "grad_norm": 39.675498962402344, "learning_rate": 3.102972007338972e-05, "loss": 0.3537, "step": 32080 }, { "gate_value": 0.4060346782207489, "icl_sequence_length": 72, "num_contexts": 3, "step": 32080 }, { "grad_norm": 610.4242553710938, "learning_rate": 3.095423284947225e-05, "loss": 0.3566, "step": 32090 }, { "gate_value": 0.4061087667942047, "icl_sequence_length": 66, "num_contexts": 3, "step": 32090 }, { "grad_norm": 33.1412467956543, "learning_rate": 3.0878826992214155e-05, "loss": 0.3763, "step": 32100 }, { "gate_value": 0.40613362193107605, "icl_sequence_length": 60, "num_contexts": 3, "step": 32100 }, { "grad_norm": 44.12626266479492, "learning_rate": 3.0803502553154544e-05, "loss": 0.3531, "step": 32110 }, { "gate_value": 0.40618664026260376, "icl_sequence_length": 54, "num_contexts": 3, "step": 32110 }, { "grad_norm": 65.63241577148438, "learning_rate": 3.0728259583776953e-05, "loss": 0.3517, "step": 32120 }, { "gate_value": 0.4062596261501312, "icl_sequence_length": 66, "num_contexts": 3, "step": 32120 }, { "grad_norm": 82.6564712524414, "learning_rate": 3.0653098135509274e-05, "loss": 0.3454, "step": 32130 }, { "gate_value": 0.40627121925354004, "icl_sequence_length": 70, "num_contexts": 3, "step": 32130 }, { "grad_norm": 18.433292388916016, "learning_rate": 3.0578018259723646e-05, "loss": 0.3601, "step": 32140 }, { "gate_value": 0.4063282907009125, "icl_sequence_length": 84, "num_contexts": 3, "step": 32140 }, { "grad_norm": 43.319366455078125, "learning_rate": 3.0503020007736488e-05, "loss": 0.3772, "step": 32150 }, { "gate_value": 0.4063490629196167, "icl_sequence_length": 82, "num_contexts": 3, "step": 32150 }, { "grad_norm": 56.99197769165039, "learning_rate": 3.0428103430808332e-05, "loss": 0.3804, "step": 32160 }, { "gate_value": 0.4063299298286438, "icl_sequence_length": 88, "num_contexts": 3, "step": 32160 }, { "grad_norm": 799.6575317382812, "learning_rate": 3.035326858014398e-05, "loss": 0.3503, "step": 32170 }, { "gate_value": 0.4063448905944824, "icl_sequence_length": 74, "num_contexts": 3, "step": 32170 }, { "grad_norm": 25.74651527404785, "learning_rate": 3.0278515506892355e-05, "loss": 0.379, "step": 32180 }, { "gate_value": 0.40639516711235046, "icl_sequence_length": 82, "num_contexts": 3, "step": 32180 }, { "grad_norm": 345.8633117675781, "learning_rate": 3.0203844262146483e-05, "loss": 0.3706, "step": 32190 }, { "gate_value": 0.40643778443336487, "icl_sequence_length": 72, "num_contexts": 3, "step": 32190 }, { "grad_norm": 4103.92041015625, "learning_rate": 3.0129254896943473e-05, "loss": 0.3518, "step": 32200 }, { "gate_value": 0.40648239850997925, "icl_sequence_length": 76, "num_contexts": 3, "step": 32200 }, { "grad_norm": 26.54749870300293, "learning_rate": 3.0054747462264444e-05, "loss": 0.3694, "step": 32210 }, { "gate_value": 0.4065358638763428, "icl_sequence_length": 74, "num_contexts": 3, "step": 32210 }, { "grad_norm": 25.705764770507812, "learning_rate": 2.9980322009034445e-05, "loss": 0.3517, "step": 32220 }, { "gate_value": 0.4066010117530823, "icl_sequence_length": 86, "num_contexts": 3, "step": 32220 }, { "grad_norm": 50.34804916381836, "learning_rate": 2.9905978588122654e-05, "loss": 0.3792, "step": 32230 }, { "gate_value": 0.4066932499408722, "icl_sequence_length": 76, "num_contexts": 3, "step": 32230 }, { "grad_norm": 100.87702178955078, "learning_rate": 2.983171725034207e-05, "loss": 0.3719, "step": 32240 }, { "gate_value": 0.4067370593547821, "icl_sequence_length": 82, "num_contexts": 3, "step": 32240 }, { "grad_norm": 24.019262313842773, "learning_rate": 2.9757538046449676e-05, "loss": 0.3638, "step": 32250 }, { "gate_value": 0.4067458212375641, "icl_sequence_length": 72, "num_contexts": 3, "step": 32250 }, { "grad_norm": 45.86870193481445, "learning_rate": 2.9683441027146166e-05, "loss": 0.3606, "step": 32260 }, { "gate_value": 0.4067707657814026, "icl_sequence_length": 62, "num_contexts": 3, "step": 32260 }, { "grad_norm": 229.07736206054688, "learning_rate": 2.9609426243076178e-05, "loss": 0.36, "step": 32270 }, { "gate_value": 0.40676602721214294, "icl_sequence_length": 96, "num_contexts": 3, "step": 32270 }, { "grad_norm": 37.639366149902344, "learning_rate": 2.9535493744828166e-05, "loss": 0.3589, "step": 32280 }, { "gate_value": 0.40678077936172485, "icl_sequence_length": 82, "num_contexts": 3, "step": 32280 }, { "grad_norm": 26.005895614624023, "learning_rate": 2.9461643582934285e-05, "loss": 0.3499, "step": 32290 }, { "gate_value": 0.4068147838115692, "icl_sequence_length": 68, "num_contexts": 3, "step": 32290 }, { "grad_norm": 20.593965530395508, "learning_rate": 2.938787580787038e-05, "loss": 0.3473, "step": 32300 }, { "gate_value": 0.4068923890590668, "icl_sequence_length": 86, "num_contexts": 3, "step": 32300 }, { "grad_norm": 69.46590423583984, "learning_rate": 2.9314190470056086e-05, "loss": 0.3698, "step": 32310 }, { "gate_value": 0.40692099928855896, "icl_sequence_length": 92, "num_contexts": 3, "step": 32310 }, { "grad_norm": 40623.91796875, "learning_rate": 2.9240587619854584e-05, "loss": 0.3534, "step": 32320 }, { "gate_value": 0.4068794250488281, "icl_sequence_length": 80, "num_contexts": 3, "step": 32320 }, { "grad_norm": 29.661853790283203, "learning_rate": 2.9167067307572727e-05, "loss": 0.3691, "step": 32330 }, { "gate_value": 0.4069285988807678, "icl_sequence_length": 76, "num_contexts": 3, "step": 32330 }, { "grad_norm": 37.21219253540039, "learning_rate": 2.909362958346099e-05, "loss": 0.3653, "step": 32340 }, { "gate_value": 0.4070126414299011, "icl_sequence_length": 78, "num_contexts": 3, "step": 32340 }, { "grad_norm": 60.33441162109375, "learning_rate": 2.902027449771339e-05, "loss": 0.3451, "step": 32350 }, { "gate_value": 0.4070773422718048, "icl_sequence_length": 72, "num_contexts": 3, "step": 32350 }, { "grad_norm": 1850.6201171875, "learning_rate": 2.894700210046737e-05, "loss": 0.3646, "step": 32360 }, { "gate_value": 0.407095342874527, "icl_sequence_length": 78, "num_contexts": 3, "step": 32360 }, { "grad_norm": 19.2963809967041, "learning_rate": 2.887381244180395e-05, "loss": 0.3791, "step": 32370 }, { "gate_value": 0.407134085893631, "icl_sequence_length": 90, "num_contexts": 3, "step": 32370 }, { "grad_norm": 59.54200744628906, "learning_rate": 2.880070557174757e-05, "loss": 0.3658, "step": 32380 }, { "gate_value": 0.40719518065452576, "icl_sequence_length": 68, "num_contexts": 3, "step": 32380 }, { "grad_norm": 17.734453201293945, "learning_rate": 2.8727681540266136e-05, "loss": 0.3709, "step": 32390 }, { "gate_value": 0.4072628319263458, "icl_sequence_length": 80, "num_contexts": 3, "step": 32390 }, { "grad_norm": 29.109493255615234, "learning_rate": 2.8654740397270793e-05, "loss": 0.3582, "step": 32400 }, { "gate_value": 0.407309353351593, "icl_sequence_length": 78, "num_contexts": 3, "step": 32400 }, { "grad_norm": 6415.794921875, "learning_rate": 2.8581882192616213e-05, "loss": 0.3657, "step": 32410 }, { "gate_value": 0.40732789039611816, "icl_sequence_length": 66, "num_contexts": 3, "step": 32410 }, { "grad_norm": 81.83065032958984, "learning_rate": 2.850910697610021e-05, "loss": 0.3758, "step": 32420 }, { "gate_value": 0.40734460949897766, "icl_sequence_length": 92, "num_contexts": 3, "step": 32420 }, { "grad_norm": 44.35910415649414, "learning_rate": 2.8436414797463996e-05, "loss": 0.3573, "step": 32430 }, { "gate_value": 0.4073881506919861, "icl_sequence_length": 72, "num_contexts": 3, "step": 32430 }, { "grad_norm": 134.13555908203125, "learning_rate": 2.8363805706391995e-05, "loss": 0.3541, "step": 32440 }, { "gate_value": 0.4074646234512329, "icl_sequence_length": 96, "num_contexts": 3, "step": 32440 }, { "grad_norm": 33.06184005737305, "learning_rate": 2.8291279752511874e-05, "loss": 0.3617, "step": 32450 }, { "gate_value": 0.4075838327407837, "icl_sequence_length": 60, "num_contexts": 3, "step": 32450 }, { "grad_norm": 56.15337371826172, "learning_rate": 2.821883698539435e-05, "loss": 0.3473, "step": 32460 }, { "gate_value": 0.4077083170413971, "icl_sequence_length": 80, "num_contexts": 3, "step": 32460 }, { "grad_norm": 37.95241165161133, "learning_rate": 2.814647745455343e-05, "loss": 0.3542, "step": 32470 }, { "gate_value": 0.4077601730823517, "icl_sequence_length": 76, "num_contexts": 3, "step": 32470 }, { "grad_norm": 96.07141876220703, "learning_rate": 2.8074201209446185e-05, "loss": 0.3692, "step": 32480 }, { "gate_value": 0.4078661799430847, "icl_sequence_length": 80, "num_contexts": 3, "step": 32480 }, { "grad_norm": 51.653724670410156, "learning_rate": 2.80020082994727e-05, "loss": 0.3529, "step": 32490 }, { "gate_value": 0.4079096019268036, "icl_sequence_length": 80, "num_contexts": 3, "step": 32490 }, { "grad_norm": 42464.8828125, "learning_rate": 2.7929898773976155e-05, "loss": 0.3816, "step": 32500 }, { "gate_value": 0.4079228341579437, "icl_sequence_length": 78, "num_contexts": 3, "step": 32500 }, { "grad_norm": 28.320405960083008, "learning_rate": 2.7857872682242792e-05, "loss": 0.3596, "step": 32510 }, { "gate_value": 0.407973974943161, "icl_sequence_length": 82, "num_contexts": 3, "step": 32510 }, { "grad_norm": 2898.142578125, "learning_rate": 2.7785930073501684e-05, "loss": 0.3773, "step": 32520 }, { "gate_value": 0.40802299976348877, "icl_sequence_length": 84, "num_contexts": 3, "step": 32520 }, { "grad_norm": 50.13523483276367, "learning_rate": 2.771407099692496e-05, "loss": 0.3706, "step": 32530 }, { "gate_value": 0.4080987870693207, "icl_sequence_length": 74, "num_contexts": 3, "step": 32530 }, { "grad_norm": 17.791433334350586, "learning_rate": 2.7642295501627613e-05, "loss": 0.3855, "step": 32540 }, { "gate_value": 0.4081122577190399, "icl_sequence_length": 70, "num_contexts": 3, "step": 32540 }, { "grad_norm": 25.099008560180664, "learning_rate": 2.7570603636667576e-05, "loss": 0.3452, "step": 32550 }, { "gate_value": 0.4081073999404907, "icl_sequence_length": 82, "num_contexts": 3, "step": 32550 }, { "grad_norm": 56.72127914428711, "learning_rate": 2.7498995451045463e-05, "loss": 0.379, "step": 32560 }, { "gate_value": 0.40811750292778015, "icl_sequence_length": 90, "num_contexts": 3, "step": 32560 }, { "grad_norm": 65.63322448730469, "learning_rate": 2.742747099370489e-05, "loss": 0.3653, "step": 32570 }, { "gate_value": 0.40815386176109314, "icl_sequence_length": 86, "num_contexts": 3, "step": 32570 }, { "grad_norm": 123.76846313476562, "learning_rate": 2.7356030313532074e-05, "loss": 0.3637, "step": 32580 }, { "gate_value": 0.40820392966270447, "icl_sequence_length": 78, "num_contexts": 3, "step": 32580 }, { "grad_norm": 55.345069885253906, "learning_rate": 2.7284673459356067e-05, "loss": 0.3666, "step": 32590 }, { "gate_value": 0.40824827551841736, "icl_sequence_length": 72, "num_contexts": 3, "step": 32590 }, { "grad_norm": 147.07064819335938, "learning_rate": 2.7213400479948607e-05, "loss": 0.3503, "step": 32600 }, { "gate_value": 0.4082948863506317, "icl_sequence_length": 84, "num_contexts": 3, "step": 32600 }, { "grad_norm": 56.571807861328125, "learning_rate": 2.7142211424024157e-05, "loss": 0.359, "step": 32610 }, { "gate_value": 0.4083560109138489, "icl_sequence_length": 86, "num_contexts": 3, "step": 32610 }, { "grad_norm": 246.4196014404297, "learning_rate": 2.707110634023967e-05, "loss": 0.3888, "step": 32620 }, { "gate_value": 0.4084080159664154, "icl_sequence_length": 86, "num_contexts": 3, "step": 32620 }, { "grad_norm": 27.42348289489746, "learning_rate": 2.700008527719486e-05, "loss": 0.3504, "step": 32630 }, { "gate_value": 0.40847039222717285, "icl_sequence_length": 74, "num_contexts": 3, "step": 32630 }, { "grad_norm": 32.832275390625, "learning_rate": 2.692914828343194e-05, "loss": 0.3558, "step": 32640 }, { "gate_value": 0.4085494875907898, "icl_sequence_length": 60, "num_contexts": 3, "step": 32640 }, { "grad_norm": 39.22434997558594, "learning_rate": 2.685829540743572e-05, "loss": 0.3718, "step": 32650 }, { "gate_value": 0.40860962867736816, "icl_sequence_length": 88, "num_contexts": 3, "step": 32650 }, { "grad_norm": 40.54300308227539, "learning_rate": 2.6787526697633383e-05, "loss": 0.3666, "step": 32660 }, { "gate_value": 0.40863949060440063, "icl_sequence_length": 72, "num_contexts": 3, "step": 32660 }, { "grad_norm": 283.6440734863281, "learning_rate": 2.671684220239477e-05, "loss": 0.3655, "step": 32670 }, { "gate_value": 0.40866610407829285, "icl_sequence_length": 84, "num_contexts": 3, "step": 32670 }, { "grad_norm": 1028.0677490234375, "learning_rate": 2.6646241970031995e-05, "loss": 0.3598, "step": 32680 }, { "gate_value": 0.40876686573028564, "icl_sequence_length": 74, "num_contexts": 3, "step": 32680 }, { "grad_norm": 30.990869522094727, "learning_rate": 2.6575726048799667e-05, "loss": 0.3731, "step": 32690 }, { "gate_value": 0.4088298976421356, "icl_sequence_length": 72, "num_contexts": 3, "step": 32690 }, { "grad_norm": 50.804988861083984, "learning_rate": 2.6505294486894764e-05, "loss": 0.3608, "step": 32700 }, { "gate_value": 0.40888458490371704, "icl_sequence_length": 66, "num_contexts": 3, "step": 32700 }, { "grad_norm": 19.51559829711914, "learning_rate": 2.6434947332456625e-05, "loss": 0.3637, "step": 32710 }, { "gate_value": 0.408957302570343, "icl_sequence_length": 92, "num_contexts": 3, "step": 32710 }, { "grad_norm": 225.3372344970703, "learning_rate": 2.6364684633566792e-05, "loss": 0.3581, "step": 32720 }, { "gate_value": 0.4090772271156311, "icl_sequence_length": 74, "num_contexts": 3, "step": 32720 }, { "grad_norm": 4149.73046875, "learning_rate": 2.6294506438249213e-05, "loss": 0.3603, "step": 32730 }, { "gate_value": 0.40914449095726013, "icl_sequence_length": 76, "num_contexts": 3, "step": 32730 }, { "grad_norm": 13865.302734375, "learning_rate": 2.6224412794470008e-05, "loss": 0.3636, "step": 32740 }, { "gate_value": 0.40925562381744385, "icl_sequence_length": 72, "num_contexts": 3, "step": 32740 }, { "grad_norm": 102.5394287109375, "learning_rate": 2.6154403750137565e-05, "loss": 0.3523, "step": 32750 }, { "gate_value": 0.4092884957790375, "icl_sequence_length": 72, "num_contexts": 3, "step": 32750 }, { "grad_norm": 607.7672729492188, "learning_rate": 2.608447935310236e-05, "loss": 0.3604, "step": 32760 }, { "gate_value": 0.40925315022468567, "icl_sequence_length": 70, "num_contexts": 3, "step": 32760 }, { "grad_norm": 148.76336669921875, "learning_rate": 2.6014639651157032e-05, "loss": 0.3512, "step": 32770 }, { "gate_value": 0.4092670679092407, "icl_sequence_length": 80, "num_contexts": 3, "step": 32770 }, { "grad_norm": 2247.474365234375, "learning_rate": 2.5944884692036393e-05, "loss": 0.3456, "step": 32780 }, { "gate_value": 0.40928658843040466, "icl_sequence_length": 90, "num_contexts": 3, "step": 32780 }, { "grad_norm": 160.3250274658203, "learning_rate": 2.5875214523417275e-05, "loss": 0.3594, "step": 32790 }, { "gate_value": 0.4093087911605835, "icl_sequence_length": 76, "num_contexts": 3, "step": 32790 }, { "grad_norm": 45.89481735229492, "learning_rate": 2.580562919291862e-05, "loss": 0.3675, "step": 32800 }, { "gate_value": 0.4093517065048218, "icl_sequence_length": 82, "num_contexts": 3, "step": 32800 }, { "grad_norm": 57.58846664428711, "learning_rate": 2.5736128748101365e-05, "loss": 0.3757, "step": 32810 }, { "gate_value": 0.4093685448169708, "icl_sequence_length": 82, "num_contexts": 3, "step": 32810 }, { "grad_norm": 71.6854019165039, "learning_rate": 2.5666713236468344e-05, "loss": 0.3645, "step": 32820 }, { "gate_value": 0.409373015165329, "icl_sequence_length": 86, "num_contexts": 3, "step": 32820 }, { "grad_norm": 26.344486236572266, "learning_rate": 2.5597382705464425e-05, "loss": 0.3556, "step": 32830 }, { "gate_value": 0.40939274430274963, "icl_sequence_length": 88, "num_contexts": 3, "step": 32830 }, { "grad_norm": 3726.2080078125, "learning_rate": 2.5528137202476384e-05, "loss": 0.3737, "step": 32840 }, { "gate_value": 0.4094216525554657, "icl_sequence_length": 68, "num_contexts": 3, "step": 32840 }, { "grad_norm": 14.94805908203125, "learning_rate": 2.5458976774832895e-05, "loss": 0.3538, "step": 32850 }, { "gate_value": 0.40946733951568604, "icl_sequence_length": 90, "num_contexts": 3, "step": 32850 }, { "grad_norm": 43.41163635253906, "learning_rate": 2.538990146980443e-05, "loss": 0.3801, "step": 32860 }, { "gate_value": 0.4094873070716858, "icl_sequence_length": 72, "num_contexts": 3, "step": 32860 }, { "grad_norm": 67.83159637451172, "learning_rate": 2.5320911334603273e-05, "loss": 0.3557, "step": 32870 }, { "gate_value": 0.40953534841537476, "icl_sequence_length": 86, "num_contexts": 3, "step": 32870 }, { "grad_norm": 44.28196334838867, "learning_rate": 2.525200641638357e-05, "loss": 0.3759, "step": 32880 }, { "gate_value": 0.4095556437969208, "icl_sequence_length": 68, "num_contexts": 3, "step": 32880 }, { "grad_norm": 51.88390350341797, "learning_rate": 2.5183186762241163e-05, "loss": 0.3531, "step": 32890 }, { "gate_value": 0.40955135226249695, "icl_sequence_length": 70, "num_contexts": 3, "step": 32890 }, { "grad_norm": 48.15521240234375, "learning_rate": 2.5114452419213665e-05, "loss": 0.3643, "step": 32900 }, { "gate_value": 0.40955406427383423, "icl_sequence_length": 88, "num_contexts": 3, "step": 32900 }, { "grad_norm": 81.54961395263672, "learning_rate": 2.5045803434280377e-05, "loss": 0.3921, "step": 32910 }, { "gate_value": 0.4095805287361145, "icl_sequence_length": 66, "num_contexts": 3, "step": 32910 }, { "grad_norm": 29.33376121520996, "learning_rate": 2.4977239854362146e-05, "loss": 0.3564, "step": 32920 }, { "gate_value": 0.40961137413978577, "icl_sequence_length": 76, "num_contexts": 3, "step": 32920 }, { "grad_norm": 19.240026473999023, "learning_rate": 2.4908761726321592e-05, "loss": 0.3555, "step": 32930 }, { "gate_value": 0.40965530276298523, "icl_sequence_length": 70, "num_contexts": 3, "step": 32930 }, { "grad_norm": 124.72468566894531, "learning_rate": 2.4840369096962852e-05, "loss": 0.3456, "step": 32940 }, { "gate_value": 0.40971893072128296, "icl_sequence_length": 80, "num_contexts": 3, "step": 32940 }, { "grad_norm": 29.96973419189453, "learning_rate": 2.4772062013031675e-05, "loss": 0.3706, "step": 32950 }, { "gate_value": 0.4097919464111328, "icl_sequence_length": 86, "num_contexts": 3, "step": 32950 }, { "grad_norm": 97.25133514404297, "learning_rate": 2.4703840521215258e-05, "loss": 0.3733, "step": 32960 }, { "gate_value": 0.4098777770996094, "icl_sequence_length": 78, "num_contexts": 3, "step": 32960 }, { "grad_norm": 62.41849136352539, "learning_rate": 2.463570466814242e-05, "loss": 0.3477, "step": 32970 }, { "gate_value": 0.4099633991718292, "icl_sequence_length": 84, "num_contexts": 3, "step": 32970 }, { "grad_norm": 41.57551193237305, "learning_rate": 2.4567654500383276e-05, "loss": 0.3871, "step": 32980 }, { "gate_value": 0.40999799966812134, "icl_sequence_length": 92, "num_contexts": 3, "step": 32980 }, { "grad_norm": 29.298206329345703, "learning_rate": 2.4499690064449522e-05, "loss": 0.3746, "step": 32990 }, { "gate_value": 0.41000160574913025, "icl_sequence_length": 78, "num_contexts": 3, "step": 32990 }, { "grad_norm": 193.3474884033203, "learning_rate": 2.4431811406794216e-05, "loss": 0.3659, "step": 33000 } ], "logging_steps": 10, "max_steps": 40000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }