Upload 8 files

Browse files

Files changed (8) hide show

intervenable_model/config.json +54 -0
intervenable_model/intkey_layer.22.comp.block_output.unit.pos.nunit.1#0.bin +3 -0
intervenable_model/intkey_layer.24.comp.block_output.unit.pos.nunit.1#0.bin +3 -0
intervenable_model/pytorch_model.bin +3 -0
optimizer.pt +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
trainer_state.json +1633 -0

intervenable_model/config.json ADDED Viewed

	@@ -0,0 +1,54 @@

+{
+  "intervention_constant_sources": [
+    true,
+    true
+  ],
+  "intervention_dimensions": [
+    3072,
+    3072
+  ],
+  "intervention_types": [
+    "<class 'pyreft.interventions.GatedLoreftIntervention_Linear'>",
+    "<class 'pyreft.interventions.GatedLoreftIntervention_Linear'>"
+  ],
+  "mode": "parallel",
+  "representations": [
+    [
+      22,
+      "block_output",
+      "pos",
+      1,
+      4,
+      null,
+      null,
+      null,
+      null,
+      null,
+      null,
+      null,
+      null,
+      null
+    ],
+    [
+      24,
+      "block_output",
+      "pos",
+      1,
+      4,
+      null,
+      null,
+      null,
+      null,
+      null,
+      null,
+      null,
+      null,
+      null
+    ]
+  ],
+  "sorted_keys": [
+    "layer.22.comp.block_output.unit.pos.nunit.1#0",
+    "layer.24.comp.block_output.unit.pos.nunit.1#0"
+  ],
+  "transformers_version": "4.45.2"
+}

intervenable_model/intkey_layer.22.comp.block_output.unit.pos.nunit.1#0.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d0acd9690863e5ce15eb3ff18b6730a99a5a6758b23aa2cc4300d15d9ae7496b
+size 82721

intervenable_model/intkey_layer.24.comp.block_output.unit.pos.nunit.1#0.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b2c3b27973dc642900401f2edf0a3d38690e4e4a1f91f06f7e061c94b5947813
+size 82721

intervenable_model/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:049c26b844b79121ddd8379f7f69194e63f6fbf6aa007eeac0c66f17eebb8893
+size 888

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1c1e1e3a602527b2607cdcabe51786782f779d1603eaa34bfaca517eadef9792
+size 328448

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3ceda789da21d0a82a08e7169a02f149733976358427f19a0644fca78fcf4fed
+size 14244

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c64f2a890c26441da209d4d2660be5225e61c96fb16c8e91396996e345ad9e60
+size 1064

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1633 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 40.0,
+  "eval_steps": 500,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.36,
+      "forget_cf_outputs.loss": -2.683056592941284,
+      "forget_loss": 2.683056592941284,
+      "gated_loss": 0.50390625,
+      "retain_loss": 0.2806420624256134,
+      "step": 9,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 12.48087215423584,
+      "learning_rate": 0.00396,
+      "loss": 10.0439,
+      "step": 10
+    },
+    {
+      "epoch": 0.76,
+      "forget_cf_outputs.loss": -2.3281075954437256,
+      "forget_loss": 2.3281075954437256,
+      "gated_loss": 0.447265625,
+      "retain_loss": 0.30973613262176514,
+      "step": 19,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 14.701934814453125,
+      "learning_rate": 0.00392,
+      "loss": 6.7906,
+      "step": 20
+    },
+    {
+      "epoch": 1.16,
+      "forget_cf_outputs.loss": -1.9086629152297974,
+      "forget_loss": 1.9086629152297974,
+      "gated_loss": 0.12890625,
+      "retain_loss": 0.25592905282974243,
+      "step": 29,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 6.575860500335693,
+      "learning_rate": 0.0038799999999999998,
+      "loss": 5.0776,
+      "step": 30
+    },
+    {
+      "epoch": 1.56,
+      "forget_cf_outputs.loss": -1.8418117761611938,
+      "forget_loss": 1.8418117761611938,
+      "gated_loss": 0.26953125,
+      "retain_loss": 0.2785710096359253,
+      "step": 39,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 16.85633087158203,
+      "learning_rate": 0.00384,
+      "loss": 4.3983,
+      "step": 40
+    },
+    {
+      "epoch": 1.96,
+      "forget_cf_outputs.loss": -1.5592725276947021,
+      "forget_loss": 1.5592725276947021,
+      "gated_loss": 0.09228515625,
+      "retain_loss": 0.2128174901008606,
+      "step": 49,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 6.894440650939941,
+      "learning_rate": 0.0038,
+      "loss": 3.8443,
+      "step": 50
+    },
+    {
+      "epoch": 2.36,
+      "forget_cf_outputs.loss": -1.8075147867202759,
+      "forget_loss": 1.8075147867202759,
+      "gated_loss": 0.2099609375,
+      "retain_loss": 0.20431871712207794,
+      "step": 59,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 12.495519638061523,
+      "learning_rate": 0.00376,
+      "loss": 3.1611,
+      "step": 60
+    },
+    {
+      "epoch": 2.76,
+      "forget_cf_outputs.loss": -1.648992896080017,
+      "forget_loss": 1.648992896080017,
+      "gated_loss": 0.08447265625,
+      "retain_loss": 0.23505711555480957,
+      "step": 69,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 4.081427574157715,
+      "learning_rate": 0.00372,
+      "loss": 3.1274,
+      "step": 70
+    },
+    {
+      "epoch": 3.16,
+      "forget_cf_outputs.loss": -1.7901757955551147,
+      "forget_loss": 1.7901757955551147,
+      "gated_loss": 0.07861328125,
+      "retain_loss": 0.19908858835697174,
+      "step": 79,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 3.2,
+      "grad_norm": 5.019917011260986,
+      "learning_rate": 0.00368,
+      "loss": 2.9064,
+      "step": 80
+    },
+    {
+      "epoch": 3.56,
+      "forget_cf_outputs.loss": -1.6832540035247803,
+      "forget_loss": 1.6832540035247803,
+      "gated_loss": 0.1572265625,
+      "retain_loss": 0.31473198533058167,
+      "step": 89,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 3.6,
+      "grad_norm": 25.354310989379883,
+      "learning_rate": 0.00364,
+      "loss": 2.7559,
+      "step": 90
+    },
+    {
+      "epoch": 3.96,
+      "forget_cf_outputs.loss": -1.5529592037200928,
+      "forget_loss": 1.5529592037200928,
+      "gated_loss": 0.0859375,
+      "retain_loss": 0.21396659314632416,
+      "step": 99,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 13.010091781616211,
+      "learning_rate": 0.0036000000000000003,
+      "loss": 4.6507,
+      "step": 100
+    },
+    {
+      "epoch": 4.36,
+      "forget_cf_outputs.loss": -1.4749836921691895,
+      "forget_loss": 1.4749836921691895,
+      "gated_loss": 0.0751953125,
+      "retain_loss": 0.3322639465332031,
+      "step": 109,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 4.4,
+      "grad_norm": 19.40593147277832,
+      "learning_rate": 0.0035600000000000002,
+      "loss": 2.4806,
+      "step": 110
+    },
+    {
+      "epoch": 4.76,
+      "forget_cf_outputs.loss": -1.586808443069458,
+      "forget_loss": 1.586808443069458,
+      "gated_loss": 0.044677734375,
+      "retain_loss": 0.20972619950771332,
+      "step": 119,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 4.8,
+      "grad_norm": 3.9136083126068115,
+      "learning_rate": 0.00352,
+      "loss": 2.3875,
+      "step": 120
+    },
+    {
+      "epoch": 5.16,
+      "forget_cf_outputs.loss": -1.3828651905059814,
+      "forget_loss": 1.3828651905059814,
+      "gated_loss": 0.0308837890625,
+      "retain_loss": 0.2637632191181183,
+      "step": 129,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 5.2,
+      "grad_norm": 2.676300525665283,
+      "learning_rate": 0.00348,
+      "loss": 2.3602,
+      "step": 130
+    },
+    {
+      "epoch": 5.5600000000000005,
+      "forget_cf_outputs.loss": -1.353875756263733,
+      "forget_loss": 1.353875756263733,
+      "gated_loss": 0.03076171875,
+      "retain_loss": 0.2234891653060913,
+      "step": 139,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 5.6,
+      "grad_norm": 2.6358110904693604,
+      "learning_rate": 0.00344,
+      "loss": 2.14,
+      "step": 140
+    },
+    {
+      "epoch": 5.96,
+      "forget_cf_outputs.loss": -1.4662984609603882,
+      "forget_loss": 1.4662984609603882,
+      "gated_loss": 0.044921875,
+      "retain_loss": 0.22434721887111664,
+      "step": 149,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 6.0,
+      "grad_norm": 5.245201587677002,
+      "learning_rate": 0.0034,
+      "loss": 3.8186,
+      "step": 150
+    },
+    {
+      "epoch": 6.36,
+      "forget_cf_outputs.loss": -1.3115544319152832,
+      "forget_loss": 1.3115544319152832,
+      "gated_loss": 0.0208740234375,
+      "retain_loss": 0.27212581038475037,
+      "step": 159,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 6.4,
+      "grad_norm": 2.41806697845459,
+      "learning_rate": 0.00336,
+      "loss": 2.0658,
+      "step": 160
+    },
+    {
+      "epoch": 6.76,
+      "forget_cf_outputs.loss": -1.389865517616272,
+      "forget_loss": 1.389865517616272,
+      "gated_loss": 0.0157470703125,
+      "retain_loss": 0.22109903395175934,
+      "step": 169,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 6.8,
+      "grad_norm": 1.6761163473129272,
+      "learning_rate": 0.00332,
+      "loss": 2.0436,
+      "step": 170
+    },
+    {
+      "epoch": 7.16,
+      "forget_cf_outputs.loss": -1.4114540815353394,
+      "forget_loss": 1.4114540815353394,
+      "gated_loss": 0.01953125,
+      "retain_loss": 0.28868579864501953,
+      "step": 179,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 7.2,
+      "grad_norm": 2.110581159591675,
+      "learning_rate": 0.00328,
+      "loss": 1.9892,
+      "step": 180
+    },
+    {
+      "epoch": 7.5600000000000005,
+      "forget_cf_outputs.loss": -1.5645467042922974,
+      "forget_loss": 1.5645467042922974,
+      "gated_loss": 0.0135498046875,
+      "retain_loss": 0.18681125342845917,
+      "step": 189,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 7.6,
+      "grad_norm": 2.8026249408721924,
+      "learning_rate": 0.0032400000000000003,
+      "loss": 1.9509,
+      "step": 190
+    },
+    {
+      "epoch": 7.96,
+      "forget_cf_outputs.loss": -1.368323802947998,
+      "forget_loss": 1.368323802947998,
+      "gated_loss": 0.0142822265625,
+      "retain_loss": 0.23697978258132935,
+      "step": 199,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 8.0,
+      "grad_norm": 1.9209412336349487,
+      "learning_rate": 0.0032,
+      "loss": 1.8948,
+      "step": 200
+    },
+    {
+      "epoch": 8.36,
+      "forget_cf_outputs.loss": -1.199570894241333,
+      "forget_loss": 1.199570894241333,
+      "gated_loss": 0.00933837890625,
+      "retain_loss": 0.25221553444862366,
+      "step": 209,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 8.4,
+      "grad_norm": 1.5988783836364746,
+      "learning_rate": 0.00316,
+      "loss": 1.8466,
+      "step": 210
+    },
+    {
+      "epoch": 8.76,
+      "forget_cf_outputs.loss": -1.2841148376464844,
+      "forget_loss": 1.2841148376464844,
+      "gated_loss": 0.009765625,
+      "retain_loss": 0.24307847023010254,
+      "step": 219,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 8.8,
+      "grad_norm": 1.7333821058273315,
+      "learning_rate": 0.0031200000000000004,
+      "loss": 1.817,
+      "step": 220
+    },
+    {
+      "epoch": 9.16,
+      "forget_cf_outputs.loss": -1.0533504486083984,
+      "forget_loss": 1.0533504486083984,
+      "gated_loss": 0.005462646484375,
+      "retain_loss": 0.22837677597999573,
+      "step": 229,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 9.2,
+      "grad_norm": 1.5669026374816895,
+      "learning_rate": 0.0030800000000000003,
+      "loss": 1.7654,
+      "step": 230
+    },
+    {
+      "epoch": 9.56,
+      "forget_cf_outputs.loss": -1.2518203258514404,
+      "forget_loss": 1.2518203258514404,
+      "gated_loss": 0.0107421875,
+      "retain_loss": 0.27540236711502075,
+      "step": 239,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 9.6,
+      "grad_norm": 1.7850066423416138,
+      "learning_rate": 0.00304,
+      "loss": 1.7462,
+      "step": 240
+    },
+    {
+      "epoch": 9.96,
+      "forget_cf_outputs.loss": -1.2480342388153076,
+      "forget_loss": 1.2480342388153076,
+      "gated_loss": 0.0079345703125,
+      "retain_loss": 0.2164781242609024,
+      "step": 249,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 1.8285338878631592,
+      "learning_rate": 0.003,
+      "loss": 1.8334,
+      "step": 250
+    },
+    {
+      "epoch": 10.36,
+      "forget_cf_outputs.loss": -1.14595365524292,
+      "forget_loss": 1.14595365524292,
+      "gated_loss": 0.006072998046875,
+      "retain_loss": 0.22983184456825256,
+      "step": 259,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 10.4,
+      "grad_norm": 1.8580724000930786,
+      "learning_rate": 0.00296,
+      "loss": 1.6553,
+      "step": 260
+    },
+    {
+      "epoch": 10.76,
+      "forget_cf_outputs.loss": -1.2968733310699463,
+      "forget_loss": 1.2968733310699463,
+      "gated_loss": 0.008544921875,
+      "retain_loss": 0.20906659960746765,
+      "step": 269,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 10.8,
+      "grad_norm": 1.6227186918258667,
+      "learning_rate": 0.00292,
+      "loss": 1.7339,
+      "step": 270
+    },
+    {
+      "epoch": 11.16,
+      "forget_cf_outputs.loss": -1.21259605884552,
+      "forget_loss": 1.21259605884552,
+      "gated_loss": 0.00750732421875,
+      "retain_loss": 0.23951691389083862,
+      "step": 279,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 11.2,
+      "grad_norm": 1.3491921424865723,
+      "learning_rate": 0.0028799999999999997,
+      "loss": 1.6642,
+      "step": 280
+    },
+    {
+      "epoch": 11.56,
+      "forget_cf_outputs.loss": -1.203460454940796,
+      "forget_loss": 1.203460454940796,
+      "gated_loss": 0.0087890625,
+      "retain_loss": 0.23031194508075714,
+      "step": 289,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 11.6,
+      "grad_norm": 2.027022123336792,
+      "learning_rate": 0.00284,
+      "loss": 1.6413,
+      "step": 290
+    },
+    {
+      "epoch": 11.96,
+      "forget_cf_outputs.loss": -1.1864365339279175,
+      "forget_loss": 1.1864365339279175,
+      "gated_loss": 0.01043701171875,
+      "retain_loss": 0.23127436637878418,
+      "step": 299,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 12.0,
+      "grad_norm": 1.8040319681167603,
+      "learning_rate": 0.0028,
+      "loss": 1.761,
+      "step": 300
+    },
+    {
+      "epoch": 12.36,
+      "forget_cf_outputs.loss": -1.227767825126648,
+      "forget_loss": 1.227767825126648,
+      "gated_loss": 0.00616455078125,
+      "retain_loss": 0.22675465047359467,
+      "step": 309,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 12.4,
+      "grad_norm": 2.1620850563049316,
+      "learning_rate": 0.00276,
+      "loss": 1.5947,
+      "step": 310
+    },
+    {
+      "epoch": 12.76,
+      "forget_cf_outputs.loss": -1.0549200773239136,
+      "forget_loss": 1.0549200773239136,
+      "gated_loss": 0.0054931640625,
+      "retain_loss": 0.225913867354393,
+      "step": 319,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 12.8,
+      "grad_norm": 1.422467827796936,
+      "learning_rate": 0.00272,
+      "loss": 1.6839,
+      "step": 320
+    },
+    {
+      "epoch": 13.16,
+      "forget_cf_outputs.loss": -0.966583251953125,
+      "forget_loss": 0.966583251953125,
+      "gated_loss": 0.0050048828125,
+      "retain_loss": 0.21935530006885529,
+      "step": 329,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 13.2,
+      "grad_norm": 1.3767800331115723,
+      "learning_rate": 0.00268,
+      "loss": 1.6052,
+      "step": 330
+    },
+    {
+      "epoch": 13.56,
+      "forget_cf_outputs.loss": -1.0747449398040771,
+      "forget_loss": 1.0747449398040771,
+      "gated_loss": 0.008544921875,
+      "retain_loss": 0.23962758481502533,
+      "step": 339,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 13.6,
+      "grad_norm": 2.152151584625244,
+      "learning_rate": 0.00264,
+      "loss": 1.6239,
+      "step": 340
+    },
+    {
+      "epoch": 13.96,
+      "forget_cf_outputs.loss": -0.9583653211593628,
+      "forget_loss": 0.9583653211593628,
+      "gated_loss": 0.006500244140625,
+      "retain_loss": 0.21241800487041473,
+      "step": 349,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 14.0,
+      "grad_norm": 2.0913257598876953,
+      "learning_rate": 0.0026000000000000003,
+      "loss": 1.566,
+      "step": 350
+    },
+    {
+      "epoch": 14.36,
+      "forget_cf_outputs.loss": -0.8475239276885986,
+      "forget_loss": 0.8475239276885986,
+      "gated_loss": 0.01165771484375,
+      "retain_loss": 0.22438839077949524,
+      "step": 359,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 14.4,
+      "grad_norm": 1.8125321865081787,
+      "learning_rate": 0.00256,
+      "loss": 1.52,
+      "step": 360
+    },
+    {
+      "epoch": 14.76,
+      "forget_cf_outputs.loss": -1.2125965356826782,
+      "forget_loss": 1.2125965356826782,
+      "gated_loss": 0.004974365234375,
+      "retain_loss": 0.23040254414081573,
+      "step": 369,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 14.8,
+      "grad_norm": 1.811591386795044,
+      "learning_rate": 0.00252,
+      "loss": 1.5403,
+      "step": 370
+    },
+    {
+      "epoch": 15.16,
+      "forget_cf_outputs.loss": -0.9788862466812134,
+      "forget_loss": 0.9788862466812134,
+      "gated_loss": 0.006439208984375,
+      "retain_loss": 0.24656128883361816,
+      "step": 379,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 15.2,
+      "grad_norm": 1.5504097938537598,
+      "learning_rate": 0.00248,
+      "loss": 1.5363,
+      "step": 380
+    },
+    {
+      "epoch": 15.56,
+      "forget_cf_outputs.loss": -0.8573880195617676,
+      "forget_loss": 0.8573880195617676,
+      "gated_loss": 0.007080078125,
+      "retain_loss": 0.2557204067707062,
+      "step": 389,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 15.6,
+      "grad_norm": 1.5796666145324707,
+      "learning_rate": 0.00244,
+      "loss": 1.4844,
+      "step": 390
+    },
+    {
+      "epoch": 15.96,
+      "forget_cf_outputs.loss": -1.0210211277008057,
+      "forget_loss": 1.0210211277008057,
+      "gated_loss": 0.007080078125,
+      "retain_loss": 0.22587800025939941,
+      "step": 399,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 16.0,
+      "grad_norm": 1.902213215827942,
+      "learning_rate": 0.0024,
+      "loss": 1.5053,
+      "step": 400
+    },
+    {
+      "epoch": 16.36,
+      "forget_cf_outputs.loss": -0.7990767955780029,
+      "forget_loss": 0.7990767955780029,
+      "gated_loss": 0.0064697265625,
+      "retain_loss": 0.20731617510318756,
+      "step": 409,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 16.4,
+      "grad_norm": 1.7532883882522583,
+      "learning_rate": 0.00236,
+      "loss": 1.3702,
+      "step": 410
+    },
+    {
+      "epoch": 16.76,
+      "forget_cf_outputs.loss": -1.08427095413208,
+      "forget_loss": 1.08427095413208,
+      "gated_loss": 0.005950927734375,
+      "retain_loss": 0.23056094348430634,
+      "step": 419,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 16.8,
+      "grad_norm": 1.8573689460754395,
+      "learning_rate": 0.00232,
+      "loss": 1.4936,
+      "step": 420
+    },
+    {
+      "epoch": 17.16,
+      "forget_cf_outputs.loss": -0.8196872472763062,
+      "forget_loss": 0.8196872472763062,
+      "gated_loss": 0.006805419921875,
+      "retain_loss": 0.3019656836986542,
+      "step": 429,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 17.2,
+      "grad_norm": 1.6391781568527222,
+      "learning_rate": 0.00228,
+      "loss": 1.5101,
+      "step": 430
+    },
+    {
+      "epoch": 17.56,
+      "forget_cf_outputs.loss": -0.8601583242416382,
+      "forget_loss": 0.8601583242416382,
+      "gated_loss": 0.00677490234375,
+      "retain_loss": 0.30145329236984253,
+      "step": 439,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 17.6,
+      "grad_norm": 1.7495087385177612,
+      "learning_rate": 0.0022400000000000002,
+      "loss": 1.4067,
+      "step": 440
+    },
+    {
+      "epoch": 17.96,
+      "forget_cf_outputs.loss": -0.9261890053749084,
+      "forget_loss": 0.9261890053749084,
+      "gated_loss": 0.007110595703125,
+      "retain_loss": 0.2040254771709442,
+      "step": 449,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 18.0,
+      "grad_norm": 2.1609787940979004,
+      "learning_rate": 0.0022,
+      "loss": 1.3843,
+      "step": 450
+    },
+    {
+      "epoch": 18.36,
+      "forget_cf_outputs.loss": -0.7474625706672668,
+      "forget_loss": 0.7474625706672668,
+      "gated_loss": 0.006103515625,
+      "retain_loss": 0.20823714137077332,
+      "step": 459,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 18.4,
+      "grad_norm": 1.997226595878601,
+      "learning_rate": 0.00216,
+      "loss": 1.3081,
+      "step": 460
+    },
+    {
+      "epoch": 18.76,
+      "forget_cf_outputs.loss": -1.0060858726501465,
+      "forget_loss": 1.0060858726501465,
+      "gated_loss": 0.00579833984375,
+      "retain_loss": 0.23037730157375336,
+      "step": 469,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 18.8,
+      "grad_norm": 2.023531675338745,
+      "learning_rate": 0.0021200000000000004,
+      "loss": 1.4162,
+      "step": 470
+    },
+    {
+      "epoch": 19.16,
+      "forget_cf_outputs.loss": -0.7942442893981934,
+      "forget_loss": 0.7942442893981934,
+      "gated_loss": 0.00677490234375,
+      "retain_loss": 0.30291858315467834,
+      "step": 479,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 19.2,
+      "grad_norm": 2.0801267623901367,
+      "learning_rate": 0.0020800000000000003,
+      "loss": 1.4455,
+      "step": 480
+    },
+    {
+      "epoch": 19.56,
+      "forget_cf_outputs.loss": -0.7663432955741882,
+      "forget_loss": 0.7663432955741882,
+      "gated_loss": 0.00640869140625,
+      "retain_loss": 0.3002184331417084,
+      "step": 489,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 19.6,
+      "grad_norm": 1.6843624114990234,
+      "learning_rate": 0.00204,
+      "loss": 1.3348,
+      "step": 490
+    },
+    {
+      "epoch": 19.96,
+      "forget_cf_outputs.loss": -0.8520928025245667,
+      "forget_loss": 0.8520928025245667,
+      "gated_loss": 0.006591796875,
+      "retain_loss": 0.20439262688159943,
+      "step": 499,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 20.0,
+      "grad_norm": 2.177591323852539,
+      "learning_rate": 0.002,
+      "loss": 1.3115,
+      "step": 500
+    },
+    {
+      "epoch": 20.36,
+      "forget_cf_outputs.loss": -0.7113033533096313,
+      "forget_loss": 0.7113033533096313,
+      "gated_loss": 0.0062255859375,
+      "retain_loss": 0.20779718458652496,
+      "step": 509,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 20.4,
+      "grad_norm": 2.2269105911254883,
+      "learning_rate": 0.00196,
+      "loss": 1.2523,
+      "step": 510
+    },
+    {
+      "epoch": 20.76,
+      "forget_cf_outputs.loss": -0.9132112860679626,
+      "forget_loss": 0.9132112860679626,
+      "gated_loss": 0.005615234375,
+      "retain_loss": 0.23009441792964935,
+      "step": 519,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 20.8,
+      "grad_norm": 1.955623745918274,
+      "learning_rate": 0.00192,
+      "loss": 1.3478,
+      "step": 520
+    },
+    {
+      "epoch": 21.16,
+      "forget_cf_outputs.loss": -0.7705625891685486,
+      "forget_loss": 0.7705625891685486,
+      "gated_loss": 0.00677490234375,
+      "retain_loss": 0.30292266607284546,
+      "step": 529,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 21.2,
+      "grad_norm": 2.0390868186950684,
+      "learning_rate": 0.00188,
+      "loss": 1.3883,
+      "step": 530
+    },
+    {
+      "epoch": 21.56,
+      "forget_cf_outputs.loss": -0.7298972606658936,
+      "forget_loss": 0.7298972606658936,
+      "gated_loss": 0.006195068359375,
+      "retain_loss": 0.3011211156845093,
+      "step": 539,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 21.6,
+      "grad_norm": 1.9718871116638184,
+      "learning_rate": 0.00184,
+      "loss": 1.2773,
+      "step": 540
+    },
+    {
+      "epoch": 21.96,
+      "forget_cf_outputs.loss": -0.7989807724952698,
+      "forget_loss": 0.7989807724952698,
+      "gated_loss": 0.00640869140625,
+      "retain_loss": 0.20371052622795105,
+      "step": 549,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 22.0,
+      "grad_norm": 2.304124116897583,
+      "learning_rate": 0.0018000000000000002,
+      "loss": 1.2614,
+      "step": 550
+    },
+    {
+      "epoch": 22.36,
+      "forget_cf_outputs.loss": -0.6765010952949524,
+      "forget_loss": 0.6765010952949524,
+      "gated_loss": 0.006134033203125,
+      "retain_loss": 0.20700086653232574,
+      "step": 559,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 22.4,
+      "grad_norm": 2.3407692909240723,
+      "learning_rate": 0.00176,
+      "loss": 1.2052,
+      "step": 560
+    },
+    {
+      "epoch": 22.76,
+      "forget_cf_outputs.loss": -0.8482251167297363,
+      "forget_loss": 0.8482251167297363,
+      "gated_loss": 0.00555419921875,
+      "retain_loss": 0.23026637732982635,
+      "step": 569,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 22.8,
+      "grad_norm": 1.9227235317230225,
+      "learning_rate": 0.00172,
+      "loss": 1.2995,
+      "step": 570
+    },
+    {
+      "epoch": 23.16,
+      "forget_cf_outputs.loss": -0.7452784776687622,
+      "forget_loss": 0.7452784776687622,
+      "gated_loss": 0.006744384765625,
+      "retain_loss": 0.30324649810791016,
+      "step": 579,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 23.2,
+      "grad_norm": 2.1659557819366455,
+      "learning_rate": 0.00168,
+      "loss": 1.3367,
+      "step": 580
+    },
+    {
+      "epoch": 23.56,
+      "forget_cf_outputs.loss": -0.6973183155059814,
+      "forget_loss": 0.6973183155059814,
+      "gated_loss": 0.006011962890625,
+      "retain_loss": 0.30067041516304016,
+      "step": 589,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 23.6,
+      "grad_norm": 2.140836238861084,
+      "learning_rate": 0.00164,
+      "loss": 1.2336,
+      "step": 590
+    },
+    {
+      "epoch": 23.96,
+      "forget_cf_outputs.loss": -0.7499862313270569,
+      "forget_loss": 0.7499862313270569,
+      "gated_loss": 0.006317138671875,
+      "retain_loss": 0.20415130257606506,
+      "step": 599,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 24.0,
+      "grad_norm": 2.5653772354125977,
+      "learning_rate": 0.0016,
+      "loss": 1.2199,
+      "step": 600
+    },
+    {
+      "epoch": 24.36,
+      "forget_cf_outputs.loss": -0.6281754374504089,
+      "forget_loss": 0.6281754374504089,
+      "gated_loss": 0.006103515625,
+      "retain_loss": 0.2073870748281479,
+      "step": 609,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 24.4,
+      "grad_norm": 2.2012555599212646,
+      "learning_rate": 0.0015600000000000002,
+      "loss": 1.1692,
+      "step": 610
+    },
+    {
+      "epoch": 24.76,
+      "forget_cf_outputs.loss": -0.7929825782775879,
+      "forget_loss": 0.7929825782775879,
+      "gated_loss": 0.005584716796875,
+      "retain_loss": 0.23124848306179047,
+      "step": 619,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 24.8,
+      "grad_norm": 1.9918410778045654,
+      "learning_rate": 0.00152,
+      "loss": 1.2547,
+      "step": 620
+    },
+    {
+      "epoch": 25.16,
+      "forget_cf_outputs.loss": -0.7136563062667847,
+      "forget_loss": 0.7136563062667847,
+      "gated_loss": 0.00677490234375,
+      "retain_loss": 0.3019442558288574,
+      "step": 629,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 25.2,
+      "grad_norm": 2.17232608795166,
+      "learning_rate": 0.00148,
+      "loss": 1.2906,
+      "step": 630
+    },
+    {
+      "epoch": 25.56,
+      "forget_cf_outputs.loss": -0.6733591556549072,
+      "forget_loss": 0.6733591556549072,
+      "gated_loss": 0.00604248046875,
+      "retain_loss": 0.3010113835334778,
+      "step": 639,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 25.6,
+      "grad_norm": 2.1968352794647217,
+      "learning_rate": 0.0014399999999999999,
+      "loss": 1.198,
+      "step": 640
+    },
+    {
+      "epoch": 25.96,
+      "forget_cf_outputs.loss": -0.6898148655891418,
+      "forget_loss": 0.6898148655891418,
+      "gated_loss": 0.00628662109375,
+      "retain_loss": 0.20451340079307556,
+      "step": 649,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 26.0,
+      "grad_norm": 2.708749294281006,
+      "learning_rate": 0.0014,
+      "loss": 1.1784,
+      "step": 650
+    },
+    {
+      "epoch": 26.36,
+      "forget_cf_outputs.loss": -0.5750948786735535,
+      "forget_loss": 0.5750948786735535,
+      "gated_loss": 0.006011962890625,
+      "retain_loss": 0.20745757222175598,
+      "step": 659,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 26.4,
+      "grad_norm": 2.0222957134246826,
+      "learning_rate": 0.00136,
+      "loss": 1.1326,
+      "step": 660
+    },
+    {
+      "epoch": 26.76,
+      "forget_cf_outputs.loss": -0.7313442826271057,
+      "forget_loss": 0.7313442826271057,
+      "gated_loss": 0.005523681640625,
+      "retain_loss": 0.2309209704399109,
+      "step": 669,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 26.8,
+      "grad_norm": 1.8242149353027344,
+      "learning_rate": 0.00132,
+      "loss": 1.2022,
+      "step": 670
+    },
+    {
+      "epoch": 27.16,
+      "forget_cf_outputs.loss": -0.6695391535758972,
+      "forget_loss": 0.6695391535758972,
+      "gated_loss": 0.006744384765625,
+      "retain_loss": 0.30264630913734436,
+      "step": 679,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 27.2,
+      "grad_norm": 2.184037208557129,
+      "learning_rate": 0.00128,
+      "loss": 1.24,
+      "step": 680
+    },
+    {
+      "epoch": 27.56,
+      "forget_cf_outputs.loss": -0.6346314549446106,
+      "forget_loss": 0.6346314549446106,
+      "gated_loss": 0.005950927734375,
+      "retain_loss": 0.3012670874595642,
+      "step": 689,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 27.6,
+      "grad_norm": 2.071834087371826,
+      "learning_rate": 0.00124,
+      "loss": 1.1582,
+      "step": 690
+    },
+    {
+      "epoch": 27.96,
+      "forget_cf_outputs.loss": -0.6438873410224915,
+      "forget_loss": 0.6438873410224915,
+      "gated_loss": 0.00634765625,
+      "retain_loss": 0.20389188826084137,
+      "step": 699,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 28.0,
+      "grad_norm": 2.627547264099121,
+      "learning_rate": 0.0012,
+      "loss": 1.1426,
+      "step": 700
+    },
+    {
+      "epoch": 28.36,
+      "forget_cf_outputs.loss": -0.5409132242202759,
+      "forget_loss": 0.5409132242202759,
+      "gated_loss": 0.00604248046875,
+      "retain_loss": 0.20821160078048706,
+      "step": 709,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 28.4,
+      "grad_norm": 1.8756951093673706,
+      "learning_rate": 0.00116,
+      "loss": 1.0961,
+      "step": 710
+    },
+    {
+      "epoch": 28.76,
+      "forget_cf_outputs.loss": -0.6844155788421631,
+      "forget_loss": 0.6844155788421631,
+      "gated_loss": 0.005523681640625,
+      "retain_loss": 0.23081088066101074,
+      "step": 719,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 28.8,
+      "grad_norm": 1.882070541381836,
+      "learning_rate": 0.0011200000000000001,
+      "loss": 1.1636,
+      "step": 720
+    },
+    {
+      "epoch": 29.16,
+      "forget_cf_outputs.loss": -0.6462154388427734,
+      "forget_loss": 0.6462154388427734,
+      "gated_loss": 0.0068359375,
+      "retain_loss": 0.30321410298347473,
+      "step": 729,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 29.2,
+      "grad_norm": 2.2409791946411133,
+      "learning_rate": 0.00108,
+      "loss": 1.1954,
+      "step": 730
+    },
+    {
+      "epoch": 29.56,
+      "forget_cf_outputs.loss": -0.5779778957366943,
+      "forget_loss": 0.5779778957366943,
+      "gated_loss": 0.005828857421875,
+      "retain_loss": 0.301031231880188,
+      "step": 739,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 29.6,
+      "grad_norm": 1.9507259130477905,
+      "learning_rate": 0.0010400000000000001,
+      "loss": 1.1284,
+      "step": 740
+    },
+    {
+      "epoch": 29.96,
+      "forget_cf_outputs.loss": -0.595399022102356,
+      "forget_loss": 0.595399022102356,
+      "gated_loss": 0.00628662109375,
+      "retain_loss": 0.20331645011901855,
+      "step": 749,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 30.0,
+      "grad_norm": 2.5469954013824463,
+      "learning_rate": 0.001,
+      "loss": 1.1074,
+      "step": 750
+    },
+    {
+      "epoch": 30.36,
+      "forget_cf_outputs.loss": -0.5063520669937134,
+      "forget_loss": 0.5063520669937134,
+      "gated_loss": 0.0059814453125,
+      "retain_loss": 0.20728009939193726,
+      "step": 759,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 30.4,
+      "grad_norm": 1.9551663398742676,
+      "learning_rate": 0.00096,
+      "loss": 1.0691,
+      "step": 760
+    },
+    {
+      "epoch": 30.76,
+      "forget_cf_outputs.loss": -0.6612439751625061,
+      "forget_loss": 0.6612439751625061,
+      "gated_loss": 0.005523681640625,
+      "retain_loss": 0.23078040778636932,
+      "step": 769,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 30.8,
+      "grad_norm": 2.112478494644165,
+      "learning_rate": 0.00092,
+      "loss": 1.1296,
+      "step": 770
+    },
+    {
+      "epoch": 31.16,
+      "forget_cf_outputs.loss": -0.6047573685646057,
+      "forget_loss": 0.6047573685646057,
+      "gated_loss": 0.00677490234375,
+      "retain_loss": 0.3026810586452484,
+      "step": 779,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 31.2,
+      "grad_norm": 2.141299247741699,
+      "learning_rate": 0.00088,
+      "loss": 1.1566,
+      "step": 780
+    },
+    {
+      "epoch": 31.56,
+      "forget_cf_outputs.loss": -0.5377554297447205,
+      "forget_loss": 0.5377554297447205,
+      "gated_loss": 0.005828857421875,
+      "retain_loss": 0.30064091086387634,
+      "step": 789,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 31.6,
+      "grad_norm": 1.9649981260299683,
+      "learning_rate": 0.00084,
+      "loss": 1.097,
+      "step": 790
+    },
+    {
+      "epoch": 31.96,
+      "forget_cf_outputs.loss": -0.5527829527854919,
+      "forget_loss": 0.5527829527854919,
+      "gated_loss": 0.0062255859375,
+      "retain_loss": 0.20365992188453674,
+      "step": 799,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 32.0,
+      "grad_norm": 2.628004312515259,
+      "learning_rate": 0.0008,
+      "loss": 1.0695,
+      "step": 800
+    },
+    {
+      "epoch": 32.36,
+      "forget_cf_outputs.loss": -0.47989267110824585,
+      "forget_loss": 0.47989267110824585,
+      "gated_loss": 0.005950927734375,
+      "retain_loss": 0.20727092027664185,
+      "step": 809,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 32.4,
+      "grad_norm": 1.9921866655349731,
+      "learning_rate": 0.00076,
+      "loss": 1.0319,
+      "step": 810
+    },
+    {
+      "epoch": 32.76,
+      "forget_cf_outputs.loss": -0.6170799136161804,
+      "forget_loss": 0.6170799136161804,
+      "gated_loss": 0.00555419921875,
+      "retain_loss": 0.23075489699840546,
+      "step": 819,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 32.8,
+      "grad_norm": 2.0169899463653564,
+      "learning_rate": 0.0007199999999999999,
+      "loss": 1.091,
+      "step": 820
+    },
+    {
+      "epoch": 33.16,
+      "forget_cf_outputs.loss": -0.5823113322257996,
+      "forget_loss": 0.5823113322257996,
+      "gated_loss": 0.0068359375,
+      "retain_loss": 0.30245938897132874,
+      "step": 829,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 33.2,
+      "grad_norm": 2.2369046211242676,
+      "learning_rate": 0.00068,
+      "loss": 1.126,
+      "step": 830
+    },
+    {
+      "epoch": 33.56,
+      "forget_cf_outputs.loss": -0.5231561660766602,
+      "forget_loss": 0.5231561660766602,
+      "gated_loss": 0.005767822265625,
+      "retain_loss": 0.3002181351184845,
+      "step": 839,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 33.6,
+      "grad_norm": 2.4003753662109375,
+      "learning_rate": 0.00064,
+      "loss": 1.0696,
+      "step": 840
+    },
+    {
+      "epoch": 33.96,
+      "forget_cf_outputs.loss": -0.4980463981628418,
+      "forget_loss": 0.4980463981628418,
+      "gated_loss": 0.0062255859375,
+      "retain_loss": 0.20359160006046295,
+      "step": 849,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 34.0,
+      "grad_norm": 2.4804441928863525,
+      "learning_rate": 0.0006,
+      "loss": 1.0311,
+      "step": 850
+    },
+    {
+      "epoch": 34.36,
+      "forget_cf_outputs.loss": -0.4477725327014923,
+      "forget_loss": 0.4477725327014923,
+      "gated_loss": 0.00592041015625,
+      "retain_loss": 0.20766329765319824,
+      "step": 859,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 34.4,
+      "grad_norm": 1.8642009496688843,
+      "learning_rate": 0.0005600000000000001,
+      "loss": 0.9964,
+      "step": 860
+    },
+    {
+      "epoch": 34.76,
+      "forget_cf_outputs.loss": -0.5641895532608032,
+      "forget_loss": 0.5641895532608032,
+      "gated_loss": 0.00555419921875,
+      "retain_loss": 0.23121777176856995,
+      "step": 869,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 34.8,
+      "grad_norm": 1.8299639225006104,
+      "learning_rate": 0.0005200000000000001,
+      "loss": 1.055,
+      "step": 870
+    },
+    {
+      "epoch": 35.16,
+      "forget_cf_outputs.loss": -0.5485031604766846,
+      "forget_loss": 0.5485031604766846,
+      "gated_loss": 0.006805419921875,
+      "retain_loss": 0.3044523596763611,
+      "step": 879,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 35.2,
+      "grad_norm": 2.0533902645111084,
+      "learning_rate": 0.00048,
+      "loss": 1.0938,
+      "step": 880
+    },
+    {
+      "epoch": 35.56,
+      "forget_cf_outputs.loss": -0.5070799589157104,
+      "forget_loss": 0.5070799589157104,
+      "gated_loss": 0.005828857421875,
+      "retain_loss": 0.3008805215358734,
+      "step": 889,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 35.6,
+      "grad_norm": 2.342500925064087,
+      "learning_rate": 0.00044,
+      "loss": 1.0375,
+      "step": 890
+    },
+    {
+      "epoch": 35.96,
+      "forget_cf_outputs.loss": -0.4470018446445465,
+      "forget_loss": 0.4470018446445465,
+      "gated_loss": 0.0062255859375,
+      "retain_loss": 0.20359660685062408,
+      "step": 899,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 36.0,
+      "grad_norm": 2.555351734161377,
+      "learning_rate": 0.0004,
+      "loss": 0.9953,
+      "step": 900
+    },
+    {
+      "epoch": 36.36,
+      "forget_cf_outputs.loss": -0.42907437682151794,
+      "forget_loss": 0.42907437682151794,
+      "gated_loss": 0.005950927734375,
+      "retain_loss": 0.20733724534511566,
+      "step": 909,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 36.4,
+      "grad_norm": 1.745898962020874,
+      "learning_rate": 0.00035999999999999997,
+      "loss": 0.9637,
+      "step": 910
+    },
+    {
+      "epoch": 36.76,
+      "forget_cf_outputs.loss": -0.5278509855270386,
+      "forget_loss": 0.5278509855270386,
+      "gated_loss": 0.00555419921875,
+      "retain_loss": 0.23086762428283691,
+      "step": 919,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 36.8,
+      "grad_norm": 1.8470394611358643,
+      "learning_rate": 0.00032,
+      "loss": 1.0207,
+      "step": 920
+    },
+    {
+      "epoch": 37.16,
+      "forget_cf_outputs.loss": -0.5195350646972656,
+      "forget_loss": 0.5195350646972656,
+      "gated_loss": 0.0068359375,
+      "retain_loss": 0.3039552867412567,
+      "step": 929,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 37.2,
+      "grad_norm": 2.0206496715545654,
+      "learning_rate": 0.00028000000000000003,
+      "loss": 1.0645,
+      "step": 930
+    },
+    {
+      "epoch": 37.56,
+      "forget_cf_outputs.loss": -0.4906051754951477,
+      "forget_loss": 0.4906051754951477,
+      "gated_loss": 0.005828857421875,
+      "retain_loss": 0.3013906478881836,
+      "step": 939,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 37.6,
+      "grad_norm": 2.380035877227783,
+      "learning_rate": 0.00024,
+      "loss": 1.0092,
+      "step": 940
+    },
+    {
+      "epoch": 37.96,
+      "forget_cf_outputs.loss": -0.41733187437057495,
+      "forget_loss": 0.41733187437057495,
+      "gated_loss": 0.0062255859375,
+      "retain_loss": 0.20396247506141663,
+      "step": 949,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 38.0,
+      "grad_norm": 2.3801748752593994,
+      "learning_rate": 0.0002,
+      "loss": 0.9681,
+      "step": 950
+    },
+    {
+      "epoch": 38.36,
+      "forget_cf_outputs.loss": -0.41866594552993774,
+      "forget_loss": 0.41866594552993774,
+      "gated_loss": 0.005950927734375,
+      "retain_loss": 0.20748548209667206,
+      "step": 959,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 38.4,
+      "grad_norm": 1.8068156242370605,
+      "learning_rate": 0.00016,
+      "loss": 0.9408,
+      "step": 960
+    },
+    {
+      "epoch": 38.76,
+      "forget_cf_outputs.loss": -0.503368079662323,
+      "forget_loss": 0.503368079662323,
+      "gated_loss": 0.00555419921875,
+      "retain_loss": 0.2300793081521988,
+      "step": 969,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 38.8,
+      "grad_norm": 1.8037129640579224,
+      "learning_rate": 0.00012,
+      "loss": 0.9892,
+      "step": 970
+    },
+    {
+      "epoch": 39.16,
+      "forget_cf_outputs.loss": -0.5070582628250122,
+      "forget_loss": 0.5070582628250122,
+      "gated_loss": 0.006805419921875,
+      "retain_loss": 0.3026222288608551,
+      "step": 979,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 39.2,
+      "grad_norm": 2.055860757827759,
+      "learning_rate": 8e-05,
+      "loss": 1.0436,
+      "step": 980
+    },
+    {
+      "epoch": 39.56,
+      "forget_cf_outputs.loss": -0.46062666177749634,
+      "forget_loss": 0.46062666177749634,
+      "gated_loss": 0.005828857421875,
+      "retain_loss": 0.30048835277557373,
+      "step": 989,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 39.6,
+      "grad_norm": 2.1290202140808105,
+      "learning_rate": 4e-05,
+      "loss": 0.9838,
+      "step": 990
+    },
+    {
+      "epoch": 39.96,
+      "forget_cf_outputs.loss": -0.39183497428894043,
+      "forget_loss": 0.39183497428894043,
+      "gated_loss": 0.0062255859375,
+      "retain_loss": 0.20340043306350708,
+      "step": 999,
+      "warm_up_unlearning_weight": 1
+    },
+    {
+      "epoch": 40.0,
+      "grad_norm": 2.2874202728271484,
+      "learning_rate": 0.0,
+      "loss": 0.9411,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 40,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}