AwesomeInterpretability committed on Aug 31, 2025

Commit

f72c6b2

verified ·

1 Parent(s): ebdaa38

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_0/ae.pt +3 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_0/config.json +29 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_0/eval_results.json +1 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_1/ae.pt +3 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_1/config.json +29 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_1/eval_results.json +1 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_10/ae.pt +3 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_10/config.json +31 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_10/eval_results.json +1 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_11/ae.pt +3 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_11/config.json +31 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_11/eval_results.json +1 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_12/ae.pt +3 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_12/config.json +32 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_12/eval_results.json +1 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_13/ae.pt +3 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_13/config.json +32 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_13/eval_results.json +1 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_14/ae.pt +3 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_14/config.json +32 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_14/eval_results.json +1 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_15/ae.pt +3 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_15/config.json +32 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_15/eval_results.json +1 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_16/ae.pt +3 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_16/config.json +32 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_16/eval_results.json +1 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_17/ae.pt +3 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_17/config.json +32 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_17/eval_results.json +1 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_2/ae.pt +3 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_2/config.json +29 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_2/eval_results.json +1 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_3/ae.pt +3 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_3/config.json +29 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_3/eval_results.json +1 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_4/ae.pt +3 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_4/config.json +29 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_4/eval_results.json +1 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_5/ae.pt +3 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_5/config.json +29 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_5/eval_results.json +1 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_6/ae.pt +3 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_6/config.json +31 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_6/eval_results.json +1 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_7/ae.pt +3 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_7/config.json +31 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_7/eval_results.json +1 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_8/ae.pt +3 -0
saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_8/config.json +31 -0

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_0/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5a1d8f2dec81cf9df336c0067b2c8ac2b1a781aaa7dc7b3c2c252a6c58aabb39
+size 268510888

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_0/config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "trainer": {
+        "dict_class": "AutoEncoder",
+        "trainer_class": "StandardTrainerAprilUpdate",
+        "activation_dim": 2048,
+        "dict_size": 16384,
+        "lr": 0.0003,
+        "l1_penalty": 0.012,
+        "warmup_steps": 1000,
+        "sparsity_warmup_steps": 5000,
+        "steps": 244140,
+        "decay_start": 195312,
+        "seed": 3407,
+        "device": "cuda:0",
+        "layer": 17,
+        "lm_name": "Qwen/Qwen2.5-3B",
+        "wandb_name": "StandardTrainerNew-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_0",
+        "submodule_name": "resid_post_layer_17"
+    },
+    "buffer": {
+        "d_submodule": 2048,
+        "io": "out",
+        "n_ctxs": 122,
+        "ctx_len": 2048,
+        "refresh_batch_size": 4,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_0/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"l2_loss": 30.879375, "l1_loss": 661.24, "l0": 1127.2820458984374, "frac_variance_explained": -0.4024609375, "cossim": 0.94140625, "l2_ratio": 0.92974609375, "relative_reconstruction_bias": 2.2902734375, "frac_alive": 0.856201171875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_1/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:11f5775c39495e593118cf42fe9aceaf000b7fa718914aed1924ecfe7ceb82b2
+size 268510888

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_1/config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "trainer": {
+        "dict_class": "AutoEncoder",
+        "trainer_class": "StandardTrainerAprilUpdate",
+        "activation_dim": 2048,
+        "dict_size": 16384,
+        "lr": 0.0003,
+        "l1_penalty": 0.015,
+        "warmup_steps": 1000,
+        "sparsity_warmup_steps": 5000,
+        "steps": 244140,
+        "decay_start": 195312,
+        "seed": 3407,
+        "device": "cuda:0",
+        "layer": 17,
+        "lm_name": "Qwen/Qwen2.5-3B",
+        "wandb_name": "StandardTrainerNew-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_1",
+        "submodule_name": "resid_post_layer_17"
+    },
+    "buffer": {
+        "d_submodule": 2048,
+        "io": "out",
+        "n_ctxs": 122,
+        "ctx_len": 2048,
+        "refresh_batch_size": 4,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_1/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"l2_loss": 35.52625, "l1_loss": 595.71, "l0": 808.0417797851562, "frac_variance_explained": -1.18908203125, "cossim": 0.92630859375, "l2_ratio": 0.91580078125, "relative_reconstruction_bias": 3.07451171875, "frac_alive": 0.84942626953125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_10/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f9eba2b75a1a740969391657b3a5833639843ecd4afc78fb94a893c75193e364
+size 268511254

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_10/config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+    "trainer": {
+        "trainer_class": "TopKTrainer",
+        "dict_class": "AutoEncoderTopK",
+        "lr": 0.0003,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "seed": 3407,
+        "activation_dim": 2048,
+        "dict_size": 16384,
+        "k": 520,
+        "device": "cuda:0",
+        "layer": 17,
+        "lm_name": "Qwen/Qwen2.5-3B",
+        "wandb_name": "TopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_10",
+        "submodule_name": "resid_post_layer_17"
+    },
+    "buffer": {
+        "d_submodule": 2048,
+        "io": "out",
+        "n_ctxs": 122,
+        "ctx_len": 2048,
+        "refresh_batch_size": 4,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_10/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"l2_loss": 20.95625, "l1_loss": 1669.56, "l0": 520.0, "frac_variance_explained": 0.94826171875, "cossim": 0.95701171875, "l2_ratio": 0.95572265625, "relative_reconstruction_bias": 1.0398046875, "frac_alive": 0.76019287109375, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_11/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8e989ba017d39a5ec42c1f59c08cbb4d0be38963a1f4b428fc3cd69ddb614b07
+size 268511254

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_11/config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+    "trainer": {
+        "trainer_class": "TopKTrainer",
+        "dict_class": "AutoEncoderTopK",
+        "lr": 0.0003,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "seed": 3407,
+        "activation_dim": 2048,
+        "dict_size": 16384,
+        "k": 820,
+        "device": "cuda:0",
+        "layer": 17,
+        "lm_name": "Qwen/Qwen2.5-3B",
+        "wandb_name": "TopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_11",
+        "submodule_name": "resid_post_layer_17"
+    },
+    "buffer": {
+        "d_submodule": 2048,
+        "io": "out",
+        "n_ctxs": 122,
+        "ctx_len": 2048,
+        "refresh_batch_size": 4,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_11/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"l2_loss": 15.7865625, "l1_loss": 2385.28, "l0": 820.0, "frac_variance_explained": 0.96828125, "cossim": 0.9765625, "l2_ratio": 0.97267578125, "relative_reconstruction_bias": 1.0452734375, "frac_alive": 0.33575439453125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_12/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:694b546c3509ddf883451a3505dc43eed933d17fd2014b184814b191ae590f2f
+size 268511254

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_12/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0003,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 1024,
+        "seed": 3407,
+        "activation_dim": 2048,
+        "dict_size": 16384,
+        "k": 50,
+        "device": "cuda:0",
+        "layer": 17,
+        "lm_name": "Qwen/Qwen2.5-3B",
+        "wandb_name": "BatchTopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_12",
+        "submodule_name": "resid_post_layer_17"
+    },
+    "buffer": {
+        "d_submodule": 2048,
+        "io": "out",
+        "n_ctxs": 122,
+        "ctx_len": 2048,
+        "refresh_batch_size": 4,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_12/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"l2_loss": 46.85375, "l1_loss": 527.98, "l0": 88.1703369140625, "frac_variance_explained": -3.31318359375, "cossim": 0.87599609375, "l2_ratio": 0.89609375, "relative_reconstruction_bias": 4.184140625, "frac_alive": 0.9842529296875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_13/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:066d0e4ee037b48e5a48ed8990e55ce6ae9baf0bda5d425c1c061395e560e524
+size 268511254

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_13/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0003,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 1024,
+        "seed": 3407,
+        "activation_dim": 2048,
+        "dict_size": 16384,
+        "k": 80,
+        "device": "cuda:0",
+        "layer": 17,
+        "lm_name": "Qwen/Qwen2.5-3B",
+        "wandb_name": "BatchTopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_13",
+        "submodule_name": "resid_post_layer_17"
+    },
+    "buffer": {
+        "d_submodule": 2048,
+        "io": "out",
+        "n_ctxs": 122,
+        "ctx_len": 2048,
+        "refresh_batch_size": 4,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_13/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"l2_loss": 40.933125, "l1_loss": 601.47, "l0": 139.47208984375, "frac_variance_explained": -1.42328125, "cossim": 0.89396484375, "l2_ratio": 0.9131640625, "relative_reconstruction_bias": 2.918828125, "frac_alive": 0.9884033203125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_14/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ae70d85a1b4695c2b085c534e7dc258ae29a8fb08e5d18d34aa7707d80b1c6f1
+size 268511254

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_14/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0003,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 1024,
+        "seed": 3407,
+        "activation_dim": 2048,
+        "dict_size": 16384,
+        "k": 160,
+        "device": "cuda:0",
+        "layer": 17,
+        "lm_name": "Qwen/Qwen2.5-3B",
+        "wandb_name": "BatchTopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_14",
+        "submodule_name": "resid_post_layer_17"
+    },
+    "buffer": {
+        "d_submodule": 2048,
+        "io": "out",
+        "n_ctxs": 122,
+        "ctx_len": 2048,
+        "refresh_batch_size": 4,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_14/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"l2_loss": 34.115, "l1_loss": 754.1, "l0": 256.5938452148437, "frac_variance_explained": -0.41078125, "cossim": 0.92021484375, "l2_ratio": 0.93349609375, "relative_reconstruction_bias": 2.233203125, "frac_alive": 0.9779052734375, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_15/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7c539372f8ea9a792322995795b1ce3773c4bae4fbaa65f142c5be8f083f409d
+size 268511254

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_15/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0003,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 1024,
+        "seed": 3407,
+        "activation_dim": 2048,
+        "dict_size": 16384,
+        "k": 320,
+        "device": "cuda:0",
+        "layer": 17,
+        "lm_name": "Qwen/Qwen2.5-3B",
+        "wandb_name": "BatchTopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_15",
+        "submodule_name": "resid_post_layer_17"
+    },
+    "buffer": {
+        "d_submodule": 2048,
+        "io": "out",
+        "n_ctxs": 122,
+        "ctx_len": 2048,
+        "refresh_batch_size": 4,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_15/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"l2_loss": 26.280625, "l1_loss": 1028.06, "l0": 452.7045849609375, "frac_variance_explained": 0.453125, "cossim": 0.94921875, "l2_ratio": 0.9529296875, "relative_reconstruction_bias": 1.596015625, "frac_alive": 0.79998779296875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_16/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ee2a19f8efa063e40c3653e33345d62451b3ad047f3e4585bf07ba0db491ee68
+size 268511254

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_16/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0003,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 1024,
+        "seed": 3407,
+        "activation_dim": 2048,
+        "dict_size": 16384,
+        "k": 520,
+        "device": "cuda:0",
+        "layer": 17,
+        "lm_name": "Qwen/Qwen2.5-3B",
+        "wandb_name": "BatchTopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_16",
+        "submodule_name": "resid_post_layer_17"
+    },
+    "buffer": {
+        "d_submodule": 2048,
+        "io": "out",
+        "n_ctxs": 122,
+        "ctx_len": 2048,
+        "refresh_batch_size": 4,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_16/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"l2_loss": 21.13875, "l1_loss": 1909.0, "l0": 685.9320922851563, "frac_variance_explained": 0.791875, "cossim": 0.961171875, "l2_ratio": 0.9586328125, "relative_reconstruction_bias": 1.278515625, "frac_alive": 0.527099609375, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_17/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cd0757a3307159a6ab182d22337f21511e5888b585a7f4df61ca912a4f14af32
+size 268511254

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_17/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+    "trainer": {
+        "trainer_class": "BatchTopKTrainer",
+        "dict_class": "BatchTopKSAE",
+        "lr": 0.0003,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "top_k_aux": 1024,
+        "seed": 3407,
+        "activation_dim": 2048,
+        "dict_size": 16384,
+        "k": 820,
+        "device": "cuda:0",
+        "layer": 17,
+        "lm_name": "Qwen/Qwen2.5-3B",
+        "wandb_name": "BatchTopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_17",
+        "submodule_name": "resid_post_layer_17"
+    },
+    "buffer": {
+        "d_submodule": 2048,
+        "io": "out",
+        "n_ctxs": 122,
+        "ctx_len": 2048,
+        "refresh_batch_size": 4,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_17/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"l2_loss": 15.2528125, "l1_loss": 2843.04, "l0": 1031.6360986328125, "frac_variance_explained": 0.9468359375, "cossim": 0.9765625, "l2_ratio": 0.968359375, "relative_reconstruction_bias": 1.071953125, "frac_alive": 0.301025390625, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_2/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9da5396e68d059eea837520a335f7ad87438c4012a2c12281f5f062f589e6cce
+size 268510888

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_2/config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "trainer": {
+        "dict_class": "AutoEncoder",
+        "trainer_class": "StandardTrainerAprilUpdate",
+        "activation_dim": 2048,
+        "dict_size": 16384,
+        "lr": 0.0003,
+        "l1_penalty": 0.02,
+        "warmup_steps": 1000,
+        "sparsity_warmup_steps": 5000,
+        "steps": 244140,
+        "decay_start": 195312,
+        "seed": 3407,
+        "device": "cuda:0",
+        "layer": 17,
+        "lm_name": "Qwen/Qwen2.5-3B",
+        "wandb_name": "StandardTrainerNew-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_2",
+        "submodule_name": "resid_post_layer_17"
+    },
+    "buffer": {
+        "d_submodule": 2048,
+        "io": "out",
+        "n_ctxs": 122,
+        "ctx_len": 2048,
+        "refresh_batch_size": 4,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_2/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"l2_loss": 41.816875, "l1_loss": 535.65, "l0": 496.90281494140623, "frac_variance_explained": -2.56572265625, "cossim": 0.90685546875, "l2_ratio": 0.89265625, "relative_reconstruction_bias": 4.47685546875, "frac_alive": 0.84442138671875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_3/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c436edbd3b036ae240edece29e375e791805c7ef1ab7c7493006d8b4df11031c
+size 268510888

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_3/config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "trainer": {
+        "dict_class": "AutoEncoder",
+        "trainer_class": "StandardTrainerAprilUpdate",
+        "activation_dim": 2048,
+        "dict_size": 16384,
+        "lr": 0.0003,
+        "l1_penalty": 0.03,
+        "warmup_steps": 1000,
+        "sparsity_warmup_steps": 5000,
+        "steps": 244140,
+        "decay_start": 195312,
+        "seed": 3407,
+        "device": "cuda:0",
+        "layer": 17,
+        "lm_name": "Qwen/Qwen2.5-3B",
+        "wandb_name": "StandardTrainerNew-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_3",
+        "submodule_name": "resid_post_layer_17"
+    },
+    "buffer": {
+        "d_submodule": 2048,
+        "io": "out",
+        "n_ctxs": 122,
+        "ctx_len": 2048,
+        "refresh_batch_size": 4,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_3/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"l2_loss": 48.4025, "l1_loss": 464.59, "l0": 241.1419287109375, "frac_variance_explained": -4.48306640625, "cossim": 0.8783203125, "l2_ratio": 0.85603515625, "relative_reconstruction_bias": 6.0671484375, "frac_alive": 0.8358154296875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_4/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5bc90a59829f18b3a66df8130f83d72273199c4ce7c1d6c60e0cf6dd55dae725
+size 268510888

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_4/config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "trainer": {
+        "dict_class": "AutoEncoder",
+        "trainer_class": "StandardTrainerAprilUpdate",
+        "activation_dim": 2048,
+        "dict_size": 16384,
+        "lr": 0.0003,
+        "l1_penalty": 0.04,
+        "warmup_steps": 1000,
+        "sparsity_warmup_steps": 5000,
+        "steps": 244140,
+        "decay_start": 195312,
+        "seed": 3407,
+        "device": "cuda:0",
+        "layer": 17,
+        "lm_name": "Qwen/Qwen2.5-3B",
+        "wandb_name": "StandardTrainerNew-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_4",
+        "submodule_name": "resid_post_layer_17"
+    },
+    "buffer": {
+        "d_submodule": 2048,
+        "io": "out",
+        "n_ctxs": 122,
+        "ctx_len": 2048,
+        "refresh_batch_size": 4,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_4/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"l2_loss": 53.45625, "l1_loss": 432.64, "l0": 149.82208251953125, "frac_variance_explained": -6.97576171875, "cossim": 0.85923828125, "l2_ratio": 0.8296484375, "relative_reconstruction_bias": 7.51521484375, "frac_alive": 0.83062744140625, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_5/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:28c5696a2ecfbf2359218fa97544779c1d61d3624e39e003962ad4474d62371f
+size 268510888

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_5/config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "trainer": {
+        "dict_class": "AutoEncoder",
+        "trainer_class": "StandardTrainerAprilUpdate",
+        "activation_dim": 2048,
+        "dict_size": 16384,
+        "lr": 0.0003,
+        "l1_penalty": 0.06,
+        "warmup_steps": 1000,
+        "sparsity_warmup_steps": 5000,
+        "steps": 244140,
+        "decay_start": 195312,
+        "seed": 3407,
+        "device": "cuda:0",
+        "layer": 17,
+        "lm_name": "Qwen/Qwen2.5-3B",
+        "wandb_name": "StandardTrainerNew-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_5",
+        "submodule_name": "resid_post_layer_17"
+    },
+    "buffer": {
+        "d_submodule": 2048,
+        "io": "out",
+        "n_ctxs": 122,
+        "ctx_len": 2048,
+        "refresh_batch_size": 4,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_5/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"l2_loss": 60.575, "l1_loss": 426.7925, "l0": 80.94847900390624, "frac_variance_explained": -10.817421875, "cossim": 0.832265625, "l2_ratio": 0.79265625, "relative_reconstruction_bias": 9.3669921875, "frac_alive": 0.823486328125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_6/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:66be800a8f2d8320361cff355cc1d088d8fed53b850abce16af2f04e3c2be36e
+size 268511254

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_6/config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+    "trainer": {
+        "trainer_class": "TopKTrainer",
+        "dict_class": "AutoEncoderTopK",
+        "lr": 0.0003,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "seed": 3407,
+        "activation_dim": 2048,
+        "dict_size": 16384,
+        "k": 50,
+        "device": "cuda:0",
+        "layer": 17,
+        "lm_name": "Qwen/Qwen2.5-3B",
+        "wandb_name": "TopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_6",
+        "submodule_name": "resid_post_layer_17"
+    },
+    "buffer": {
+        "d_submodule": 2048,
+        "io": "out",
+        "n_ctxs": 122,
+        "ctx_len": 2048,
+        "refresh_batch_size": 4,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_6/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"l2_loss": 36.115, "l1_loss": 225.495, "l0": 50.0, "frac_variance_explained": 0.8665625, "cossim": 0.8618359375, "l2_ratio": 0.8702734375, "relative_reconstruction_bias": 1.028359375, "frac_alive": 0.97332763671875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_7/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a2bd0f50e52e30e6408603f136145e7afee6b43b070709be09fbea3469a97169
+size 268511254

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_7/config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+    "trainer": {
+        "trainer_class": "TopKTrainer",
+        "dict_class": "AutoEncoderTopK",
+        "lr": 0.0003,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "seed": 3407,
+        "activation_dim": 2048,
+        "dict_size": 16384,
+        "k": 80,
+        "device": "cuda:0",
+        "layer": 17,
+        "lm_name": "Qwen/Qwen2.5-3B",
+        "wandb_name": "TopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_7",
+        "submodule_name": "resid_post_layer_17"
+    },
+    "buffer": {
+        "d_submodule": 2048,
+        "io": "out",
+        "n_ctxs": 122,
+        "ctx_len": 2048,
+        "refresh_batch_size": 4,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_7/eval_results.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"l2_loss": 33.834375, "l1_loss": 293.98, "l0": 80.0, "frac_variance_explained": 0.88892578125, "cossim": 0.8798828125, "l2_ratio": 0.88625, "relative_reconstruction_bias": 1.0291015625, "frac_alive": 0.990234375, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_8/ae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:233cbd2e4493010b3bc05b126cf4ae32bc159cb29ae12590528dddfbfdd0a9e1
+size 268511254

saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_8/config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+    "trainer": {
+        "trainer_class": "TopKTrainer",
+        "dict_class": "AutoEncoderTopK",
+        "lr": 0.0003,
+        "steps": 244140,
+        "auxk_alpha": 0.03125,
+        "warmup_steps": 1000,
+        "decay_start": 195312,
+        "threshold_beta": 0.999,
+        "threshold_start_step": 1000,
+        "seed": 3407,
+        "activation_dim": 2048,
+        "dict_size": 16384,
+        "k": 160,
+        "device": "cuda:0",
+        "layer": 17,
+        "lm_name": "Qwen/Qwen2.5-3B",
+        "wandb_name": "TopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_8",
+        "submodule_name": "resid_post_layer_17"
+    },
+    "buffer": {
+        "d_submodule": 2048,
+        "io": "out",
+        "n_ctxs": 122,
+        "ctx_len": 2048,
+        "refresh_batch_size": 4,
+        "out_batch_size": 2048,
+        "device": "cuda:0"
+    }
+}