Upload folder using huggingface_hub
Browse files- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_0/ae.pt +3 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_0/config.json +28 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_0/eval_results.json +1 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_1/ae.pt +3 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_1/config.json +28 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_1/eval_results.json +1 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_10/ae.pt +3 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_10/config.json +29 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_10/eval_results.json +1 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_11/ae.pt +3 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_11/config.json +29 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_11/eval_results.json +1 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_2/ae.pt +3 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_2/config.json +28 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_2/eval_results.json +1 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_3/ae.pt +3 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_3/config.json +28 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_3/eval_results.json +1 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_4/ae.pt +3 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_4/config.json +28 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_4/eval_results.json +1 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_5/ae.pt +3 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_5/config.json +28 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_5/eval_results.json +1 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_6/ae.pt +3 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_6/config.json +29 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_6/eval_results.json +1 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_7/ae.pt +3 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_7/config.json +29 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_7/eval_results.json +1 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_8/ae.pt +3 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_8/config.json +29 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_8/eval_results.json +1 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_9/ae.pt +3 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_9/config.json +29 -0
- saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_9/eval_results.json +1 -0
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_0/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a6d84e4f04fdf8c73b480d1847bd3f3cafc2cca09ebad9737ca766960837a005
|
| 3 |
+
size 469975062
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_0/config.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"dict_class": "GatedAutoEncoder",
|
| 4 |
+
"trainer_class": "GatedSAETrainer",
|
| 5 |
+
"activation_dim": 3584,
|
| 6 |
+
"dict_size": 16384,
|
| 7 |
+
"lr": 0.0003,
|
| 8 |
+
"l1_penalty": 0.012,
|
| 9 |
+
"warmup_steps": 1000,
|
| 10 |
+
"sparsity_warmup_steps": 5000,
|
| 11 |
+
"decay_start": 195312,
|
| 12 |
+
"seed": 3407,
|
| 13 |
+
"device": "cuda:0",
|
| 14 |
+
"layer": 31,
|
| 15 |
+
"lm_name": "google/gemma-2-9b",
|
| 16 |
+
"wandb_name": "GatedTrainer-google/gemma-2-9b-resid_post_layer_31_trainer_0",
|
| 17 |
+
"submodule_name": "resid_post_layer_31"
|
| 18 |
+
},
|
| 19 |
+
"buffer": {
|
| 20 |
+
"d_submodule": 3584,
|
| 21 |
+
"io": "out",
|
| 22 |
+
"n_ctxs": 122,
|
| 23 |
+
"ctx_len": 2048,
|
| 24 |
+
"refresh_batch_size": 4,
|
| 25 |
+
"out_batch_size": 2048,
|
| 26 |
+
"device": "cuda:0"
|
| 27 |
+
}
|
| 28 |
+
}
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_0/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 129.475, "l1_loss": 4698.88, "l0": 851.4930541992187, "frac_variance_explained": 0.83517578125, "cossim": 0.943671875, "l2_ratio": 0.95328125, "relative_reconstruction_bias": 1.010234375, "frac_alive": 0.99969482421875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_1/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:37265778b96e4483b71ed2d37adf96bc758ef66a270fb5a56c4c61c97d1ecee1
|
| 3 |
+
size 469975062
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_1/config.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"dict_class": "GatedAutoEncoder",
|
| 4 |
+
"trainer_class": "GatedSAETrainer",
|
| 5 |
+
"activation_dim": 3584,
|
| 6 |
+
"dict_size": 16384,
|
| 7 |
+
"lr": 0.0003,
|
| 8 |
+
"l1_penalty": 0.018,
|
| 9 |
+
"warmup_steps": 1000,
|
| 10 |
+
"sparsity_warmup_steps": 5000,
|
| 11 |
+
"decay_start": 195312,
|
| 12 |
+
"seed": 3407,
|
| 13 |
+
"device": "cuda:0",
|
| 14 |
+
"layer": 31,
|
| 15 |
+
"lm_name": "google/gemma-2-9b",
|
| 16 |
+
"wandb_name": "GatedTrainer-google/gemma-2-9b-resid_post_layer_31_trainer_1",
|
| 17 |
+
"submodule_name": "resid_post_layer_31"
|
| 18 |
+
},
|
| 19 |
+
"buffer": {
|
| 20 |
+
"d_submodule": 3584,
|
| 21 |
+
"io": "out",
|
| 22 |
+
"n_ctxs": 122,
|
| 23 |
+
"ctx_len": 2048,
|
| 24 |
+
"refresh_batch_size": 4,
|
| 25 |
+
"out_batch_size": 2048,
|
| 26 |
+
"device": "cuda:0"
|
| 27 |
+
}
|
| 28 |
+
}
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_1/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 147.325, "l1_loss": 3421.44, "l0": 485.14355224609375, "frac_variance_explained": 0.7877734375, "cossim": 0.92603515625, "l2_ratio": 0.93662109375, "relative_reconstruction_bias": 1.012109375, "frac_alive": 0.99957275390625, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_10/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c8fb90b30ea011358f2cbda7f6cfaf7972d4d5b0ced637d6bf6b6bd2441cff43
|
| 3 |
+
size 469909279
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_10/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "JumpReluTrainer",
|
| 4 |
+
"dict_class": "JumpReluAutoEncoder",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"seed": 3407,
|
| 8 |
+
"activation_dim": 3584,
|
| 9 |
+
"dict_size": 16384,
|
| 10 |
+
"device": "cuda:0",
|
| 11 |
+
"layer": 31,
|
| 12 |
+
"lm_name": "google/gemma-2-9b",
|
| 13 |
+
"wandb_name": "JumpReluTrainer-google/gemma-2-9b-resid_post_layer_31_trainer_10",
|
| 14 |
+
"submodule_name": "resid_post_layer_31",
|
| 15 |
+
"bandwidth": 0.001,
|
| 16 |
+
"sparsity_penalty": 1.0,
|
| 17 |
+
"sparsity_warmup_steps": 5000,
|
| 18 |
+
"target_l0": 520
|
| 19 |
+
},
|
| 20 |
+
"buffer": {
|
| 21 |
+
"d_submodule": 3584,
|
| 22 |
+
"io": "out",
|
| 23 |
+
"n_ctxs": 122,
|
| 24 |
+
"ctx_len": 2048,
|
| 25 |
+
"refresh_batch_size": 4,
|
| 26 |
+
"out_batch_size": 2048,
|
| 27 |
+
"device": "cuda:0"
|
| 28 |
+
}
|
| 29 |
+
}
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_10/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 166.575, "l1_loss": 4647.2, "l0": 798.1393188476562, "frac_variance_explained": 0.7728125, "cossim": 0.9116796875, "l2_ratio": 1.0146875, "relative_reconstruction_bias": 1.075546875, "frac_alive": 0.9434814453125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_11/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2f2a589d118244f34c0ffa5bad3d92ddaf3d270ee839d87d93981dcec6a6d5a7
|
| 3 |
+
size 469909279
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_11/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "JumpReluTrainer",
|
| 4 |
+
"dict_class": "JumpReluAutoEncoder",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"seed": 3407,
|
| 8 |
+
"activation_dim": 3584,
|
| 9 |
+
"dict_size": 16384,
|
| 10 |
+
"device": "cuda:0",
|
| 11 |
+
"layer": 31,
|
| 12 |
+
"lm_name": "google/gemma-2-9b",
|
| 13 |
+
"wandb_name": "JumpReluTrainer-google/gemma-2-9b-resid_post_layer_31_trainer_11",
|
| 14 |
+
"submodule_name": "resid_post_layer_31",
|
| 15 |
+
"bandwidth": 0.001,
|
| 16 |
+
"sparsity_penalty": 1.0,
|
| 17 |
+
"sparsity_warmup_steps": 5000,
|
| 18 |
+
"target_l0": 820
|
| 19 |
+
},
|
| 20 |
+
"buffer": {
|
| 21 |
+
"d_submodule": 3584,
|
| 22 |
+
"io": "out",
|
| 23 |
+
"n_ctxs": 122,
|
| 24 |
+
"ctx_len": 2048,
|
| 25 |
+
"refresh_batch_size": 4,
|
| 26 |
+
"out_batch_size": 2048,
|
| 27 |
+
"device": "cuda:0"
|
| 28 |
+
}
|
| 29 |
+
}
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_11/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 167.445, "l1_loss": 6110.56, "l0": 1219.2402807617189, "frac_variance_explained": 0.79876953125, "cossim": 0.91375, "l2_ratio": 1.0494921875, "relative_reconstruction_bias": 1.1087109375, "frac_alive": 0.916015625, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_2/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6043d6b801086152c035a18d1414271ef9acb2760c37dff597c580f3dd5bf17f
|
| 3 |
+
size 469975062
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_2/config.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"dict_class": "GatedAutoEncoder",
|
| 4 |
+
"trainer_class": "GatedSAETrainer",
|
| 5 |
+
"activation_dim": 3584,
|
| 6 |
+
"dict_size": 16384,
|
| 7 |
+
"lr": 0.0003,
|
| 8 |
+
"l1_penalty": 0.024,
|
| 9 |
+
"warmup_steps": 1000,
|
| 10 |
+
"sparsity_warmup_steps": 5000,
|
| 11 |
+
"decay_start": 195312,
|
| 12 |
+
"seed": 3407,
|
| 13 |
+
"device": "cuda:0",
|
| 14 |
+
"layer": 31,
|
| 15 |
+
"lm_name": "google/gemma-2-9b",
|
| 16 |
+
"wandb_name": "GatedTrainer-google/gemma-2-9b-resid_post_layer_31_trainer_2",
|
| 17 |
+
"submodule_name": "resid_post_layer_31"
|
| 18 |
+
},
|
| 19 |
+
"buffer": {
|
| 20 |
+
"d_submodule": 3584,
|
| 21 |
+
"io": "out",
|
| 22 |
+
"n_ctxs": 122,
|
| 23 |
+
"ctx_len": 2048,
|
| 24 |
+
"refresh_batch_size": 4,
|
| 25 |
+
"out_batch_size": 2048,
|
| 26 |
+
"device": "cuda:0"
|
| 27 |
+
}
|
| 28 |
+
}
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_2/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 158.52, "l1_loss": 2711.28, "l0": 317.59068603515624, "frac_variance_explained": 0.7526171875, "cossim": 0.91349609375, "l2_ratio": 0.9250390625, "relative_reconstruction_bias": 1.0134765625, "frac_alive": 0.9993896484375, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_3/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:59427277211619ceb46505253b1d0072a4de7902ca54647750ce92397e12b9c1
|
| 3 |
+
size 469975062
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_3/config.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"dict_class": "GatedAutoEncoder",
|
| 4 |
+
"trainer_class": "GatedSAETrainer",
|
| 5 |
+
"activation_dim": 3584,
|
| 6 |
+
"dict_size": 16384,
|
| 7 |
+
"lr": 0.0003,
|
| 8 |
+
"l1_penalty": 0.04,
|
| 9 |
+
"warmup_steps": 1000,
|
| 10 |
+
"sparsity_warmup_steps": 5000,
|
| 11 |
+
"decay_start": 195312,
|
| 12 |
+
"seed": 3407,
|
| 13 |
+
"device": "cuda:0",
|
| 14 |
+
"layer": 31,
|
| 15 |
+
"lm_name": "google/gemma-2-9b",
|
| 16 |
+
"wandb_name": "GatedTrainer-google/gemma-2-9b-resid_post_layer_31_trainer_3",
|
| 17 |
+
"submodule_name": "resid_post_layer_31"
|
| 18 |
+
},
|
| 19 |
+
"buffer": {
|
| 20 |
+
"d_submodule": 3584,
|
| 21 |
+
"io": "out",
|
| 22 |
+
"n_ctxs": 122,
|
| 23 |
+
"ctx_len": 2048,
|
| 24 |
+
"refresh_batch_size": 4,
|
| 25 |
+
"out_batch_size": 2048,
|
| 26 |
+
"device": "cuda:0"
|
| 27 |
+
}
|
| 28 |
+
}
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_3/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 176.195, "l1_loss": 1839.64, "l0": 153.8281005859375, "frac_variance_explained": 0.69720703125, "cossim": 0.89248046875, "l2_ratio": 0.90625, "relative_reconstruction_bias": 1.0146875, "frac_alive": 0.96002197265625, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_4/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7f031c7d25bb319470a73720e45533e98cb9bde2f92bd1df665808dae17b13d2
|
| 3 |
+
size 469975062
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_4/config.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"dict_class": "GatedAutoEncoder",
|
| 4 |
+
"trainer_class": "GatedSAETrainer",
|
| 5 |
+
"activation_dim": 3584,
|
| 6 |
+
"dict_size": 16384,
|
| 7 |
+
"lr": 0.0003,
|
| 8 |
+
"l1_penalty": 0.06,
|
| 9 |
+
"warmup_steps": 1000,
|
| 10 |
+
"sparsity_warmup_steps": 5000,
|
| 11 |
+
"decay_start": 195312,
|
| 12 |
+
"seed": 3407,
|
| 13 |
+
"device": "cuda:0",
|
| 14 |
+
"layer": 31,
|
| 15 |
+
"lm_name": "google/gemma-2-9b",
|
| 16 |
+
"wandb_name": "GatedTrainer-google/gemma-2-9b-resid_post_layer_31_trainer_4",
|
| 17 |
+
"submodule_name": "resid_post_layer_31"
|
| 18 |
+
},
|
| 19 |
+
"buffer": {
|
| 20 |
+
"d_submodule": 3584,
|
| 21 |
+
"io": "out",
|
| 22 |
+
"n_ctxs": 122,
|
| 23 |
+
"ctx_len": 2048,
|
| 24 |
+
"refresh_batch_size": 4,
|
| 25 |
+
"out_batch_size": 2048,
|
| 26 |
+
"device": "cuda:0"
|
| 27 |
+
}
|
| 28 |
+
}
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_4/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 191.84, "l1_loss": 1343.12, "l0": 83.0945849609375, "frac_variance_explained": 0.6426171875, "cossim": 0.87208984375, "l2_ratio": 0.88474609375, "relative_reconstruction_bias": 1.0132421875, "frac_alive": 0.7711181640625, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_5/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:37c6c8644f3770e426bbe7bad66b7c7f6e778bbcf566af58235ff8f5009195cc
|
| 3 |
+
size 469975062
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_5/config.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"dict_class": "GatedAutoEncoder",
|
| 4 |
+
"trainer_class": "GatedSAETrainer",
|
| 5 |
+
"activation_dim": 3584,
|
| 6 |
+
"dict_size": 16384,
|
| 7 |
+
"lr": 0.0003,
|
| 8 |
+
"l1_penalty": 0.08,
|
| 9 |
+
"warmup_steps": 1000,
|
| 10 |
+
"sparsity_warmup_steps": 5000,
|
| 11 |
+
"decay_start": 195312,
|
| 12 |
+
"seed": 3407,
|
| 13 |
+
"device": "cuda:0",
|
| 14 |
+
"layer": 31,
|
| 15 |
+
"lm_name": "google/gemma-2-9b",
|
| 16 |
+
"wandb_name": "GatedTrainer-google/gemma-2-9b-resid_post_layer_31_trainer_5",
|
| 17 |
+
"submodule_name": "resid_post_layer_31"
|
| 18 |
+
},
|
| 19 |
+
"buffer": {
|
| 20 |
+
"d_submodule": 3584,
|
| 21 |
+
"io": "out",
|
| 22 |
+
"n_ctxs": 122,
|
| 23 |
+
"ctx_len": 2048,
|
| 24 |
+
"refresh_batch_size": 4,
|
| 25 |
+
"out_batch_size": 2048,
|
| 26 |
+
"device": "cuda:0"
|
| 27 |
+
}
|
| 28 |
+
}
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_5/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 204.52, "l1_loss": 1045.64, "l0": 51.75321044921875, "frac_variance_explained": 0.59876953125, "cossim": 0.85453125, "l2_ratio": 0.86896484375, "relative_reconstruction_bias": 1.0145703125, "frac_alive": 0.599609375, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_6/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4a586808fd95eea30f0d3283914ab7e86789dcc81c4d3b1c89a954260920889a
|
| 3 |
+
size 469909279
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_6/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "JumpReluTrainer",
|
| 4 |
+
"dict_class": "JumpReluAutoEncoder",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"seed": 3407,
|
| 8 |
+
"activation_dim": 3584,
|
| 9 |
+
"dict_size": 16384,
|
| 10 |
+
"device": "cuda:0",
|
| 11 |
+
"layer": 31,
|
| 12 |
+
"lm_name": "google/gemma-2-9b",
|
| 13 |
+
"wandb_name": "JumpReluTrainer-google/gemma-2-9b-resid_post_layer_31_trainer_6",
|
| 14 |
+
"submodule_name": "resid_post_layer_31",
|
| 15 |
+
"bandwidth": 0.001,
|
| 16 |
+
"sparsity_penalty": 1.0,
|
| 17 |
+
"sparsity_warmup_steps": 5000,
|
| 18 |
+
"target_l0": 50
|
| 19 |
+
},
|
| 20 |
+
"buffer": {
|
| 21 |
+
"d_submodule": 3584,
|
| 22 |
+
"io": "out",
|
| 23 |
+
"n_ctxs": 122,
|
| 24 |
+
"ctx_len": 2048,
|
| 25 |
+
"refresh_batch_size": 4,
|
| 26 |
+
"out_batch_size": 2048,
|
| 27 |
+
"device": "cuda:0"
|
| 28 |
+
}
|
| 29 |
+
}
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_6/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 194.8, "l1_loss": 1277.8, "l0": 64.866396484375, "frac_variance_explained": 0.63359375, "cossim": 0.86810546875, "l2_ratio": 0.87791015625, "relative_reconstruction_bias": 1.0101171875, "frac_alive": 0.79437255859375, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_7/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ce55638139f00e54e806c21173a473e4689bb13e45b7660015c3389e2e5ddae5
|
| 3 |
+
size 469909279
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_7/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "JumpReluTrainer",
|
| 4 |
+
"dict_class": "JumpReluAutoEncoder",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"seed": 3407,
|
| 8 |
+
"activation_dim": 3584,
|
| 9 |
+
"dict_size": 16384,
|
| 10 |
+
"device": "cuda:0",
|
| 11 |
+
"layer": 31,
|
| 12 |
+
"lm_name": "google/gemma-2-9b",
|
| 13 |
+
"wandb_name": "JumpReluTrainer-google/gemma-2-9b-resid_post_layer_31_trainer_7",
|
| 14 |
+
"submodule_name": "resid_post_layer_31",
|
| 15 |
+
"bandwidth": 0.001,
|
| 16 |
+
"sparsity_penalty": 1.0,
|
| 17 |
+
"sparsity_warmup_steps": 5000,
|
| 18 |
+
"target_l0": 80
|
| 19 |
+
},
|
| 20 |
+
"buffer": {
|
| 21 |
+
"d_submodule": 3584,
|
| 22 |
+
"io": "out",
|
| 23 |
+
"n_ctxs": 122,
|
| 24 |
+
"ctx_len": 2048,
|
| 25 |
+
"refresh_batch_size": 4,
|
| 26 |
+
"out_batch_size": 2048,
|
| 27 |
+
"device": "cuda:0"
|
| 28 |
+
}
|
| 29 |
+
}
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_7/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 182.8, "l1_loss": 1683.64, "l0": 109.25167236328124, "frac_variance_explained": 0.674453125, "cossim": 0.8840625, "l2_ratio": 0.89279296875, "relative_reconstruction_bias": 1.0100390625, "frac_alive": 0.85064697265625, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_8/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a2b533fdd20f00d2ffbda5944dc7044ba843ddcd57ac59c55b9ed2685782e48a
|
| 3 |
+
size 469909279
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_8/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "JumpReluTrainer",
|
| 4 |
+
"dict_class": "JumpReluAutoEncoder",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"seed": 3407,
|
| 8 |
+
"activation_dim": 3584,
|
| 9 |
+
"dict_size": 16384,
|
| 10 |
+
"device": "cuda:0",
|
| 11 |
+
"layer": 31,
|
| 12 |
+
"lm_name": "google/gemma-2-9b",
|
| 13 |
+
"wandb_name": "JumpReluTrainer-google/gemma-2-9b-resid_post_layer_31_trainer_8",
|
| 14 |
+
"submodule_name": "resid_post_layer_31",
|
| 15 |
+
"bandwidth": 0.001,
|
| 16 |
+
"sparsity_penalty": 1.0,
|
| 17 |
+
"sparsity_warmup_steps": 5000,
|
| 18 |
+
"target_l0": 160
|
| 19 |
+
},
|
| 20 |
+
"buffer": {
|
| 21 |
+
"d_submodule": 3584,
|
| 22 |
+
"io": "out",
|
| 23 |
+
"n_ctxs": 122,
|
| 24 |
+
"ctx_len": 2048,
|
| 25 |
+
"refresh_batch_size": 4,
|
| 26 |
+
"out_batch_size": 2048,
|
| 27 |
+
"device": "cuda:0"
|
| 28 |
+
}
|
| 29 |
+
}
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_8/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 164.39, "l1_loss": 2474.32, "l0": 236.04934814453125, "frac_variance_explained": 0.734453125, "cossim": 0.90666015625, "l2_ratio": 0.9155859375, "relative_reconstruction_bias": 1.011796875, "frac_alive": 0.93707275390625, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_9/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:980a3b3c2447255b02e8e1f30078972109d67a2a621d72b54ac94630bcae712c
|
| 3 |
+
size 469909279
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_9/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "JumpReluTrainer",
|
| 4 |
+
"dict_class": "JumpReluAutoEncoder",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"seed": 3407,
|
| 8 |
+
"activation_dim": 3584,
|
| 9 |
+
"dict_size": 16384,
|
| 10 |
+
"device": "cuda:0",
|
| 11 |
+
"layer": 31,
|
| 12 |
+
"lm_name": "google/gemma-2-9b",
|
| 13 |
+
"wandb_name": "JumpReluTrainer-google/gemma-2-9b-resid_post_layer_31_trainer_9",
|
| 14 |
+
"submodule_name": "resid_post_layer_31",
|
| 15 |
+
"bandwidth": 0.001,
|
| 16 |
+
"sparsity_penalty": 1.0,
|
| 17 |
+
"sparsity_warmup_steps": 5000,
|
| 18 |
+
"target_l0": 320
|
| 19 |
+
},
|
| 20 |
+
"buffer": {
|
| 21 |
+
"d_submodule": 3584,
|
| 22 |
+
"io": "out",
|
| 23 |
+
"n_ctxs": 122,
|
| 24 |
+
"ctx_len": 2048,
|
| 25 |
+
"refresh_batch_size": 4,
|
| 26 |
+
"out_batch_size": 2048,
|
| 27 |
+
"device": "cuda:0"
|
| 28 |
+
}
|
| 29 |
+
}
|
saes_google_gemma-2-9b_gated_jump_relu/resid_post_layer_31/trainer_9/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 143.28, "l1_loss": 3752.0, "l0": 521.1142944335937, "frac_variance_explained": 0.79845703125, "cossim": 0.929765625, "l2_ratio": 0.9350390625, "relative_reconstruction_bias": 1.0071875, "frac_alive": 0.989501953125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}}
|