diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_0/ae.pt b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_0/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb4459d827d075c2a3ece0b8027bf3f49ccc742a --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_0/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a1d8f2dec81cf9df336c0067b2c8ac2b1a781aaa7dc7b3c2c252a6c58aabb39 +size 268510888 diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_0/config.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_0/config.json new file mode 100644 index 0000000000000000000000000000000000000000..8d0b349722e808d8910873d76c8bae4af933be89 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_0/config.json @@ -0,0 +1,29 @@ +{ + "trainer": { + "dict_class": "AutoEncoder", + "trainer_class": "StandardTrainerAprilUpdate", + "activation_dim": 2048, + "dict_size": 16384, + "lr": 0.0003, + "l1_penalty": 0.012, + "warmup_steps": 1000, + "sparsity_warmup_steps": 5000, + "steps": 244140, + "decay_start": 195312, + "seed": 3407, + "device": "cuda:0", + "layer": 17, + "lm_name": "Qwen/Qwen2.5-3B", + "wandb_name": "StandardTrainerNew-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_0", + "submodule_name": "resid_post_layer_17" + }, + "buffer": { + "d_submodule": 2048, + "io": "out", + "n_ctxs": 122, + "ctx_len": 2048, + "refresh_batch_size": 4, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_0/eval_results.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_0/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..79c61676cf366ccd81bcd5ee313cf0224768dcf9 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_0/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 30.879375, "l1_loss": 661.24, "l0": 1127.2820458984374, "frac_variance_explained": -0.4024609375, "cossim": 0.94140625, "l2_ratio": 0.92974609375, "relative_reconstruction_bias": 2.2902734375, "frac_alive": 0.856201171875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_1/ae.pt b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_1/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..2a7dbc56f9b949a2660fb1cc6aef38572031f680 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_1/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11f5775c39495e593118cf42fe9aceaf000b7fa718914aed1924ecfe7ceb82b2 +size 268510888 diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_1/config.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_1/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6ab712d8ffd9789677ad59bfb3b722094f68e7da --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_1/config.json @@ -0,0 +1,29 @@ +{ + "trainer": { + "dict_class": "AutoEncoder", + "trainer_class": "StandardTrainerAprilUpdate", + "activation_dim": 2048, + "dict_size": 16384, + "lr": 0.0003, + "l1_penalty": 0.015, + "warmup_steps": 1000, + "sparsity_warmup_steps": 5000, + "steps": 244140, + "decay_start": 195312, + "seed": 3407, + "device": "cuda:0", + "layer": 17, + "lm_name": "Qwen/Qwen2.5-3B", + "wandb_name": "StandardTrainerNew-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_1", + "submodule_name": "resid_post_layer_17" + }, + "buffer": { + "d_submodule": 2048, + "io": "out", + "n_ctxs": 122, + "ctx_len": 2048, + "refresh_batch_size": 4, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_1/eval_results.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_1/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..2f047215147700069a53419f2483582aed451a1d --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_1/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 35.52625, "l1_loss": 595.71, "l0": 808.0417797851562, "frac_variance_explained": -1.18908203125, "cossim": 0.92630859375, "l2_ratio": 0.91580078125, "relative_reconstruction_bias": 3.07451171875, "frac_alive": 0.84942626953125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_10/ae.pt b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_10/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..e96c5a4e8c0196538ea3ac7612396f96d2835938 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_10/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9eba2b75a1a740969391657b3a5833639843ecd4afc78fb94a893c75193e364 +size 268511254 diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_10/config.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_10/config.json new file mode 100644 index 0000000000000000000000000000000000000000..d93c105081b9780600151468ad55649ebc52ea39 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_10/config.json @@ -0,0 +1,31 @@ +{ + "trainer": { + "trainer_class": "TopKTrainer", + "dict_class": "AutoEncoderTopK", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "seed": 3407, + "activation_dim": 2048, + "dict_size": 16384, + "k": 520, + "device": "cuda:0", + "layer": 17, + "lm_name": "Qwen/Qwen2.5-3B", + "wandb_name": "TopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_10", + "submodule_name": "resid_post_layer_17" + }, + "buffer": { + "d_submodule": 2048, + "io": "out", + "n_ctxs": 122, + "ctx_len": 2048, + "refresh_batch_size": 4, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_10/eval_results.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_10/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..78e60b0837533db3035e7ccb4a4d62a0227618df --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_10/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 20.95625, "l1_loss": 1669.56, "l0": 520.0, "frac_variance_explained": 0.94826171875, "cossim": 0.95701171875, "l2_ratio": 0.95572265625, "relative_reconstruction_bias": 1.0398046875, "frac_alive": 0.76019287109375, "hyperparameters": {"n_inputs": 200, "context_length": 2048}} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_11/ae.pt b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_11/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec10a9b80394128d86c2aae6e58dbe75e2fb35eb --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_11/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e989ba017d39a5ec42c1f59c08cbb4d0be38963a1f4b428fc3cd69ddb614b07 +size 268511254 diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_11/config.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_11/config.json new file mode 100644 index 0000000000000000000000000000000000000000..16071ed8194a6d71cc4d38e9b0f9285ae2f36231 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_11/config.json @@ -0,0 +1,31 @@ +{ + "trainer": { + "trainer_class": "TopKTrainer", + "dict_class": "AutoEncoderTopK", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "seed": 3407, + "activation_dim": 2048, + "dict_size": 16384, + "k": 820, + "device": "cuda:0", + "layer": 17, + "lm_name": "Qwen/Qwen2.5-3B", + "wandb_name": "TopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_11", + "submodule_name": "resid_post_layer_17" + }, + "buffer": { + "d_submodule": 2048, + "io": "out", + "n_ctxs": 122, + "ctx_len": 2048, + "refresh_batch_size": 4, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_11/eval_results.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_11/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..820b05f9f90f3c012d35ecf61fb7d247562750d9 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_11/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 15.7865625, "l1_loss": 2385.28, "l0": 820.0, "frac_variance_explained": 0.96828125, "cossim": 0.9765625, "l2_ratio": 0.97267578125, "relative_reconstruction_bias": 1.0452734375, "frac_alive": 0.33575439453125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_12/ae.pt b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_12/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6b859b0268aca856d24bcea9bea9fa91f699c9c --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_12/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:694b546c3509ddf883451a3505dc43eed933d17fd2014b184814b191ae590f2f +size 268511254 diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_12/config.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_12/config.json new file mode 100644 index 0000000000000000000000000000000000000000..01751480738789a3c28cda2a3bcb9f4035511332 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_12/config.json @@ -0,0 +1,32 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1024, + "seed": 3407, + "activation_dim": 2048, + "dict_size": 16384, + "k": 50, + "device": "cuda:0", + "layer": 17, + "lm_name": "Qwen/Qwen2.5-3B", + "wandb_name": "BatchTopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_12", + "submodule_name": "resid_post_layer_17" + }, + "buffer": { + "d_submodule": 2048, + "io": "out", + "n_ctxs": 122, + "ctx_len": 2048, + "refresh_batch_size": 4, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_12/eval_results.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_12/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..17abbadf4b100b06bebefdbe1043d5fec3c691e3 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_12/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 46.85375, "l1_loss": 527.98, "l0": 88.1703369140625, "frac_variance_explained": -3.31318359375, "cossim": 0.87599609375, "l2_ratio": 0.89609375, "relative_reconstruction_bias": 4.184140625, "frac_alive": 0.9842529296875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_13/ae.pt b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_13/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4be7915a6b1015cb7f9dc08a745bcaafb9bfdfe --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_13/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:066d0e4ee037b48e5a48ed8990e55ce6ae9baf0bda5d425c1c061395e560e524 +size 268511254 diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_13/config.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_13/config.json new file mode 100644 index 0000000000000000000000000000000000000000..09f338bcbe9602d9d12dddc58769f300613dd551 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_13/config.json @@ -0,0 +1,32 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1024, + "seed": 3407, + "activation_dim": 2048, + "dict_size": 16384, + "k": 80, + "device": "cuda:0", + "layer": 17, + "lm_name": "Qwen/Qwen2.5-3B", + "wandb_name": "BatchTopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_13", + "submodule_name": "resid_post_layer_17" + }, + "buffer": { + "d_submodule": 2048, + "io": "out", + "n_ctxs": 122, + "ctx_len": 2048, + "refresh_batch_size": 4, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_13/eval_results.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_13/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..c5473e5a0ecee955300f323ebd361a63fdbef591 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_13/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 40.933125, "l1_loss": 601.47, "l0": 139.47208984375, "frac_variance_explained": -1.42328125, "cossim": 0.89396484375, "l2_ratio": 0.9131640625, "relative_reconstruction_bias": 2.918828125, "frac_alive": 0.9884033203125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_14/ae.pt b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_14/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec46fdf40df0c2c96c1ec6518e2f6a247f6a1f73 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_14/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae70d85a1b4695c2b085c534e7dc258ae29a8fb08e5d18d34aa7707d80b1c6f1 +size 268511254 diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_14/config.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_14/config.json new file mode 100644 index 0000000000000000000000000000000000000000..186e32428da6e5871c48a1321a56aac670a312f7 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_14/config.json @@ -0,0 +1,32 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1024, + "seed": 3407, + "activation_dim": 2048, + "dict_size": 16384, + "k": 160, + "device": "cuda:0", + "layer": 17, + "lm_name": "Qwen/Qwen2.5-3B", + "wandb_name": "BatchTopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_14", + "submodule_name": "resid_post_layer_17" + }, + "buffer": { + "d_submodule": 2048, + "io": "out", + "n_ctxs": 122, + "ctx_len": 2048, + "refresh_batch_size": 4, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_14/eval_results.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_14/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..52ece4fd13f32e3334977c73e6d5ea1a7df05595 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_14/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 34.115, "l1_loss": 754.1, "l0": 256.5938452148437, "frac_variance_explained": -0.41078125, "cossim": 0.92021484375, "l2_ratio": 0.93349609375, "relative_reconstruction_bias": 2.233203125, "frac_alive": 0.9779052734375, "hyperparameters": {"n_inputs": 200, "context_length": 2048}} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_15/ae.pt b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_15/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..c16e0226251aa0551f8f02401eec70ca5f59b0ee --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_15/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c539372f8ea9a792322995795b1ce3773c4bae4fbaa65f142c5be8f083f409d +size 268511254 diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_15/config.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_15/config.json new file mode 100644 index 0000000000000000000000000000000000000000..2c566dcadbe975453a1ad58c7ed676981ba600b5 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_15/config.json @@ -0,0 +1,32 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1024, + "seed": 3407, + "activation_dim": 2048, + "dict_size": 16384, + "k": 320, + "device": "cuda:0", + "layer": 17, + "lm_name": "Qwen/Qwen2.5-3B", + "wandb_name": "BatchTopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_15", + "submodule_name": "resid_post_layer_17" + }, + "buffer": { + "d_submodule": 2048, + "io": "out", + "n_ctxs": 122, + "ctx_len": 2048, + "refresh_batch_size": 4, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_15/eval_results.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_15/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..e5c007834ca9f45e4a5d9a4fec457f3a6288a456 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_15/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 26.280625, "l1_loss": 1028.06, "l0": 452.7045849609375, "frac_variance_explained": 0.453125, "cossim": 0.94921875, "l2_ratio": 0.9529296875, "relative_reconstruction_bias": 1.596015625, "frac_alive": 0.79998779296875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_16/ae.pt b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_16/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea8d331284842a52ce5fa70b8da764ce8eb833e5 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_16/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee2a19f8efa063e40c3653e33345d62451b3ad047f3e4585bf07ba0db491ee68 +size 268511254 diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_16/config.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_16/config.json new file mode 100644 index 0000000000000000000000000000000000000000..332de89e5455df1229134137ffaec99cf924a0a8 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_16/config.json @@ -0,0 +1,32 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1024, + "seed": 3407, + "activation_dim": 2048, + "dict_size": 16384, + "k": 520, + "device": "cuda:0", + "layer": 17, + "lm_name": "Qwen/Qwen2.5-3B", + "wandb_name": "BatchTopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_16", + "submodule_name": "resid_post_layer_17" + }, + "buffer": { + "d_submodule": 2048, + "io": "out", + "n_ctxs": 122, + "ctx_len": 2048, + "refresh_batch_size": 4, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_16/eval_results.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_16/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..2327d89f6badb7057c9a44aaa7c9381e1c0e17a7 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_16/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 21.13875, "l1_loss": 1909.0, "l0": 685.9320922851563, "frac_variance_explained": 0.791875, "cossim": 0.961171875, "l2_ratio": 0.9586328125, "relative_reconstruction_bias": 1.278515625, "frac_alive": 0.527099609375, "hyperparameters": {"n_inputs": 200, "context_length": 2048}} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_17/ae.pt b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_17/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..782de4c3abe96b24f94aed2c5f56baf9f70aaf92 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_17/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd0757a3307159a6ab182d22337f21511e5888b585a7f4df61ca912a4f14af32 +size 268511254 diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_17/config.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_17/config.json new file mode 100644 index 0000000000000000000000000000000000000000..599aaebd2b98db85e3b585a9fc4a33d5038788ec --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_17/config.json @@ -0,0 +1,32 @@ +{ + "trainer": { + "trainer_class": "BatchTopKTrainer", + "dict_class": "BatchTopKSAE", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "top_k_aux": 1024, + "seed": 3407, + "activation_dim": 2048, + "dict_size": 16384, + "k": 820, + "device": "cuda:0", + "layer": 17, + "lm_name": "Qwen/Qwen2.5-3B", + "wandb_name": "BatchTopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_17", + "submodule_name": "resid_post_layer_17" + }, + "buffer": { + "d_submodule": 2048, + "io": "out", + "n_ctxs": 122, + "ctx_len": 2048, + "refresh_batch_size": 4, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_17/eval_results.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_17/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..91e56e433bd3510f83dc792dffeafe68282bb88d --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_17/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 15.2528125, "l1_loss": 2843.04, "l0": 1031.6360986328125, "frac_variance_explained": 0.9468359375, "cossim": 0.9765625, "l2_ratio": 0.968359375, "relative_reconstruction_bias": 1.071953125, "frac_alive": 0.301025390625, "hyperparameters": {"n_inputs": 200, "context_length": 2048}} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_2/ae.pt b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_2/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..4820195834bd5565aff2d522dc611e47d134808f --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_2/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9da5396e68d059eea837520a335f7ad87438c4012a2c12281f5f062f589e6cce +size 268510888 diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_2/config.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_2/config.json new file mode 100644 index 0000000000000000000000000000000000000000..9c129124a83a8b1e816053638191200303299a62 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_2/config.json @@ -0,0 +1,29 @@ +{ + "trainer": { + "dict_class": "AutoEncoder", + "trainer_class": "StandardTrainerAprilUpdate", + "activation_dim": 2048, + "dict_size": 16384, + "lr": 0.0003, + "l1_penalty": 0.02, + "warmup_steps": 1000, + "sparsity_warmup_steps": 5000, + "steps": 244140, + "decay_start": 195312, + "seed": 3407, + "device": "cuda:0", + "layer": 17, + "lm_name": "Qwen/Qwen2.5-3B", + "wandb_name": "StandardTrainerNew-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_2", + "submodule_name": "resid_post_layer_17" + }, + "buffer": { + "d_submodule": 2048, + "io": "out", + "n_ctxs": 122, + "ctx_len": 2048, + "refresh_batch_size": 4, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_2/eval_results.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_2/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..de7df2ddceaf94d7652fcab2599729182ef4dca4 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_2/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 41.816875, "l1_loss": 535.65, "l0": 496.90281494140623, "frac_variance_explained": -2.56572265625, "cossim": 0.90685546875, "l2_ratio": 0.89265625, "relative_reconstruction_bias": 4.47685546875, "frac_alive": 0.84442138671875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_3/ae.pt b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_3/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d170b5d3eb6267b0e3f306071cbb43222a130aa --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_3/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c436edbd3b036ae240edece29e375e791805c7ef1ab7c7493006d8b4df11031c +size 268510888 diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_3/config.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_3/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c23bd8a41a54175a1c3b629696b00b30d57e7af0 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_3/config.json @@ -0,0 +1,29 @@ +{ + "trainer": { + "dict_class": "AutoEncoder", + "trainer_class": "StandardTrainerAprilUpdate", + "activation_dim": 2048, + "dict_size": 16384, + "lr": 0.0003, + "l1_penalty": 0.03, + "warmup_steps": 1000, + "sparsity_warmup_steps": 5000, + "steps": 244140, + "decay_start": 195312, + "seed": 3407, + "device": "cuda:0", + "layer": 17, + "lm_name": "Qwen/Qwen2.5-3B", + "wandb_name": "StandardTrainerNew-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_3", + "submodule_name": "resid_post_layer_17" + }, + "buffer": { + "d_submodule": 2048, + "io": "out", + "n_ctxs": 122, + "ctx_len": 2048, + "refresh_batch_size": 4, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_3/eval_results.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_3/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..13702bfa2beb76907cc4f37ee8de03143a1abd74 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_3/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 48.4025, "l1_loss": 464.59, "l0": 241.1419287109375, "frac_variance_explained": -4.48306640625, "cossim": 0.8783203125, "l2_ratio": 0.85603515625, "relative_reconstruction_bias": 6.0671484375, "frac_alive": 0.8358154296875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_4/ae.pt b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_4/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..77df0a033feacd801f64493da0ab244cce5fb27a --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_4/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bc90a59829f18b3a66df8130f83d72273199c4ce7c1d6c60e0cf6dd55dae725 +size 268510888 diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_4/config.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_4/config.json new file mode 100644 index 0000000000000000000000000000000000000000..00530a7ead047dee2fb9c230e17df0211ea2d498 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_4/config.json @@ -0,0 +1,29 @@ +{ + "trainer": { + "dict_class": "AutoEncoder", + "trainer_class": "StandardTrainerAprilUpdate", + "activation_dim": 2048, + "dict_size": 16384, + "lr": 0.0003, + "l1_penalty": 0.04, + "warmup_steps": 1000, + "sparsity_warmup_steps": 5000, + "steps": 244140, + "decay_start": 195312, + "seed": 3407, + "device": "cuda:0", + "layer": 17, + "lm_name": "Qwen/Qwen2.5-3B", + "wandb_name": "StandardTrainerNew-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_4", + "submodule_name": "resid_post_layer_17" + }, + "buffer": { + "d_submodule": 2048, + "io": "out", + "n_ctxs": 122, + "ctx_len": 2048, + "refresh_batch_size": 4, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_4/eval_results.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_4/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..e1cac45b6a0749a8fd9a160c7a41a33fd8764550 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_4/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 53.45625, "l1_loss": 432.64, "l0": 149.82208251953125, "frac_variance_explained": -6.97576171875, "cossim": 0.85923828125, "l2_ratio": 0.8296484375, "relative_reconstruction_bias": 7.51521484375, "frac_alive": 0.83062744140625, "hyperparameters": {"n_inputs": 200, "context_length": 2048}} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_5/ae.pt b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_5/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..7588e56767415ba6f597e9e4054d3689d039be21 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_5/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28c5696a2ecfbf2359218fa97544779c1d61d3624e39e003962ad4474d62371f +size 268510888 diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_5/config.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_5/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a1fdfbdc38df209141a2d7175eed03f27eea16c1 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_5/config.json @@ -0,0 +1,29 @@ +{ + "trainer": { + "dict_class": "AutoEncoder", + "trainer_class": "StandardTrainerAprilUpdate", + "activation_dim": 2048, + "dict_size": 16384, + "lr": 0.0003, + "l1_penalty": 0.06, + "warmup_steps": 1000, + "sparsity_warmup_steps": 5000, + "steps": 244140, + "decay_start": 195312, + "seed": 3407, + "device": "cuda:0", + "layer": 17, + "lm_name": "Qwen/Qwen2.5-3B", + "wandb_name": "StandardTrainerNew-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_5", + "submodule_name": "resid_post_layer_17" + }, + "buffer": { + "d_submodule": 2048, + "io": "out", + "n_ctxs": 122, + "ctx_len": 2048, + "refresh_batch_size": 4, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_5/eval_results.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_5/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..133d615696edda70751ad99c308a030cacf836be --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_5/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 60.575, "l1_loss": 426.7925, "l0": 80.94847900390624, "frac_variance_explained": -10.817421875, "cossim": 0.832265625, "l2_ratio": 0.79265625, "relative_reconstruction_bias": 9.3669921875, "frac_alive": 0.823486328125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_6/ae.pt b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_6/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..433c3af3f14d1d97bfecfc946fee9b958f01e374 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_6/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66be800a8f2d8320361cff355cc1d088d8fed53b850abce16af2f04e3c2be36e +size 268511254 diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_6/config.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_6/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c12fb5ee1c9121602f716d5e350132a90efcfb9b --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_6/config.json @@ -0,0 +1,31 @@ +{ + "trainer": { + "trainer_class": "TopKTrainer", + "dict_class": "AutoEncoderTopK", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "seed": 3407, + "activation_dim": 2048, + "dict_size": 16384, + "k": 50, + "device": "cuda:0", + "layer": 17, + "lm_name": "Qwen/Qwen2.5-3B", + "wandb_name": "TopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_6", + "submodule_name": "resid_post_layer_17" + }, + "buffer": { + "d_submodule": 2048, + "io": "out", + "n_ctxs": 122, + "ctx_len": 2048, + "refresh_batch_size": 4, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_6/eval_results.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_6/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..5ce99af6cc07383aa51a94ef9b75061b4683b70a --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_6/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 36.115, "l1_loss": 225.495, "l0": 50.0, "frac_variance_explained": 0.8665625, "cossim": 0.8618359375, "l2_ratio": 0.8702734375, "relative_reconstruction_bias": 1.028359375, "frac_alive": 0.97332763671875, "hyperparameters": {"n_inputs": 200, "context_length": 2048}} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_7/ae.pt b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_7/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c995349a04993afed7d1a90551d85a18b866e2c --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_7/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2bd0f50e52e30e6408603f136145e7afee6b43b070709be09fbea3469a97169 +size 268511254 diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_7/config.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_7/config.json new file mode 100644 index 0000000000000000000000000000000000000000..0dfaf0a07d7f650934164c1384a1a7df336f3e86 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_7/config.json @@ -0,0 +1,31 @@ +{ + "trainer": { + "trainer_class": "TopKTrainer", + "dict_class": "AutoEncoderTopK", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "seed": 3407, + "activation_dim": 2048, + "dict_size": 16384, + "k": 80, + "device": "cuda:0", + "layer": 17, + "lm_name": "Qwen/Qwen2.5-3B", + "wandb_name": "TopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_7", + "submodule_name": "resid_post_layer_17" + }, + "buffer": { + "d_submodule": 2048, + "io": "out", + "n_ctxs": 122, + "ctx_len": 2048, + "refresh_batch_size": 4, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_7/eval_results.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_7/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..f0f3aa9a200506067e9a0ffb0cbff48b76a64d7e --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_7/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 33.834375, "l1_loss": 293.98, "l0": 80.0, "frac_variance_explained": 0.88892578125, "cossim": 0.8798828125, "l2_ratio": 0.88625, "relative_reconstruction_bias": 1.0291015625, "frac_alive": 0.990234375, "hyperparameters": {"n_inputs": 200, "context_length": 2048}} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_8/ae.pt b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_8/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..53510127f9569a5c12be65586bb0d2bc5300546f --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_8/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:233cbd2e4493010b3bc05b126cf4ae32bc159cb29ae12590528dddfbfdd0a9e1 +size 268511254 diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_8/config.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_8/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e03bd1f8fabf488cd60056ace6f59541c0e36818 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_8/config.json @@ -0,0 +1,31 @@ +{ + "trainer": { + "trainer_class": "TopKTrainer", + "dict_class": "AutoEncoderTopK", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "seed": 3407, + "activation_dim": 2048, + "dict_size": 16384, + "k": 160, + "device": "cuda:0", + "layer": 17, + "lm_name": "Qwen/Qwen2.5-3B", + "wandb_name": "TopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_8", + "submodule_name": "resid_post_layer_17" + }, + "buffer": { + "d_submodule": 2048, + "io": "out", + "n_ctxs": 122, + "ctx_len": 2048, + "refresh_batch_size": 4, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_8/eval_results.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_8/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..829a985360918299dc9a04e7d5d6adc1ef6a2afd --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_8/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 30.47875, "l1_loss": 467.69, "l0": 160.0, "frac_variance_explained": 0.885390625, "cossim": 0.905859375, "l2_ratio": 0.9115625, "relative_reconstruction_bias": 1.0475, "frac_alive": 0.99066162109375, "hyperparameters": {"n_inputs": 200, "context_length": 2048}} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_9/ae.pt b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_9/ae.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca155a32696145406ae7358b648f8504202e8b34 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_9/ae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4dbcffbaa046b2ae6003d27dd331b982d78513e056353b1da6cc4524522d741 +size 268511254 diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_9/config.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_9/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a35c510047bfaad0ff2dcb0823bf39d8aabc24d9 --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_9/config.json @@ -0,0 +1,31 @@ +{ + "trainer": { + "trainer_class": "TopKTrainer", + "dict_class": "AutoEncoderTopK", + "lr": 0.0003, + "steps": 244140, + "auxk_alpha": 0.03125, + "warmup_steps": 1000, + "decay_start": 195312, + "threshold_beta": 0.999, + "threshold_start_step": 1000, + "seed": 3407, + "activation_dim": 2048, + "dict_size": 16384, + "k": 320, + "device": "cuda:0", + "layer": 17, + "lm_name": "Qwen/Qwen2.5-3B", + "wandb_name": "TopKTrainer-Qwen/Qwen2.5-3B-resid_post_layer_17_trainer_9", + "submodule_name": "resid_post_layer_17" + }, + "buffer": { + "d_submodule": 2048, + "io": "out", + "n_ctxs": 122, + "ctx_len": 2048, + "refresh_batch_size": 4, + "out_batch_size": 2048, + "device": "cuda:0" + } +} \ No newline at end of file diff --git a/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_9/eval_results.json b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_9/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..e2a33d9d8ddad3f8865837b280502ee251f540ef --- /dev/null +++ b/saes_Qwen_Qwen2.5-3B_batch_top_k_top_k_standard_new/resid_post_layer_17/trainer_9/eval_results.json @@ -0,0 +1 @@ +{"l2_loss": 24.574375, "l1_loss": 913.22, "l0": 320.0, "frac_variance_explained": 0.935078125, "cossim": 0.938671875, "l2_ratio": 0.94154296875, "relative_reconstruction_bias": 1.036640625, "frac_alive": 0.95147705078125, "hyperparameters": {"n_inputs": 200, "context_length": 2048}} \ No newline at end of file