Trains Llama-2-7b resid-pre-16 SAEs on up to 200M tokens of OpenWebText

Browse files

Files changed (5) hide show

lunar-fire-36_llama/hyperparameters.yaml +75 -0
lunar-fire-36_llama/sparse_autoencoder/Llama-2-7b-hf_blocks.16.hook_mlp_out_s32768_100M_tokens_openwebtext.pt +3 -0
lunar-fire-36_llama/sparse_autoencoder/Llama-2-7b-hf_blocks.16.hook_mlp_out_s32768_150M_tokens_openwebtext.pt +3 -0
lunar-fire-36_llama/sparse_autoencoder/Llama-2-7b-hf_blocks.16.hook_mlp_out_s32768_200M_tokens_openwebtext.pt +3 -0
lunar-fire-36_llama/sparse_autoencoder/Llama-2-7b-hf_blocks.16.hook_mlp_out_s32768_50M_tokens_openwebtext.pt +3 -0

lunar-fire-36_llama/hyperparameters.yaml ADDED Viewed

	@@ -0,0 +1,75 @@

+adam_beta1: 0.9
+adam_beta2: 0.999
+b_dec_init_method: zeros
+cached_activations_path: null
+checkpoint_path: ./outputs/checkpoints
+clip_grad_norm: true
+context_size: 256
+custom_loss: null
+d_in: 4096
+d_out: null
+dataset: skylion007/openwebtext
+dense_loss_coefficient: 0
+device: cuda
+different_output: false
+dtype: float32
+epsilon_l0_approx: 0.5
+eval_frequency: 500
+expansion_factor: 8
+feature_reinit_scale: 0.2
+feature_resampling_method: null
+fine_tune_dataset: false
+finetuning_steps: !!python/tuple
+- 1000
+flatten_activations_over_layer: false
+flatten_activations_over_layer_output: false
+from_pretrained_path: null
+hook_point: blocks.16.hook_resid_pre
+hook_point_head_index: null
+hook_point_head_index_output: null
+hook_point_layer: 16
+hook_point_layer_output: null
+hook_point_output: null
+initial_decoder_norm: 0.1
+initialise_encoder_to_decoder_transpose: false
+is_dataset_tokenized: false
+l0_coefficient: 0
+l0_warmup: false
+l0_warmup_steps: 1000
+l1_coefficient: 5
+l1_warmup: true
+l1_warmup_steps: 5000
+log_to_wandb: true
+loop_dataset: false
+lr: 0.0001
+lr_scheduler_name: constant_with_warmup
+lr_warm_up_steps: 1000
+max_resample_step: 100000
+max_sparsity_target: 1
+min_sparsity_for_resample: 0
+min_sparsity_target: 0
+model_name: meta-llama/Llama-2-7b-hf
+mse_loss_coefficient: 1
+mse_loss_type: standard
+multiple_runs: false
+n_batches_in_store_buffer: 128
+n_checkpoints: 80
+n_running_sparsity: 500
+n_starting_steps: null
+normalise_initial_decoder_weights: false
+normalise_w_dec: false
+resample_batches: 128
+resample_frequency: 25000
+scale_input_norm: false
+seed: 42
+sparse_loss_coefficient: 0
+sparsity_log_frequency: 5000
+store_batch_size: 8
+subtract_b_dec_from_inputs: false
+total_training_steps: 73242
+train_batch_size: 4096
+use_cached_activations: false
+use_gated_sparse_autoencoder: false
+wandb_log_frequency: 10
+wandb_project: test_pythia-mlp
+weight_l1_by_decoder_norms: true

lunar-fire-36_llama/sparse_autoencoder/Llama-2-7b-hf_blocks.16.hook_mlp_out_s32768_100M_tokens_openwebtext.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cc4edf63c86d477956d054b87f0980666fac2db42f3b0e4cec7828e480e30145
+size 1073894352

lunar-fire-36_llama/sparse_autoencoder/Llama-2-7b-hf_blocks.16.hook_mlp_out_s32768_150M_tokens_openwebtext.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:77b86136732e028bea093921a33ca2648404d1f0670b9f77f56558154905d388
+size 1073894352

lunar-fire-36_llama/sparse_autoencoder/Llama-2-7b-hf_blocks.16.hook_mlp_out_s32768_200M_tokens_openwebtext.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e4c4316f423fba61eb21718ccfb4d06d35c19ef2b95ed996d10899769436f252
+size 1073894352

lunar-fire-36_llama/sparse_autoencoder/Llama-2-7b-hf_blocks.16.hook_mlp_out_s32768_50M_tokens_openwebtext.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:be6c2e053e06599c4a22f432dd13c1752e42629f17f60bf845343fb0afc9f105
+size 1073894344