Initial commit of 100M training tokens on gpt2-small, pythia-160m-deduped, opt-125m
f7a232d
Training config:

```yaml
activation_function: topk
adam_beta1: 0.9
adam_beta2: 0.999
b_dec_init_method: zeros
cached_activations_path: null
checkpoint_path: ./outputs/checkpoints
clip_grad_norm: true
context_size: 128
custom_loss: null
d_in: 768
d_out: null
dataset: Skylion007/openwebtext
dense_loss_coefficient: 0
device: cuda
different_output: false
dtype: float32
epsilon_l0_approx: 0.1
eval_frequency: 500
expansion_factor: 32
feature_reinit_scale: 0.2
feature_resampling_method: null
fine_tune_dataset: false
finetuning_steps: !!python/tuple
- 1000
flatten_activations_over_layer: false
flatten_activations_over_layer_output: false
from_pretrained_path: null
hook_point: blocks.8.hook_resid_pre
hook_point_head_index: null
hook_point_head_index_output: null
hook_point_layer: 8
hook_point_layer_output: null
hook_point_output: null
initial_decoder_norm: 0.1
initialise_encoder_to_decoder_transpose: true
is_dataset_tokenized: false
l0_coefficient: 0
l0_warmup: false
l0_warmup_steps: 1000
l1_coefficient: 0
l1_warmup: false
l1_warmup_steps: 1000
log_to_wandb: true
loop_dataset: false
lr: 4.0e-05
lr_scheduler_name: constant_with_warmup
lr_warm_up_steps: 2000
max_resample_step: 100000
max_sparsity_target: 1
min_sparsity_for_resample: 0
min_sparsity_target: 1.0e-05
model_name: gpt2-small
mse_loss_coefficient: 1
mse_loss_type: centered
multiple_runs: false
n_batches_in_store_buffer: 64
n_checkpoints: 5
n_running_sparsity: 300
n_starting_steps: null
normalise_initial_decoder_weights: false
normalise_w_dec: true
remove_bos_tokens: false
resample_batches: 128
resample_frequency: 25000
scale_input_norm: false
seed: 42
sparse_loss_coefficient: 0.0001
sparsity_log_frequency: 5000
store_batch_size: 8
subtract_b_dec_from_inputs: false
topk_amount: 20
total_training_steps: 24414
train_batch_size: 4096
use_cached_activations: false
use_gated_sparse_autoencoder: false
wandb_log_frequency: 10
wandb_project: test_topk
weight_l1_by_decoder_norms: false
```
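
For reference, `total_training_steps * train_batch_size = 24414 * 4096 ≈ 100M` tokens, matching the commit message. The sketch below is a minimal PyTorch illustration of the TopK autoencoder these hyperparameters describe: `d_in: 768` with `expansion_factor: 32` gives 24576 latents, `topk_amount: 20` active latents per token, `b_dec` initialised to zeros, and the encoder initialised to the decoder transpose. The class name `TopKSAE`, the `centered_mse_loss` helper, and the exact reading of `mse_loss_type: centered` are assumptions for illustration, not this repo's actual API.

```python
import torch
import torch.nn as nn


class TopKSAE(nn.Module):
    """Minimal TopK sparse autoencoder matching the shapes in the config above."""

    def __init__(self, d_in: int = 768, expansion_factor: int = 32,
                 k: int = 20, initial_decoder_norm: float = 0.1):
        super().__init__()
        d_sae = d_in * expansion_factor               # 768 * 32 = 24576 latents
        self.k = k                                    # topk_amount: 20 active latents per token
        self.b_dec = nn.Parameter(torch.zeros(d_in))  # b_dec_init_method: zeros
        self.b_enc = nn.Parameter(torch.zeros(d_sae))
        w_dec = torch.randn(d_sae, d_in)
        # initial_decoder_norm: 0.1 -- scale each decoder row to a small fixed norm
        w_dec = w_dec / w_dec.norm(dim=-1, keepdim=True) * initial_decoder_norm
        self.W_dec = nn.Parameter(w_dec)
        # initialise_encoder_to_decoder_transpose: true
        self.W_enc = nn.Parameter(w_dec.t().clone())

    def forward(self, x: torch.Tensor):
        # subtract_b_dec_from_inputs is false, so raw activations are encoded directly
        pre_acts = x @ self.W_enc + self.b_enc
        topk = torch.topk(pre_acts, self.k, dim=-1)
        # keep only the top-k (ReLU'd) pre-activations per token, zero the rest
        acts = torch.zeros_like(pre_acts).scatter_(
            -1, topk.indices, torch.relu(topk.values)
        )
        recon = acts @ self.W_dec + self.b_dec
        return recon, acts


def centered_mse_loss(recon: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
    # Assumed reading of mse_loss_type: centered -- squared error normalised by the
    # squared norm of the mean-centered target.
    mu = x.mean(dim=0, keepdim=True)
    num = (recon - x).pow(2).sum(dim=-1)
    den = (x - mu).pow(2).sum(dim=-1) + 1e-8
    return (num / den).mean()


if __name__ == "__main__":
    sae = TopKSAE()
    x = torch.randn(8, 768)          # a small batch of residual-stream activations (d_in = 768)
    recon, acts = sae(x)
    print(centered_mse_loss(recon, x).item(),
          (acts != 0).float().sum(dim=-1).mean().item())  # L0 per token should be <= 20
```

With a TopK activation the sparsity level is fixed architecturally, which is consistent with `l0_coefficient` and `l1_coefficient` both being 0 in the config above.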