---
# Training configuration for a top-k sparse autoencoder on GPT-2 small
# (residual stream, block 8), trained on OpenWebText activations.
# NOTE(review): reconstructed into block YAML — the original file had every
# key collapsed onto a single line, which is not parseable YAML. All keys
# and values are preserved verbatim in their original (alphabetical) order.
activation_function: topk
adam_beta1: 0.9
adam_beta2: 0.999
b_dec_init_method: zeros
cached_activations_path: null
checkpoint_path: ./outputs/checkpoints
clip_grad_norm: true
context_size: 128
custom_loss: null
d_in: 768
d_out: null
dataset: Skylion007/openwebtext
dense_loss_coefficient: 0
device: cuda
different_output: false
dtype: float32
epsilon_l0_approx: 0.1
eval_frequency: 500
expansion_factor: 32
feature_reinit_scale: 0.2
feature_resampling_method: null
fine_tune_dataset: false
# NOTE(review): the !!python/tuple tag is PyYAML-specific and is rejected by
# yaml.safe_load (requires an unsafe/full loader). If the consumer can accept
# a plain list, dropping the tag would make this file portable — confirm
# against the loading code before changing.
finetuning_steps: !!python/tuple
  - 1000
flatten_activations_over_layer: false
flatten_activations_over_layer_output: false
from_pretrained_path: null
hook_point: blocks.8.hook_resid_pre
hook_point_head_index: null
hook_point_head_index_output: null
hook_point_layer: 8
hook_point_layer_output: null
hook_point_output: null
initial_decoder_norm: 0.1
initialise_encoder_to_decoder_transpose: true
is_dataset_tokenized: false
l0_coefficient: 0
l0_warmup: false
l0_warmup_steps: 1000
l1_coefficient: 0
l1_warmup: false
l1_warmup_steps: 1000
log_to_wandb: true
loop_dataset: false
lr: 4.0e-05
lr_scheduler_name: constant_with_warmup
lr_warm_up_steps: 2000
max_resample_step: 100000
max_sparsity_target: 1
min_sparsity_for_resample: 0
min_sparsity_target: 1.0e-05
model_name: gpt2-small
mse_loss_coefficient: 1
mse_loss_type: centered
multiple_runs: false
n_batches_in_store_buffer: 64
n_checkpoints: 5
n_running_sparsity: 300
n_starting_steps: null
normalise_initial_decoder_weights: false
normalise_w_dec: true
remove_bos_tokens: false
resample_batches: 128
resample_frequency: 25000
scale_input_norm: false
seed: 42
sparse_loss_coefficient: 0.0001
sparsity_log_frequency: 5000
store_batch_size: 8
subtract_b_dec_from_inputs: false
topk_amount: 20
total_training_steps: 24414
train_batch_size: 4096
use_cached_activations: false
use_gated_sparse_autoencoder: false
wandb_log_frequency: 10
wandb_project: test_topk
weight_l1_by_decoder_norms: false