---
# Training configuration for a sparse autoencoder (SAE) on gemma-2b-it
# activations (hook blocks.9.hook_resid_pre), logged to the
# `test_gemma_2b` W&B project. Keys are kept in alphabetical order for
# diffability; all values are unchanged from the original dump.

# --- architecture / activation source ---
activation_function: relu
b_dec_init_method: zeros
d_in: 2048
d_out: null
expansion_factor: 8
hook_point: blocks.9.hook_resid_pre
hook_point_head_index: null
hook_point_head_index_output: null
hook_point_layer: 9
hook_point_layer_output: null
hook_point_output: null
model_name: gemma-2b-it
use_gated_sparse_autoencoder: false

# --- optimizer ---
adam_beta1: 0.9
adam_beta2: 0.999
clip_grad_norm: false
lr: 0.0004
lr_scheduler_name: constant
lr_warm_up_steps: 500

# --- data / activation store ---
cached_activations_path: null
context_size: 1024
dataset: Skylion007/openwebtext
is_dataset_tokenized: false
loop_dataset: false
n_batches_in_store_buffer: 128
remove_bos_tokens: false
store_batch_size: 2
use_cached_activations: false

# --- loss terms ---
custom_loss: null
dense_loss_coefficient: 0
epsilon_l0_approx: 0.2
l0_coefficient: 2.0e-05
l0_warmup: false
l0_warmup_steps: 1000
l1_coefficient: 0
l1_warmup: false
l1_warmup_steps: 1000
mse_loss_coefficient: 1
mse_loss_type: centered
sparse_loss_coefficient: 1.0e-06
weight_l1_by_decoder_norms: false

# --- decoder / input normalisation ---
different_output: false
flatten_activations_over_layer: false
flatten_activations_over_layer_output: false
initial_decoder_norm: 0.1
initialise_encoder_to_decoder_transpose: false
normalise_initial_decoder_weights: false
normalise_w_dec: true
scale_input_norm: false
subtract_b_dec_from_inputs: false
topk_amount: 10

# --- feature resampling ---
feature_reinit_scale: 0.2
feature_resampling_method: null
max_resample_step: 100000
max_sparsity_target: 1
min_sparsity_for_resample: 0
min_sparsity_target: 1.0e-05
n_running_sparsity: 300
resample_batches: 128
resample_frequency: 25000

# --- fine-tuning ---
fine_tune_dataset: false
# NOTE(review): !!python/tuple is a PyYAML-specific tag — this file will
# NOT load with yaml.safe_load (requires yaml.load with an unsafe/full
# loader). Tag kept to preserve consumer semantics (tuple, not list);
# consider switching to a plain sequence if the consumer accepts a list.
finetuning_steps: !!python/tuple
  - 1000

# --- run / checkpointing / logging ---
checkpoint_path: ../outputs/checkpoints
device: cuda
dtype: float32
eval_frequency: 500
from_pretrained_path: null
log_to_wandb: true
multiple_runs: false
n_checkpoints: 160
n_starting_steps: null
seed: 42
sparsity_log_frequency: 5000
total_training_steps: 200000
train_batch_size: 4096
wandb_log_frequency: 10
wandb_project: test_gemma_2b