---
# Training configuration for a top-k sparse autoencoder on GPT-2 small
# (residual stream, block 8), trained on OpenWebText activations.
# NOTE(review): reconstructed into block YAML — the original file had every
# key collapsed onto a single line, which is not parseable YAML. All keys
# and values are preserved verbatim in their original (alphabetical) order.
activation_function: topk
adam_beta1: 0.9
adam_beta2: 0.999
b_dec_init_method: zeros
cached_activations_path: null
checkpoint_path: ./outputs/checkpoints
clip_grad_norm: true
context_size: 128
custom_loss: null
d_in: 768
d_out: null
dataset: Skylion007/openwebtext
dense_loss_coefficient: 0
device: cuda
different_output: false
dtype: float32
epsilon_l0_approx: 0.1
eval_frequency: 500
expansion_factor: 32
feature_reinit_scale: 0.2
feature_resampling_method: null
fine_tune_dataset: false
# NOTE(review): the !!python/tuple tag is PyYAML-specific and is rejected by
# yaml.safe_load (requires an unsafe/full loader). If the consumer can accept
# a plain list, dropping the tag would make this file portable — confirm
# against the loading code before changing.
finetuning_steps: !!python/tuple
  - 1000
flatten_activations_over_layer: false
flatten_activations_over_layer_output: false
from_pretrained_path: null
hook_point: blocks.8.hook_resid_pre
hook_point_head_index: null
hook_point_head_index_output: null
hook_point_layer: 8
hook_point_layer_output: null
hook_point_output: null
initial_decoder_norm: 0.1
initialise_encoder_to_decoder_transpose: true
is_dataset_tokenized: false
l0_coefficient: 0
l0_warmup: false
l0_warmup_steps: 1000
l1_coefficient: 0
l1_warmup: false
l1_warmup_steps: 1000
log_to_wandb: true
loop_dataset: false
lr: 4.0e-05
lr_scheduler_name: constant_with_warmup
lr_warm_up_steps: 2000
max_resample_step: 100000
max_sparsity_target: 1
min_sparsity_for_resample: 0
min_sparsity_target: 1.0e-05
model_name: gpt2-small
mse_loss_coefficient: 1
mse_loss_type: centered
multiple_runs: false
n_batches_in_store_buffer: 64
n_checkpoints: 5
n_running_sparsity: 300
n_starting_steps: null
normalise_initial_decoder_weights: false
normalise_w_dec: true
remove_bos_tokens: false
resample_batches: 128
resample_frequency: 25000
scale_input_norm: false
seed: 42
sparse_loss_coefficient: 0.0001
sparsity_log_frequency: 5000
store_batch_size: 8
subtract_b_dec_from_inputs: false
topk_amount: 20
total_training_steps: 24414
train_batch_size: 4096
use_cached_activations: false
use_gated_sparse_autoencoder: false
wandb_log_frequency: 10
wandb_project: test_topk
weight_l1_by_decoder_norms: false