---
# Training configuration for a sparse autoencoder (SAE) on gemma-2b-it
# activations (hook blocks.9.hook_resid_pre), logged to the
# `test_gemma_2b` W&B project. Keys are kept in alphabetical order for
# diffability; all values are unchanged from the original dump.

# --- architecture / activation source ---
activation_function: relu
b_dec_init_method: zeros
d_in: 2048
d_out: null
expansion_factor: 8
hook_point: blocks.9.hook_resid_pre
hook_point_head_index: null
hook_point_head_index_output: null
hook_point_layer: 9
hook_point_layer_output: null
hook_point_output: null
model_name: gemma-2b-it
use_gated_sparse_autoencoder: false

# --- optimizer ---
adam_beta1: 0.9
adam_beta2: 0.999
clip_grad_norm: false
lr: 0.0004
lr_scheduler_name: constant
lr_warm_up_steps: 500

# --- data / activation store ---
cached_activations_path: null
context_size: 1024
dataset: Skylion007/openwebtext
is_dataset_tokenized: false
loop_dataset: false
n_batches_in_store_buffer: 128
remove_bos_tokens: false
store_batch_size: 2
use_cached_activations: false

# --- loss terms ---
custom_loss: null
dense_loss_coefficient: 0
epsilon_l0_approx: 0.2
l0_coefficient: 2.0e-05
l0_warmup: false
l0_warmup_steps: 1000
l1_coefficient: 0
l1_warmup: false
l1_warmup_steps: 1000
mse_loss_coefficient: 1
mse_loss_type: centered
sparse_loss_coefficient: 1.0e-06
weight_l1_by_decoder_norms: false

# --- decoder / input normalisation ---
different_output: false
flatten_activations_over_layer: false
flatten_activations_over_layer_output: false
initial_decoder_norm: 0.1
initialise_encoder_to_decoder_transpose: false
normalise_initial_decoder_weights: false
normalise_w_dec: true
scale_input_norm: false
subtract_b_dec_from_inputs: false
topk_amount: 10

# --- feature resampling ---
feature_reinit_scale: 0.2
feature_resampling_method: null
max_resample_step: 100000
max_sparsity_target: 1
min_sparsity_for_resample: 0
min_sparsity_target: 1.0e-05
n_running_sparsity: 300
resample_batches: 128
resample_frequency: 25000

# --- fine-tuning ---
fine_tune_dataset: false
# NOTE(review): !!python/tuple is a PyYAML-specific tag — this file will
# NOT load with yaml.safe_load (requires yaml.load with an unsafe/full
# loader). Tag kept to preserve consumer semantics (tuple, not list);
# consider switching to a plain sequence if the consumer accepts a list.
finetuning_steps: !!python/tuple
  - 1000

# --- run / checkpointing / logging ---
checkpoint_path: ../outputs/checkpoints
device: cuda
dtype: float32
eval_frequency: 500
from_pretrained_path: null
log_to_wandb: true
multiple_runs: false
n_checkpoints: 160
n_starting_steps: null
seed: 42
sparsity_log_frequency: 5000
total_training_steps: 200000
train_batch_size: 4096
wandb_log_frequency: 10
wandb_project: test_gemma_2b