| activation_function: relu |
| adam_beta1: 0.9 |
| adam_beta2: 0.999 |
| b_dec_init_method: zeros |
| cached_activations_path: null |
| checkpoint_path: ../outputs/checkpoints |
| clip_grad_norm: false |
| context_size: 1024 |
| custom_loss: null |
| d_in: 2048 |
| d_out: null |
| dataset: Skylion007/openwebtext |
| dense_loss_coefficient: 0 |
| device: cuda |
| different_output: false |
| dtype: float32 |
| epsilon_l0_approx: 0.2 |
| eval_frequency: 500 |
| expansion_factor: 8 |
| feature_reinit_scale: 0.2 |
| feature_resampling_method: null |
| fine_tune_dataset: false |
| finetuning_steps: !!python/tuple |
| - 1000 |
| flatten_activations_over_layer: false |
| flatten_activations_over_layer_output: false |
| from_pretrained_path: null |
| hook_point: blocks.9.hook_resid_pre |
| hook_point_head_index: null |
| hook_point_head_index_output: null |
| hook_point_layer: 9 |
| hook_point_layer_output: null |
| hook_point_output: null |
| initial_decoder_norm: 0.1 |
| initialise_encoder_to_decoder_transpose: false |
| is_dataset_tokenized: false |
| l0_coefficient: 2.0e-05 |
| l0_warmup: false |
| l0_warmup_steps: 1000 |
| l1_coefficient: 0 |
| l1_warmup: false |
| l1_warmup_steps: 1000 |
| log_to_wandb: true |
| loop_dataset: false |
| lr: 0.0004 |
| lr_scheduler_name: constant |
| lr_warm_up_steps: 500 |
| max_resample_step: 100000 |
| max_sparsity_target: 1 |
| min_sparsity_for_resample: 0 |
| min_sparsity_target: 1.0e-05 |
| model_name: gemma-2b-it |
| mse_loss_coefficient: 1 |
| mse_loss_type: centered |
| multiple_runs: false |
| n_batches_in_store_buffer: 128 |
| n_checkpoints: 160 |
| n_running_sparsity: 300 |
| n_starting_steps: null |
| normalise_initial_decoder_weights: false |
| normalise_w_dec: true |
| remove_bos_tokens: false |
| resample_batches: 128 |
| resample_frequency: 25000 |
| scale_input_norm: false |
| seed: 42 |
| sparse_loss_coefficient: 1.0e-06 |
| sparsity_log_frequency: 5000 |
| store_batch_size: 2 |
| subtract_b_dec_from_inputs: false |
| topk_amount: 10 |
| total_training_steps: 200000 |
| train_batch_size: 4096 |
| use_cached_activations: false |
| use_gated_sparse_autoencoder: false |
| wandb_log_frequency: 10 |
| wandb_project: test_gemma_2b |
| weight_l1_by_decoder_norms: false |