Initial commit of 100M training tokens on gpt2-small, pythia-160m-deduped, opt-125m
f7a232d
Training config:

```yaml
activation_function: topk
adam_beta1: 0.9
adam_beta2: 0.999
b_dec_init_method: zeros
cached_activations_path: null
checkpoint_path: ./outputs/checkpoints
clip_grad_norm: true
context_size: 128
custom_loss: null
d_in: 768
d_out: null
dataset: Skylion007/openwebtext
dense_loss_coefficient: 0
device: cuda
different_output: false
dtype: float32
epsilon_l0_approx: 0.1
eval_frequency: 500
expansion_factor: 32
feature_reinit_scale: 0.2
feature_resampling_method: null
fine_tune_dataset: false
finetuning_steps: !!python/tuple
- 1000
flatten_activations_over_layer: false
flatten_activations_over_layer_output: false
from_pretrained_path: null
hook_point: blocks.8.hook_resid_pre
hook_point_head_index: null
hook_point_head_index_output: null
hook_point_layer: 8
hook_point_layer_output: null
hook_point_output: null
initial_decoder_norm: 0.1
initialise_encoder_to_decoder_transpose: true
is_dataset_tokenized: false
l0_coefficient: 0
l0_warmup: false
l0_warmup_steps: 1000
l1_coefficient: 0
l1_warmup: false
l1_warmup_steps: 1000
log_to_wandb: true
loop_dataset: false
lr: 4.0e-05
lr_scheduler_name: constant_with_warmup
lr_warm_up_steps: 2000
max_resample_step: 100000
max_sparsity_target: 1
min_sparsity_for_resample: 0
min_sparsity_target: 1.0e-05
model_name: gpt2-small
mse_loss_coefficient: 1
mse_loss_type: centered
multiple_runs: false
n_batches_in_store_buffer: 64
n_checkpoints: 5
n_running_sparsity: 300
n_starting_steps: null
normalise_initial_decoder_weights: false
normalise_w_dec: true
remove_bos_tokens: false
resample_batches: 128
resample_frequency: 25000
scale_input_norm: false
seed: 42
sparse_loss_coefficient: 0.0001
sparsity_log_frequency: 5000
store_batch_size: 8
subtract_b_dec_from_inputs: false
topk_amount: 20
total_training_steps: 24414
train_batch_size: 4096
use_cached_activations: false
use_gated_sparse_autoencoder: false
wandb_log_frequency: 10
wandb_project: test_topk
weight_l1_by_decoder_norms: false
```
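
For reference, `total_training_steps * train_batch_size = 24414 * 4096 ≈ 100M` tokens, matching the commit message. The sketch below is a minimal PyTorch illustration of the TopK autoencoder these hyperparameters describe: `d_in: 768` with `expansion_factor: 32` gives 24576 latents, `topk_amount: 20` active latents per token, `b_dec` initialised to zeros, and the encoder initialised to the decoder transpose. The class name `TopKSAE`, the `centered_mse_loss` helper, and the exact reading of `mse_loss_type: centered` are assumptions for illustration, not this repo's actual API.

```python
import torch
import torch.nn as nn


class TopKSAE(nn.Module):
    """Minimal TopK sparse autoencoder matching the shapes in the config above."""

    def __init__(self, d_in: int = 768, expansion_factor: int = 32,
                 k: int = 20, initial_decoder_norm: float = 0.1):
        super().__init__()
        d_sae = d_in * expansion_factor               # 768 * 32 = 24576 latents
        self.k = k                                    # topk_amount: 20 active latents per token
        self.b_dec = nn.Parameter(torch.zeros(d_in))  # b_dec_init_method: zeros
        self.b_enc = nn.Parameter(torch.zeros(d_sae))
        w_dec = torch.randn(d_sae, d_in)
        # initial_decoder_norm: 0.1 -- scale each decoder row to a small fixed norm
        w_dec = w_dec / w_dec.norm(dim=-1, keepdim=True) * initial_decoder_norm
        self.W_dec = nn.Parameter(w_dec)
        # initialise_encoder_to_decoder_transpose: true
        self.W_enc = nn.Parameter(w_dec.t().clone())

    def forward(self, x: torch.Tensor):
        # subtract_b_dec_from_inputs is false, so raw activations are encoded directly
        pre_acts = x @ self.W_enc + self.b_enc
        topk = torch.topk(pre_acts, self.k, dim=-1)
        # keep only the top-k (ReLU'd) pre-activations per token, zero the rest
        acts = torch.zeros_like(pre_acts).scatter_(
            -1, topk.indices, torch.relu(topk.values)
        )
        recon = acts @ self.W_dec + self.b_dec
        return recon, acts


def centered_mse_loss(recon: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
    # Assumed reading of mse_loss_type: centered -- squared error normalised by the
    # squared norm of the mean-centered target.
    mu = x.mean(dim=0, keepdim=True)
    num = (recon - x).pow(2).sum(dim=-1)
    den = (x - mu).pow(2).sum(dim=-1) + 1e-8
    return (num / den).mean()


if __name__ == "__main__":
    sae = TopKSAE()
    x = torch.randn(8, 768)          # a small batch of residual-stream activations (d_in = 768)
    recon, acts = sae(x)
    print(centered_mse_loss(recon, x).item(),
          (acts != 0).float().sum(dim=-1).mean().item())  # L0 per token should be <= 20
```

With a TopK activation the sparsity level is fixed architecturally, which is consistent with `l0_coefficient` and `l1_coefficient` both being 0 in the config above.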