{
  "model_name": "google/gemma-2-2b",
  "layer": 12,
  "hook_point": "resid_post",
  "act_size": 2304,
  "sae_type": "topk",
  "dict_size": 65536,
  "aux_penalty": 0.03125,
  "input_unit_norm": true,
  "batch_norm_on_queries": false,
  "affine_batch_norm": false,
  "linear_heads": 0,
  "topk2": 32,
  "topk1": 50,
  "topk2_warmup_steps_fraction": 0.0,
  "start_topk2": 50,
  "topk1_warmup_steps_fraction": 0.0,
  "start_topk1": 50,
  "topk2_aux": 512,
  "cartesian_op": "mul",
  "router_depth": 2,
  "router_tree_width": null,
  "num_mkeys": null,
  "num_nkeys": null,
  "num_heads": -1,
  "n_batches_to_dead": 10,
  "lr": 0.0008,
  "bandwidth": 0.001,
  "l1_coeff": 0.0018,
  "num_tokens": 500000000,
  "seq_len": 1024,
  "model_batch_size": 64,
  "num_batches_in_buffer": 5,
  "max_grad_norm": 1.0,
  "batch_size": 8192,
  "weight_decay": 0.0,
  "warmup_fraction": 0.1,
  "scheduler_type": "cosine_with_min_lr",
  "device": "cuda",
  "dtype": "torch.float32",
  "sae_dtype": "torch.float32",
  "dataset_path": "HuggingFaceFW/fineweb-edu",
  "wandb_project": "turbo-llama-lens",
  "enable_wandb": true,
  "sae_name": "sae",
  "seed": 42,
  "performance_log_steps": 100,
  "save_checkpoint_steps": 15000000,
  "wandb_run_suffix": "ex72_for_sae_bench_gemma",
  "sweep_pair": "{'dict_size': 65536, 'num_tokens': 500000000, 'sae_type': 'topk', 'start_topk1': 50, 'start_topk2': 50, 'topk1': 50}"
}