pythia_1b-topk / cluster /config.json
ghidav
Reformat
81bc28b
{"sae": {"expansion_factor": 16, "normalize_decoder": true, "num_latents": 0, "k": 128, "multi_topk": false, "jumprelu": false, "jumprelu_init_threshold": 0.001, "jumprelu_bandwidth": 0.001, "jumprelu_target_l0": null, "jumprelu_per_layer_l0": false, "init_enc_as_dec_transpose": true, "init_b_dec_as_zeros": false}, "batch_size": 4, "max_seq_len": 1024, "num_training_tokens": 1000000000, "grad_acc_steps": 1, "micro_acc_steps": 1, "adam_8bit": false, "adam_epsilon": 1e-08, "adam_betas": [0.9, 0.999], "lr": null, "lr_scheduler_name": "constant", "lr_warmup_steps": 0.0, "l1_coefficient": 0.0, "l1_warmup_steps": 0.0, "use_l2_loss": false, "auxk_alpha": 0.03125, "dead_feature_threshold": 10000000, "hookpoints": ["layers.0", "layers.1", "layers.2", "layers.3", "layers.4", "layers.5", "layers.6", "layers.7", "layers.8", "layers.9", "layers.10", "layers.11", "layers.12", "layers.13", "layers.14", "layers.15"], "layers": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], "layer_stride": 1, "distribute_modules": true, "save_every": 100000, "normalize_activations": 1.0, "num_norm_estimation_tokens": 5000000, "clusters": {"k1-c0": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], "k2-c0": [0, 1, 2, 3, 4, 5, 6, 7, 8], "k2-c1": [9, 10, 11, 12, 13, 14], "k3-c0": [0, 1, 2, 3, 4], "k3-c1": [5, 6, 7, 8], "k4-c0": [9, 10, 11, 12], "k4-c1": [13, 14], "k5-c0": [2, 3, 4], "k5-c1": [0, 1], "k6-c0": [9, 10], "k6-c1": [11, 12], "k7-c0": [7, 8], "k7-c1": [5, 6], "k8-c0": [3, 4], "layers.0": [0], "layers.1": [1], "layers.2": [2], "layers.3": [3], "layers.4": [4], "layers.5": [5], "layers.6": [6], "layers.7": [7], "layers.8": [8], "layers.9": [9], "layers.10": [10], "layers.11": [11], "layers.12": [12], "layers.13": [13], "layers.14": [14], "layers.15": [15]}, "cluster_hookpoints": {"k1-c0": ["layers.0", "layers.1", "layers.2", "layers.3", "layers.4", "layers.5", "layers.6", "layers.7", "layers.8", "layers.9", "layers.10", "layers.11", "layers.12", "layers.13", "layers.14"], "k2-c0": ["layers.0", "layers.1", "layers.2", "layers.3", "layers.4", "layers.5", "layers.6", "layers.7", "layers.8"], "k2-c1": ["layers.9", "layers.10", "layers.11", "layers.12", "layers.13", "layers.14"], "k3-c0": ["layers.0", "layers.1", "layers.2", "layers.3", "layers.4"], "k3-c1": ["layers.5", "layers.6", "layers.7", "layers.8"], "k4-c0": ["layers.9", "layers.10", "layers.11", "layers.12"], "k4-c1": ["layers.13", "layers.14"], "k5-c0": ["layers.2", "layers.3", "layers.4"], "k5-c1": ["layers.0", "layers.1"], "k6-c0": ["layers.9", "layers.10"], "k6-c1": ["layers.11", "layers.12"], "k7-c0": ["layers.7", "layers.8"], "k7-c1": ["layers.5", "layers.6"], "k8-c0": ["layers.3", "layers.4"], "layers.0": ["layers.0"], "layers.1": ["layers.1"], "layers.2": ["layers.2"], "layers.3": ["layers.3"], "layers.4": ["layers.4"], "layers.5": ["layers.5"], "layers.6": ["layers.6"], "layers.7": ["layers.7"], "layers.8": ["layers.8"], "layers.9": ["layers.9"], "layers.10": ["layers.10"], "layers.11": ["layers.11"], "layers.12": ["layers.12"], "layers.13": ["layers.13"], "layers.14": ["layers.14"], "layers.15": ["layers.15"]}, "hook": null, "keep_last_n_checkpoints": 4, "resume_from": null, "log_to_wandb": true, "run_name": "checkpoints-clusters/pythia-1b-topk", "wandb_log_frequency": 1}