{
  "trainer": {
    "dict_class": "VSAEMixtureGaussian",
    "trainer_class": "VSAEMixtureTrainer",
    "activation_dim": 2048,
    "dict_size": 8192,
    "lr": 0.0005,
    "kl_coeff": 100,
    "warmup_steps": 500,
    "sparsity_warmup_steps": 500,
    "steps": 10000,
    "decay_start": 8000,
    "resample_steps": null,
    "var_flag": 0,
    "n_correlated_pairs": 0,
    "n_anticorrelated_pairs": 0,
    "use_april_update_mode": true,
    "seed": null,
    "device": "cuda",
    "layer": 0,
    "lm_name": "gelu-1l",
    "wandb_name": "VSAEMix_gelu-1l_d8192_lr0.0005_kl100_corr0_anticorr0_trainer_0",
    "submodule_name": null
  },
  "buffer": {
    "d_submodule": 2048,
    "n_ctxs": 3000,
    "ctx_len": 128,
    "refresh_batch_size": 32,
    "out_batch_size": 1024,
    "device": "cuda"
  }
}