{ "activation_dims": { "mlp_0": 768, "attn_0": 768, "mlp_1": 768, "attn_1": 768, "mlp_2": 768, "attn_2": 768, "mlp_3": 768, "attn_3": 768, "mlp_4": 768, "attn_4": 768, "mlp_5": 768, "attn_5": 768, "mlp_6": 768, "attn_6": 768, "mlp_7": 768, "attn_7": 768, "mlp_8": 768, "attn_8": 768, "mlp_9": 768, "attn_9": 768, "mlp_10": 768, "attn_10": 768, "mlp_11": 768, "attn_11": 768 }, "dict_sizes": { "mlp_0": 12288, "attn_0": 12288, "mlp_1": 12288, "attn_1": 12288, "mlp_2": 12288, "attn_2": 12288, "mlp_3": 12288, "attn_3": 12288, "mlp_4": 12288, "attn_4": 12288, "mlp_5": 12288, "attn_5": 12288, "mlp_6": 12288, "attn_6": 12288, "mlp_7": 12288, "attn_7": 12288, "mlp_8": 12288, "attn_8": 12288, "mlp_9": 12288, "attn_9": 12288, "mlp_10": 12288, "attn_10": 12288, "mlp_11": 12288, "attn_11": 12288 }, "ks": { "mlp_0": 128, "attn_0": 128, "mlp_1": 128, "attn_1": 128, "mlp_2": 128, "attn_2": 128, "mlp_3": 128, "attn_3": 128, "mlp_4": 128, "attn_4": 128, "mlp_5": 128, "attn_5": 128, "mlp_6": 128, "attn_6": 128, "mlp_7": 128, "attn_7": 128, "mlp_8": 128, "attn_8": 128, "mlp_9": 128, "attn_9": 128, "mlp_10": 128, "attn_10": 128, "mlp_11": 128, "attn_11": 128 }, "layers": [], "lm_name": "", "submodule_names": [ "mlp_0", "attn_0", "mlp_1", "attn_1", "mlp_2", "attn_2", "mlp_3", "attn_3", "mlp_4", "attn_4", "mlp_5", "attn_5", "mlp_6", "attn_6", "mlp_7", "attn_7", "mlp_8", "attn_8", "mlp_9", "attn_9", "mlp_10", "attn_10", "mlp_11", "attn_11" ], "connection_sparsity_coeff": 0.01, "use_sparse_connections": false, "dtype": "torch.float32", "buffer_config": { "ctx_len": 128, "refresh_batch_size": 256, "out_batch_size": 4096 } }