{
"shorthand": "rd.16 - 6.mla.64.32 - mlp.1024 - model.256.lyr.6 - ah.8.32",
"notes": "Current top-performing MLA configuration",
"model": {
"hidden_size": 256,
"num_hidden_layers": 6,
"intermediate_size": 1024,
"hidden_dropout_prob": 0.1,
"attention_dropout_prob": 0.1,
"classifier_dropout": null,
"initializer_range": 0.02,
"layer_norm_eps": 1e-12,
"rms_norm_eps": 1e-06,
"vocab_size": 30522,
"rope_theta": 10000.0,
"rope_scaling": null,
"max_position_embeddings": 128,
"num_dense_layers": 0,
"q_latent_dim": 64,
"kv_latent_dim": 32,
"num_attention_heads": 8,
"head_dim": 32,
"rope_dims": 16,
"attention_bias": false,
"output_subspace": false,
"o_latent_dim": 64,
"attention_backend": "sdpa",
"ffn_decompose": false,
"ffn_rank": null,
"vocab_subspace": false,
"vocab_rank": 128
},
"pre_train": {
"output_dir": "checkpoints/mla_baseline",
"seed": 42,
"train_batch_size": 256,
"learning_rate": 0.0005,
"num_train_steps": 50000,
"eval_steps": 2000,
"weight_decay": 0.01,
"mlm_probability": 0.15,
"dataset_name": "wikitext",
"dataset_config": "wikitext-103-raw-v1",
"max_seq_length": 128,
"eval_batch_size": 64,
"fp16": false,
"bf16": true,
"torch_compile": true,
"torch_compile_backend": "inductor",
"torch_compile_mode": "default",
"best_checkpoint": "checkpoints/mla_baseline/checkpoint-50000",
"run_name": "mla_baseline",
"run_id": "koko",
"push_to_hub": true,
"hub_model_id": "kokolamba/encoder-mla-baseline-perf",
"hub_strategy": "end"
},
"fine_tune": {
"task": "sst2",
"batch_size": 16,
"lr": 2e-05,
"epochs": 3,
"seed": 42,
"max_length": 128,
"weight_decay": 0,
"warmup_ratio": 0,
"run_name": "configperf - mla_baseline - pt_id.koko - sst2",
"model_path": "checkpoints/mla_baseline/checkpoint-50000",
"tuned_from_id": "koko",
"run_id": "a52vmc64",
"run_url": "https://wandb.ai/abdulhakeemadefioye-personal/subspace-encoder-sst2/runs/a52vmc64"
}
}