{ "shorthand": "rd.16 - 6.mla.64.32 - mlp.1024 - model.256.lyr.6 - ah.8.32", "notes": "Current top-performing MLA configuration", "model": { "hidden_size": 256, "num_hidden_layers": 6, "intermediate_size": 1024, "hidden_dropout_prob": 0.1, "attention_dropout_prob": 0.1, "classifier_dropout": null, "initializer_range": 0.02, "layer_norm_eps": 1e-12, "rms_norm_eps": 1e-06, "vocab_size": 30522, "rope_theta": 10000.0, "rope_scaling": null, "max_position_embeddings": 128, "num_dense_layers": 0, "q_latent_dim": 64, "kv_latent_dim": 32, "num_attention_heads": 8, "head_dim": 32, "rope_dims": 16, "attention_bias": false, "output_subspace": false, "o_latent_dim": 64, "attention_backend": "sdpa", "ffn_decompose": false, "ffn_rank": null, "vocab_subspace": false, "vocab_rank": 128 }, "pre_train": { "output_dir": "checkpoints/mla_baseline", "seed": 42, "train_batch_size": 256, "learning_rate": 0.0005, "num_train_steps": 50000, "eval_steps": 2000, "weight_decay": 0.01, "mlm_probability": 0.15, "dataset_name": "wikitext", "dataset_config": "wikitext-103-raw-v1", "max_seq_length": 128, "eval_batch_size": 64, "fp16": false, "bf16": true, "torch_compile": true, "torch_compile_backend": "inductor", "torch_compile_mode": "default", "best_checkpoint": "checkpoints/mla_baseline/checkpoint-50000", "run_name": "mla_baseline", "run_id": "koko", "push_to_hub": true, "hub_model_id": "kokolamba/encoder-mla-baseline-perf", "hub_strategy": "end" }, "fine_tune": { "task": "sst2", "batch_size": 16, "lr": 2e-05, "epochs": 3, "seed": 42, "max_length": 128, "weight_decay": 0, "warmup_ratio": 0, "run_name": "configperf - mla_baseline - pt_id.koko - sst2", "model_path": "checkpoints/mla_baseline/checkpoint-50000", "tuned_from_id": "koko", "run_id": "a52vmc64", "run_url": "https://wandb.ai/abdulhakeemadefioye-personal/subspace-encoder-sst2/runs/a52vmc64" } }