| { | |
| "shorthand": "rd.16 - 6.mla.64.32 - mlp.1024 - model.256.lyr.6 - ah.8.32", | |
| "notes": "Current top-performing MLA configuration", | |
| "model": { | |
| "hidden_size": 256, | |
| "num_hidden_layers": 6, | |
| "intermediate_size": 1024, | |
| "hidden_dropout_prob": 0.1, | |
| "attention_dropout_prob": 0.1, | |
| "classifier_dropout": null, | |
| "initializer_range": 0.02, | |
| "layer_norm_eps": 1e-12, | |
| "rms_norm_eps": 1e-06, | |
| "vocab_size": 30522, | |
| "rope_theta": 10000.0, | |
| "rope_scaling": null, | |
| "max_position_embeddings": 128, | |
| "num_dense_layers": 0, | |
| "q_latent_dim": 64, | |
| "kv_latent_dim": 32, | |
| "num_attention_heads": 8, | |
| "head_dim": 32, | |
| "rope_dims": 16, | |
| "attention_bias": false, | |
| "output_subspace": false, | |
| "o_latent_dim": 64, | |
| "attention_backend": "sdpa", | |
| "ffn_decompose": false, | |
| "ffn_rank": null, | |
| "vocab_subspace": false, | |
| "vocab_rank": 128 | |
| }, | |
| "pre_train": { | |
| "output_dir": "checkpoints/mla_baseline", | |
| "seed": 42, | |
| "train_batch_size": 256, | |
| "learning_rate": 0.0005, | |
| "num_train_steps": 50000, | |
| "eval_steps": 2000, | |
| "weight_decay": 0.01, | |
| "mlm_probability": 0.15, | |
| "dataset_name": "wikitext", | |
| "dataset_config": "wikitext-103-raw-v1", | |
| "max_seq_length": 128, | |
| "eval_batch_size": 64, | |
| "fp16": false, | |
| "bf16": true, | |
| "torch_compile": true, | |
| "torch_compile_backend": "inductor", | |
| "torch_compile_mode": "default", | |
| "best_checkpoint": "checkpoints/mla_baseline/checkpoint-50000", | |
| "run_name": "mla_baseline", | |
| "run_id": "koko", | |
| "push_to_hub": true, | |
| "hub_model_id": "kokolamba/encoder-mla-baseline-perf", | |
| "hub_strategy": "end" | |
| }, | |
| "fine_tune": { | |
| "task": "sst2", | |
| "batch_size": 16, | |
| "lr": 2e-05, | |
| "epochs": 3, | |
| "seed": 42, | |
| "max_length": 128, | |
| "weight_decay": 0, | |
| "warmup_ratio": 0, | |
| "run_name": "configperf - mla_baseline - pt_id.koko - sst2", | |
| "model_path": "checkpoints/mla_baseline/checkpoint-50000", | |
| "tuned_from_id": "koko", | |
| "run_id": "a52vmc64", | |
| "run_url": "https://wandb.ai/abdulhakeemadefioye-personal/subspace-encoder-sst2/runs/a52vmc64" | |
| } | |
| } |