{
  "final_model": "small + GQA + RoPE",
  "main_repo": "lllezd/dl-course-hw13",
  "variant_repo": "lllezd/dl-course-hw13-small-gqa-rope",
  "eval_loss": 2.4694972038269043,
  "eval_perplexity": 11.816504064743178,
  "experiments": {
    "1_small_gqa_baseline": {
      "description": "small + GQA baseline",
      "val_loss": 2.565,
      "perplexity": 13.0
    },
    "2_small_gqa_rope": {
      "description": "small + GQA + RoPE",
      "val_loss": 2.4694972038269043,
      "perplexity": 11.816504064743178
    },
    "3_mini_mla_rope": {
      "description": "mini + MLA + RoPE",
      "val_loss": 2.7306,
      "perplexity": 15.34
    },
    "4_small_mla_rope": {
      "description": "small + MLA + RoPE",
      "val_loss": 2.4905,
      "perplexity": 12.07
    }
  },
  "config": {
    "n_layer": 12,
    "n_head": 12,
    "n_kv_head": 6,
    "hidden_dim": 768,
    "intermediate_dim": 2048,
    "dropout": 0.1,
    "vocab_size": 1024,
    "max_seq_len": 128,
    "use_rope": true,
    "rope_base": 10000.0,
    "attention_type": "gqa",
    "q_latent_dim": null,
    "kv_latent_dim": null
  },
  "created_at": "2026-05-22T10:01:36.637143Z"
}