{ "final_model": "small + GQA + RoPE", "main_repo": "lllezd/dl-course-hw13", "variant_repo": "lllezd/dl-course-hw13-small-gqa-rope", "eval_loss": 2.4694972038269043, "eval_perplexity": 11.816504064743178, "experiments": { "1_small_gqa_baseline": { "description": "small + GQA baseline", "val_loss": 2.565, "perplexity": 13.0 }, "2_small_gqa_rope": { "description": "small + GQA + RoPE", "val_loss": 2.4694972038269043, "perplexity": 11.816504064743178 }, "3_mini_mla_rope": { "description": "mini + MLA + RoPE", "val_loss": 2.7306, "perplexity": 15.34 }, "4_small_mla_rope": { "description": "small + MLA + RoPE", "val_loss": 2.4905, "perplexity": 12.07 } }, "config": { "n_layer": 12, "n_head": 12, "n_kv_head": 6, "hidden_dim": 768, "intermediate_dim": 2048, "dropout": 0.1, "vocab_size": 1024, "max_seq_len": 128, "use_rope": true, "rope_base": 10000.0, "attention_type": "gqa", "q_latent_dim": null, "kv_latent_dim": null }, "created_at": "2026-05-22T10:01:36.637143Z" }