{
  "model_type": "pycraft_decoder_only",
  "vocab_size": 32000,
  "d_model": 512,
  "n_layers": 8,
  "n_heads": 8,
  "n_kv_heads": 2,
  "d_ff": 1408,
  "max_seq_len": 1024,
  "use_qk_norm": true,
  "rope_theta": 10000.0,
  "dropout": 0.0,
  "architecture_notes": {
    "attention": "Grouped Query Attention GQA 8Q/2KV",
    "positional": "RoPE rotary embeddings",
    "qk_norm": "RMSNorm on Q and K OLMo2 Qwen3 2025",
    "ffn": "SwiGLU gated feedforward",
    "norm": "RMSNorm pre-norm",
    "objective": "Causal LM plus FIM 50 percent PSM format"
  },
  "training_summary": {
    "pretrain_steps": 4000,
    "pretrain_loss": 1.16,
    "pretrain_ppl": 3.2,
    "tokens_seen": "1.05B",
    "sft_steps": 400,
    "sft_loss": 1.15,
    "sft_ppl": 3.15,
    "hardware": "NVIDIA RTX 3050 Laptop 4GB VRAM"
  }
}