{
  "model_type": "pycraft_decoder_only",
  "vocab_size": 32000,
  "d_model": 512,
  "n_layers": 8,
  "n_heads": 8,
  "n_kv_heads": 2,
  "d_ff": 1408,
  "max_seq_len": 1024,
  "use_qk_norm": true,
  "rope_theta": 10000.0,
  "dropout": 0.0,
  "architecture_notes": {
    "attention": "Grouped Query Attention GQA 8Q/2KV",
    "positional": "RoPE rotary embeddings",
    "qk_norm": "RMSNorm on Q and K OLMo2 Qwen3 2025",
    "ffn": "SwiGLU gated feedforward",
    "norm": "RMSNorm pre-norm",
    "objective": "Causal LM plus FIM 50 percent PSM format"
  },
  "training_summary": {
    "pretrain_steps": 4000,
    "pretrain_loss": 1.16,
    "pretrain_ppl": 3.2,
    "tokens_seen": "1.05B",
    "sft_steps": 400,
    "sft_loss": 1.15,
    "sft_ppl": 3.15,
    "hardware": "NVIDIA RTX 3050 Laptop 4GB VRAM"
  }
}