{
  "architecture": "Raschka GPTModel (separate W_query/W_key/W_value, no weight tying)",
  "model_type": "instruction-tuned (SFT)",
  "base_model": "nishantup/nanogpt-slm-124m (gpt_slm_best.pth)",
  "model_config": {
    "vocab_size": 50257,
    "context_length": 256,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.0,
    "qkv_bias": false
  },
  "total_parameters_millions": 163.2,
  "tokenizer": "tiktoken gpt2 (50,257 BPE tokens)",
  "framework": "PyTorch",
  "prompt_format": "Alpaca (### Instruction / ### Input / ### Response)",
  "training": {
    "dataset": "Alpaca-format instruction dataset (1,100 examples)",
    "epochs": 2,
    "optimizer": "AdamW (lr=5e-5, weight_decay=0.1)",
    "max_length": 256
  }
}