{
  "architecture": "GPT (custom, distilled from SmolLM-135M)",
  "parameters": "124M",
  "teacher_model": "HuggingFaceTB/SmolLM-135M-Instruct",
  "dataset": "HuggingFaceFW/fineweb-edu/sample-10BT",
  "distill_alpha": 0.5,
  "distill_temp": 2.0,
  "max_steps": 5000,
  "tokens_processed": 327680000,
  "best_loss": 326.0110778808594
}