{ "architecture": "GPT (custom, distilled from SmolLM-135M)", "parameters": "124M", "teacher_model": "HuggingFaceTB/SmolLM-135M-Instruct", "dataset": "HuggingFaceFW/fineweb-edu/sample-10BT", "distill_alpha": 0.5, "distill_temp": 2.0, "max_steps": 5000, "tokens_processed": 327680000, "best_loss": 326.0110778808594 }