{
  "architecture": "GPT (custom, distilled from SmolLM-135M)",
  "parameters": "124M",
  "teacher_model": "HuggingFaceTB/SmolLM-135M-Instruct",
  "dataset": "HuggingFaceFW/fineweb-edu/sample-10BT",
  "distill_alpha": 0.5,
  "distill_temp": 2.0,
  "max_steps": 5000,
  "tokens_processed": 327680000,
  "best_loss": 326.0110778808594
}