{
  "model_type": "fineweb-gpt",
  "architectures": [
    "GPTForCausalLM"
  ],
  "bos_token_id": 0,
  "eos_token_id": 0,
  "pad_token_id": 1,
  "vocab_size": 8192,
  "context_len": 512,
  "n_layers": 6,
  "d_model": 256,
  "n_heads": 8,
  "d_ff": 1024,
  "dropout": 0.1,
  "tie_embeddings": true,
  "trained_steps": 1800,
  "val_loss": 5.2764,
  "perplexity": 195.7,
  "training_tokens": "~5M",
  "dataset": "HuggingFaceFW/fineweb-edu (sample-10BT, 10k docs)"
}