{ "comment": "Note: this is not the full params file used for training (see our GitHub repo), but sufficient for model loading", "data": { "add_bos": true, "add_eos": true, "batch_size": 3, "load_async": true, "n_views": 2, "prefetch_size": 1024, "root_dir": "", "seed": 42, "seq_len": 4096, "sources": { "original_shuffled4": 1.0 }, "tokenizer": { "name": "tiktoken", "path": "" } }, "model": { "attn_impl": "sdpa", "canon_activation": false, "canon_bias": false, "canon_kernel": 4, "canon_residual": true, "canon_set": "ABCD", "dim": 4096, "ffn_dim_multiplier": 1.0, "head_dim": null, "hidden_dim": 14336, "init_base_std": null, "init_std_factor": "disabled", "max_seqlen": 4096, "multiple_of": 256, "n_heads": 32, "n_kv_heads": 8, "n_layers": 32, "norm_eps": 1e-05, "qk_norm": false, "rope_dim": 32, "rope_theta": 100000.0, "seed": 42, "sliding_window": null, "vocab_size": 128256, "weight_tying": false, "z_loss": false } }