{ "comment": "Note: this is not the full params file used for training (see our GitHub repo), but sufficient for model loading", "data": { "add_bos": true, "add_eos": true, "batch_size": 3, "load_async": true, "n_views": 2, "prefetch_size": 1024, "root_dir": "", "seed": 42, "seq_len": 4096, "sources": { "original_shuffled4": 1.0 }, "tokenizer": { "name": "tiktoken", "path": "" } }, "model": { "attn_impl": "sdpa", "canon_activation": false, "canon_bias": false, "canon_kernel": 4, "canon_residual": true, "canon_set": "ABCD", "dim": 4096, "ffn_dim_multiplier": 1.0, "head_dim": null, "hidden_dim": 14336, "init_base_std": null, "init_std_factor": "disabled", "max_seqlen": 4096, "multiple_of": 256, "n_heads": 32, "n_kv_heads": 8, "n_layers": 32, "norm_eps": 1e-05, "qk_norm": false, "rope_dim": 32, "rope_theta": 100000.0, "seed": 42, "sliding_window": null, "vocab_size": 128256, "weight_tying": false, "z_loss": false } }