| { |
| "comment": "Note: this is not the full params file used for training (see our GitHub repo), but it is sufficient for model loading", |
| "data": { |
| "add_bos": true, |
| "add_eos": true, |
| "batch_size": 3, |
| "load_async": true, |
| "n_views": 2, |
| "prefetch_size": 1024, |
| "root_dir": "<zeyuan_placeholder>", |
| "seed": 42, |
| "seq_len": 4096, |
| "sources": { |
| "original_shuffled4": 1.0 |
| }, |
| "tokenizer": { |
| "name": "tiktoken", |
| "path": "<zeyuan_placeholder>" |
| } |
| }, |
| "model": { |
| "attn_impl": "sdpa", |
| "canon_activation": false, |
| "canon_bias": false, |
| "canon_kernel": 4, |
| "canon_residual": true, |
| "canon_set": "ABCD", |
| "dim": 4096, |
| "ffn_dim_multiplier": 1.0, |
| "head_dim": null, |
| "hidden_dim": 14336, |
| "init_base_std": null, |
| "init_std_factor": "disabled", |
| "max_seqlen": 4096, |
| "multiple_of": 256, |
| "n_heads": 32, |
| "n_kv_heads": 8, |
| "n_layers": 32, |
| "norm_eps": 1e-05, |
| "qk_norm": false, |
| "rope_dim": 32, |
| "rope_theta": 100000.0, |
| "seed": 42, |
| "sliding_window": null, |
| "vocab_size": 128256, |
| "weight_tying": false, |
| "z_loss": false |
| } |
| } |
|
|