{
  "name": "parcae-small-140m",
  "hf_config": {
    "org": "SandyResearch",
    "name": "parcae-small-140m"
  },
  "block_size": 2048,
  "n_embd": 768,
  "intermediate_size": 3072,
  "num_attention_heads": 6,
  "num_key_value_heads": 6,
  "vocab_size": 32768,
  "padding_multiple": 64,
  "padded_vocab_size": 32768,
  "rope_settings": {
    "use_rope": true,
    "rope_condense_ratio": 1,
    "rope_base": 50000
  },
  "use_abacus": false,
  "randomize_positions_from": null,
  "block_class_name": "TransformerPreNormBlock",
  "norm_class_name": "RMSNorm",
  "attn_impl": "flash",
  "norm_eps": 1e-05,
  "mlp_class_name": "BaseMLP",
  "nonlin_name": "ReLU2",
  "bias": false,
  "qk_bias": false,
  "init_strategy": "scaled-zero",
  "init_orthogonal": true,
  "skip_initialization": false,
  "mup_model_scaling_factor": 1,
  "use_fused_head": "pytorch",
  "debias_attention": false,
  "center_attention": false,
  "clip_qkv": null,
  "qk_norm": true,
  "logit_softcap": null,
  "activation_checkpoint_impl": "per-iteration",
  "simple_ops": false,
  "strategy": "single",
  "injection_type": "diagonal",
  "n_layers_in_recurrent_block": 2,
  "n_layers_in_prelude": 2,
  "n_layers_in_coda": 2,
  "state_init": "like-init",
  "recurrent_embedding_dimension": 768,
  "recurrent_intermediation_embedding_dimension": 3072,
  "recurrent_num_attention_heads": null,
  "prelude_norm": true,
  "sampling_scheme": "poisson-truncated-full",
  "mean_recurrence": 8,
  "mean_backprop_depth": 4,
  "lockstep_n": false,
  "lockstep_k": false,
  "curriculum_target": "forward",
  "recurrent_iteration_method": "per-sequence",
  "tie_embeddings": true,
  "model_class_name": "Parcae",
  "_is_recurrent_block_config": false,
  "_class_name": "ParcaeConfig"
}