{
  "name": "parcae-medium-370m",
  "hf_config": {
    "org": "SandyResearch",
    "name": "parcae-medium-370m"
  },
  "block_size": 2048,
  "n_embd": 1024,
  "intermediate_size": 4096,
  "num_attention_heads": 8,
  "num_key_value_heads": 8,
  "vocab_size": 32768,
  "padding_multiple": 64,
  "padded_vocab_size": 32768,
  "rope_settings": {
    "use_rope": true,
    "rope_condense_ratio": 1,
    "rope_base": 50000
  },
  "use_abacus": false,
  "randomize_positions_from": null,
  "block_class_name": "TransformerPreNormBlock",
  "norm_class_name": "RMSNorm",
  "attn_impl": "flash",
  "norm_eps": 1e-05,
  "mlp_class_name": "BaseMLP",
  "nonlin_name": "ReLU2",
  "bias": false,
  "qk_bias": false,
  "init_strategy": "scaled-zero",
  "init_orthogonal": true,
  "skip_initialization": false,
  "mup_model_scaling_factor": 1,
  "use_fused_head": "pytorch",
  "debias_attention": false,
  "center_attention": false,
  "clip_qkv": null,
  "qk_norm": true,
  "logit_softcap": null,
  "activation_checkpoint_impl": "per-iteration",
  "simple_ops": false,
  "strategy": "single",
  "injection_type": "diagonal",
  "n_layers_in_recurrent_block": 4,
  "n_layers_in_prelude": 4,
  "n_layers_in_coda": 4,
  "state_init": "like-init",
  "recurrent_embedding_dimension": 1024,
  "recurrent_intermediation_embedding_dimension": 4096,
  "recurrent_num_attention_heads": null,
  "prelude_norm": true,
  "sampling_scheme": "poisson-truncated-full",
  "mean_recurrence": 8,
  "mean_backprop_depth": 4,
  "lockstep_n": false,
  "lockstep_k": false,
  "curriculum_target": "forward",
  "recurrent_iteration_method": "per-sequence",
  "tie_embeddings": true,
  "model_class_name": "Parcae",
  "_is_recurrent_block_config": false,
  "_class_name": "ParcaeConfig"
}