{
  "architectures": [
    "AdaptiveRiverLM"
  ],
  "model_type": "adaptive-river",
  "auto_map": {
    "AutoConfig": "configuration_adaptive_river.AdaptiveRiverConfig",
    "AutoModelForCausalLM": "modeling_adaptive_river.AdaptiveRiverLM"
  },
  "vocab_size": 50257,
  "hidden_size": 1024,
  "num_hidden_layers": 24,
  "intermediate_size": 4096,
  "num_attention_heads": 16,
  "max_position_embeddings": 2048,
  "rms_norm_eps": 1e-05,
  "initializer_range": 0.02,
  "use_cache": false,
  "pad_token_id": 50256,
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "tie_word_embeddings": false,
  "rope_theta": 10000.0,
  "rope_scaling": null,
  "attention_dropout": 0.0,
  "moe_config": {
    "num_attention_experts": 6,
    "num_ffn_experts": 4,
    "attention_top_k": 6,
    "ffn_top_k": 1,
    "gate_temperature": 0.7,
    "moe_dropout": 0.0,
    "load_balance_weight": 0.01,
    "router_z_weight": 0.001,
    "entropy_weight": 0.0001,
    "head_entropy_weight": 0.0001
  },
  "mamba_config": {
    "d_state": 16,
    "d_conv": 4,
    "expand_factor": 2,
    "enhanced_expand_factor": 4,
    "early_mamba_layers": [0, 1],
    "enhanced_mamba_layers": [22, 23]
  },
  "layer_types": {
    "0": "mamba",
    "1": "mamba",
    "2-21": "moe",
    "22": "mamba_enhanced",
    "23": "mamba_enhanced"
  },
  "budget_ratio_default": 1.0,
  "checkpoint_attention_threshold": 0.35,
  "checkpoint_ffn_threshold": 0.35,
  "rotary_pct": 1.0,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.36.0"
}