{ "attention_dropout": 0.0, "dropout_ramp_steps": 0, "dropout_schedule": "linear", "dropout_warmup_steps": 0, "expert_capacity_factor": 1.5, "gradient_clip_norm": 0.0, "hidden_dropout": 0.0, "hidden_size": 672, "initializer_range": 0.02, "intermediate_size": 2016, "label_smoothing": 0.0, "max_position_embeddings": 8192, "moe_implementation": "megablocks", "moe_world_size": 4, "num_attention_heads": 12, "num_experts": 8, "num_experts_per_tok": 2, "num_hidden_layers": 20, "num_key_value_heads": 12, "resid_dropout": 0.0, "rms_norm_eps": 1e-6, "router_aux_loss_coef": 0.01, "use_cache": true, "vocab_size": 114 }