{ "architectures" : [ "Starcoder2ForCausalLM" ], "attention_dropout" : 0.1, "bos_token_id" : 0, "embedding_dropout" : 0.1, "eos_token_id" : 0, "hidden_act" : "gelu_pytorch_tanh", "hidden_size" : 3072, "initializer_range" : 0.018042, "intermediate_size" : 12288, "max_position_embeddings" : 16384, "mlp_type" : "default", "model_type" : "starcoder2", "norm_epsilon" : 1e-05, "norm_type" : "layer_norm", "num_attention_heads" : 24, "num_hidden_layers" : 30, "num_key_value_heads" : 2, "quantization" : { "bits" : 4, "group_size" : 64, "mode" : "affine" }, "quantization_config" : { "bits" : 4, "group_size" : 64, "mode" : "affine" }, "residual_dropout" : 0.1, "rope_theta" : 999999.4420358813, "sliding_window" : 4096, "transformers_version" : "4.37.0.dev0", "use_bias" : true, "use_cache" : true, "vocab_size" : 49152 }