{
  "_name_or_path": "mistralai/Mistral-7B-v0.1",
  "architectures": [
    "SparseMistralforCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "sparsification_sftt.SparseMistralConfig",
    "AutoModelForCausalLM": "sparsification_sftt.SparseMistralforCausalLM"
  },
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "sparse_mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "thresholds": [
    0.023069201037287712,
    0.03309928998351097,
    0.04312938079237938,
    0.05516548827290535,
    0.07522567361593246,
    0.09327983111143112,
    0.10531593859195709,
    0.11935807019472122,
    0.12738214433193207,
    0.12738214433193207,
    0.1313941776752472,
    0.13340020179748535,
    0.13941824436187744,
    0.1414242684841156,
    0.15546639263629913,
    0.1675025075674057,
    0.18555666506290436,
    0.19157472252845764,
    0.20762285590171814,
    0.2196589708328247,
    0.22768303751945496,
    0.23771312832832336,
    0.2357071191072464,
    0.23771312832832336,
    0.24172517657279968,
    0.24172517657279968,
    0.24172517657279968,
    0.24172517657279968,
    0.24172517657279968,
    0.23971915245056152,
    0.2357071191072464,
    0.225677028298378
  ],
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.36.2",
  "use_cache": false,
  "use_relu": false,
  "use_resilu": false,
  "use_sparse_model": true,
  "use_sparse_predictor": false,
  "use_sparse_regularization": false,
  "vocab_size": 32000
}
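
Because `auto_map` points `AutoConfig` and `AutoModelForCausalLM` at custom classes in `sparsification_sftt.py`, this checkpoint cannot be loaded with the stock `transformers` Mistral classes alone; `trust_remote_code=True` is required so the library imports the repo's modeling code. A minimal loading sketch, assuming the placeholder path is replaced with the actual repo id or local directory and that the tokenizer is inherited unchanged from the base model:

```python
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

checkpoint = "path/to/this/checkpoint"  # hypothetical placeholder; use the real repo id or local dir

# trust_remote_code=True lets transformers import SparseMistralConfig and
# SparseMistralforCausalLM from the repo's sparsification_sftt.py, as
# declared in the "auto_map" field above.
config = AutoConfig.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    config=config,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16"
    trust_remote_code=True,
)

# Assumption: the tokenizer is the unmodified base-model tokenizer
# ("_name_or_path" and "vocab_size": 32000 are consistent with that).
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
```

Note that `use_cache` is `false` in this config, as is typical of training-time settings; for generation you would normally re-enable the KV cache with `model.config.use_cache = True`.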
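
The `thresholds` array holds one value per decoder layer (32 entries, matching `num_hidden_layers: 32`), rising from roughly 0.023 at the first layer to roughly 0.24 in the upper layers. A plausible reading, consistent with `use_sparse_model: true` and `hidden_act: "silu"`, is that each threshold zeroes gated-MLP activations whose magnitude falls below it. The sketch below illustrates that general mechanism under this assumption; it is not the repo's actual `sparsification_sftt.py` implementation.

```python
import torch
import torch.nn.functional as F

def thresholded_silu_mlp(x, gate_proj, up_proj, down_proj, threshold):
    """Illustrative SiLU-gated MLP with magnitude thresholding (assumed semantics).

    Gate activations with |silu(gate_proj(x))| below the per-layer threshold
    are zeroed, so the matching rows of down_proj could be skipped by a
    sparsity-aware kernel at inference time.
    """
    gate = F.silu(gate_proj(x))
    mask = gate.abs() >= threshold  # keep only sufficiently large activations
    gate = gate * mask              # zero the rest
    return down_proj(gate * up_proj(x))
```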