{
  "_name_or_path": "Phi-3-small-128k-instruct",
  "architectures": [
    "Phi3SmallForCausalLM"
  ],
  "attention_dropout_prob": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3_small.Phi3SmallConfig",
    "AutoModelForCausalLM": "modeling_phi3_small.Phi3SmallForCausalLM",
    "AutoTokenizer": "tokenization_phi3_small.Phi3SmallTokenizer"
  },
  "blocksparse_block_size": 64,
  "blocksparse_homo_head_pattern": false,
  "blocksparse_num_local_blocks": 16,
  "blocksparse_triton_kernel_block_size": 64,
  "blocksparse_vert_stride": 8,
  "bos_token_id": 100257,
  "dense_attention_every_n_layers": 2,
  "embedding_dropout_prob": 0.1,
  "eos_token_id": 100257,
  "ff_dim_multiplier": null,
  "ff_intermediate_size": 14336,
  "ffn_dropout_prob": 0.1,
  "gegelu_limit": 20.0,
  "gegelu_pad_to_256": true,
  "hidden_act": "gegelu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 131072,
  "model_type": "phi3small",
  "mup_attn_multiplier": 1.0,
  "mup_embedding_multiplier": 10.0,
  "mup_use_scaling": true,
  "mup_width_multiplier": 8.0,
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "original_max_position_embeddings": 8192,
  "pad_sequence_to_multiple_of_64": true,
  "reorder_and_upcast_attn": false,
  "rope_embedding_base": 1000000,
  "rope_position_scale": 1.0,
  "rope_scaling": {
    "long_factor": [
      1.0,
      1.01,
      1.01,
      1.02,
      1.04,
      1.04,
      1.04,
      1.05,
      1.05,
      1.06,
      1.07,
      1.08,
      1.08,
      1.08,
      1.08,
      1.08,
      1.08,
      1.08,
      1.09,
      1.09,
      1.2,
      2.31,
      3.76,
      9.38,
      10.1,
      10.8,
      18.1,
      25.2,
      25.3,
      26.1,
      26.6,
      30.2,
      33.0,
      41.5,
      44.4,
      44.8,
      50.2,
      51.9,
      59.3,
      62.7,
      66.1,
      66.3,
      85.8,
      89.3,
      90.0,
      99.9,
      107.0,
      110.0,
      111.0,
      117.0,
      118.0,
      121.0,
      122.0,
      127.0,
      127.0,
      128.0,
      128.0,
      128.0,
      128.0,
      128.0,
      128.0,
      129.0,
      129.0,
      129.0
    ],
    "long_mscale": 1.1902380714238083,
    "original_max_position_embeddings": 8192,
    "short_factor": [
      1.02,
      1.02,
      1.05,
      1.05,
      1.06,
      1.08,
      1.08,
      1.08,
      1.08,
      1.12,
      1.1800000000000002,
      1.1900000000000002,
      1.1900000000000002,
      1.2100000000000002,
      1.2300000000000002,
      1.2400000000000002,
      1.2400000000000002,
      1.2500000000000002,
      1.3000000000000003,
      1.3100000000000003,
      1.4600000000000004,
      1.5100000000000005,
      1.7000000000000006,
      1.9300000000000008,
      2.080000000000001,
      2.4399999999999933,
      3.2199999999999767,
      3.4499999999999718,
      3.579999999999969,
      4.669999999999946,
      4.779999999999943,
      5.999999999999917,
      6.009999999999917,
      6.4199999999999084,
      6.619999999999904,
      7.189999999999892,
      7.3099999999998895,
      7.339999999999889,
      7.479999999999886,
      9.749999999999837,
      10.919999999999812,
      11.219999999999805,
      11.749999999999794,
      11.979999999999789,
      13.239999999999762,
      13.579999999999755,
      13.669999999999753,
      13.82999999999975,
      14.009999999999746,
      14.679999999999731,
      14.889999999999727,
      15.769999999999708,
      15.769999999999708,
      15.819999999999707,
      15.839999999999707,
      15.919999999999705,
      16.029999999999703,
      16.12999999999972,
      16.44999999999977,
      16.44999999999977,
      16.77999999999982,
      16.83999999999983,
      16.83999999999983,
      16.889999999999837
    ],
    "short_mscale": 1.0,
    "type": "su"
  },
  "torch_dtype": "bfloat16",
  "transformers_version": "4.38.1",
  "use_cache": true,
  "attention_bias": false,
  "vocab_size": 100352
}