iamPi committed on
Commit
8ee70c8
·
verified ·
1 Parent(s): 089f940

Update config.json

Browse files
Files changed (1) hide show
  1. config.json +15 -13
config.json CHANGED
@@ -4,22 +4,19 @@
4
  ],
5
  "attention_bias": false,
6
  "attention_dropout": 0.0,
7
- "auto_map": {
8
- "AutoConfig": "configuration_deepseek.DeepseekV3Config",
9
- "AutoModel": "modeling_deepseek.DeepseekV3Model",
10
- "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
11
- },
12
  "aux_loss_alpha": 0.001,
13
  "bos_token_id": 163584,
14
- "eos_token_id": 163586,
 
15
  "ep_size": 1,
16
  "first_k_dense_replace": 1,
 
17
  "hidden_act": "silu",
18
  "hidden_size": 2048,
19
  "initializer_range": 0.02,
20
  "intermediate_size": 11264,
21
  "kv_lora_rank": 512,
22
- "max_position_embeddings": 8192,
23
  "model_type": "deepseek_v3",
24
  "moe_intermediate_size": 1408,
25
  "moe_layer_freq": 1,
@@ -31,22 +28,27 @@
31
  "num_experts_per_tok": 6,
32
  "num_hidden_layers": 27,
33
  "num_key_value_heads": 16,
34
- "num_nextn_predict_layers": 0,
 
35
  "pretraining_tp": 1,
36
  "q_lora_rank": null,
 
37
  "qk_nope_head_dim": 128,
38
  "qk_rope_head_dim": 64,
39
  "rms_norm_eps": 1e-05,
40
- "rope_theta": 50000.0,
 
 
 
 
41
  "routed_scaling_factor": 2.446,
42
  "scoring_func": "sigmoid",
43
  "seq_aux": true,
44
  "tie_word_embeddings": false,
45
  "topk_group": 1,
46
  "topk_method": "noaux_tc",
47
- "torch_dtype": "bfloat16",
48
- "transformers_version": "4.46.3",
49
- "use_cache": true,
50
  "v_head_dim": 128,
51
  "vocab_size": 163840
52
- }
 
4
  ],
5
  "attention_bias": false,
6
  "attention_dropout": 0.0,
 
 
 
 
 
7
  "aux_loss_alpha": 0.001,
8
  "bos_token_id": 163584,
9
+ "dtype": "bfloat16",
10
+ "eos_token_id": 163585,
11
  "ep_size": 1,
12
  "first_k_dense_replace": 1,
13
+ "head_dim": 64,
14
  "hidden_act": "silu",
15
  "hidden_size": 2048,
16
  "initializer_range": 0.02,
17
  "intermediate_size": 11264,
18
  "kv_lora_rank": 512,
19
+ "max_position_embeddings": 131072,
20
  "model_type": "deepseek_v3",
21
  "moe_intermediate_size": 1408,
22
  "moe_layer_freq": 1,
 
28
  "num_experts_per_tok": 6,
29
  "num_hidden_layers": 27,
30
  "num_key_value_heads": 16,
31
+ "num_nextn_predict_layers": 1,
32
+ "pad_token_id": 163839,
33
  "pretraining_tp": 1,
34
  "q_lora_rank": null,
35
+ "qk_head_dim": 192,
36
  "qk_nope_head_dim": 128,
37
  "qk_rope_head_dim": 64,
38
  "rms_norm_eps": 1e-05,
39
+ "rope_interleave": true,
40
+ "rope_parameters": {
41
+ "rope_theta": 800000.0,
42
+ "rope_type": "default"
43
+ },
44
  "routed_scaling_factor": 2.446,
45
  "scoring_func": "sigmoid",
46
  "seq_aux": true,
47
  "tie_word_embeddings": false,
48
  "topk_group": 1,
49
  "topk_method": "noaux_tc",
50
+ "transformers_version": "5.7.0",
51
+ "use_cache": false,
 
52
  "v_head_dim": 128,
53
  "vocab_size": 163840
54
+ }