Tongjilibo committed
Commit fee315e · 1 Parent(s): 087bf94

Add Qwen3
.gitignore CHANGED
@@ -1,9 +1,7 @@
  # Ignore all files
  *
- config.json
- generation_config.json
- Tongjilibo
-
- # Allow .json files
  !bert4torch_config.json
-
+ !*/
+ Tongjilibo
+ __tensorflow_weights
+ maidalun1020
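The rewritten allow-list relies on a subtle gitignore rule: git never descends into a directory that is itself ignored, so the blanket `*` alone would hide every nested `bert4torch_config.json`. The new `!*/` line re-includes directories so that `!bert4torch_config.json` can reach files at any depth, and `Tongjilibo`, `__tensorflow_weights`, and `maidalun1020` are then re-ignored explicitly. A quick way to sanity-check the result with `git check-ignore` (the paths below are hypothetical examples, not files guaranteed to exist in the repo):

```python
# Sanity check of the new .gitignore rules via `git check-ignore`.
# `git check-ignore -q <path>` exits 0 when the path is ignored, 1 when not.
import subprocess

def is_ignored(path: str) -> bool:
    return subprocess.run(['git', 'check-ignore', '-q', path]).returncode == 0

# Kept: matched by !bert4torch_config.json (reachable thanks to !*/)
assert not is_ignored('Qwen/Qwen3-0.6B/bert4torch_config.json')
# Dropped: only the bert4torch configs are tracked, not weights
assert is_ignored('Qwen/Qwen3-0.6B/model.safetensors')
# Dropped: the directory is re-ignored explicitly, last match wins
assert is_ignored('Tongjilibo/some_file.json')
```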
Qwen/Qwen3-0.6B/bert4torch_config.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "model": "qwen3",
+ "hidden_act": "silu",
+ "attention_probs_dropout_prob": 0.0,
+ "bos_token_id": 151643,
+ "eos_token_id": 151645,
+ "attention_head_size": 128,
+ "attention_key_size": 128,
+ "intermediate_size": 3072,
+ "initializer_range": 0.02,
+ "layer_norm_eps": 1e-06,
+ "hidden_size": 1024,
+ "num_attention_heads": 16,
+ "num_hidden_layers": 28,
+ "num_key_value_heads": 8,
+ "rope_theta": 1000000.0,
+ "tie_word_embeddings": true,
+ "torch_dtype": "bfloat16",
+ "_attn_implementation": "sdpa",
+ "vocab_size": 151936,
+ "segment_vocab_size": 0,
+ "skip_init": true,
+ "rope_rank": "updown",
+ "max_position_embeddings": 40960,
+ "sliding_window": null,
+ "max_window_layers": 28,
+ "convert_lm_logits_dtype": "float32",
+ "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643, 151645],
+ "max_length": 40960}
+ }
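These bert4torch_config.json files are what bert4torch reads in place of the upstream config.json. A minimal sketch of loading Qwen3-0.6B through bert4torch with the config above (the local path layout and the plain forward call are assumptions; the weights themselves still come from the upstream Qwen/Qwen3-0.6B repo):

```python
# Minimal sketch: build Qwen3-0.6B from the config added in this commit.
# `root` is a hypothetical local directory holding the upstream HF weights
# plus this repo's bert4torch_config.json.
import torch
from transformers import AutoTokenizer
from bert4torch.models import build_transformer_model

root = './Qwen3-0.6B'
model = build_transformer_model(
    config_path=f'{root}/bert4torch_config.json',  # file from this commit
    checkpoint_path=root,                          # upstream weight shards
)

tokenizer = AutoTokenizer.from_pretrained(root)
input_ids = torch.tensor([tokenizer.encode('hello')])
with torch.no_grad():
    logits = model(input_ids)  # lm logits, float32 per convert_lm_logits_dtype
print(logits.shape)            # expect (1, seq_len, 151936)
```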
Qwen/Qwen3-1.7B/bert4torch_config.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "model": "qwen3",
+ "hidden_act": "silu",
+ "attention_probs_dropout_prob": 0.0,
+ "bos_token_id": 151643,
+ "eos_token_id": 151645,
+ "attention_head_size": 128,
+ "attention_key_size": 128,
+ "intermediate_size": 6144,
+ "initializer_range": 0.02,
+ "layer_norm_eps": 1e-06,
+ "hidden_size": 2048,
+ "num_attention_heads": 16,
+ "num_hidden_layers": 28,
+ "num_key_value_heads": 8,
+ "rope_theta": 1000000.0,
+ "tie_word_embeddings": true,
+ "torch_dtype": "bfloat16",
+ "_attn_implementation": "sdpa",
+ "vocab_size": 151936,
+ "segment_vocab_size": 0,
+ "skip_init": true,
+ "rope_rank": "updown",
+ "max_position_embeddings": 40960,
+ "sliding_window": null,
+ "max_window_layers": 28,
+ "convert_lm_logits_dtype": "float32",
+ "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643, 151645],
+ "max_length": 40960}
+ }
Qwen/Qwen3-14B/bert4torch_config.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "model": "qwen3",
+ "hidden_act": "silu",
+ "attention_probs_dropout_prob": 0.0,
+ "bos_token_id": 151643,
+ "eos_token_id": 151645,
+ "attention_head_size": 128,
+ "attention_key_size": 128,
+ "intermediate_size": 17408,
+ "initializer_range": 0.02,
+ "layer_norm_eps": 1e-06,
+ "hidden_size": 5120,
+ "num_attention_heads": 40,
+ "num_hidden_layers": 40,
+ "num_key_value_heads": 8,
+ "rope_theta": 1000000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "_attn_implementation": "sdpa",
+ "vocab_size": 151936,
+ "segment_vocab_size": 0,
+ "skip_init": true,
+ "rope_rank": "updown",
+ "max_position_embeddings": 40960,
+ "sliding_window": null,
+ "max_window_layers": 40,
+ "convert_lm_logits_dtype": "float32",
+ "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643, 151645],
+ "max_length": 40960}
+ }
Qwen/Qwen3-32B/bert4torch_config.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "model": "qwen3",
+ "hidden_act": "silu",
+ "attention_probs_dropout_prob": 0.0,
+ "bos_token_id": 151643,
+ "eos_token_id": 151645,
+ "attention_head_size": 128,
+ "attention_key_size": 128,
+ "intermediate_size": 25600,
+ "initializer_range": 0.02,
+ "layer_norm_eps": 1e-06,
+ "hidden_size": 5120,
+ "num_attention_heads": 64,
+ "num_hidden_layers": 64,
+ "num_key_value_heads": 8,
+ "rope_theta": 1000000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "_attn_implementation": "sdpa",
+ "vocab_size": 151936,
+ "segment_vocab_size": 0,
+ "skip_init": true,
+ "rope_rank": "updown",
+ "max_position_embeddings": 40960,
+ "sliding_window": null,
+ "max_window_layers": 64,
+ "convert_lm_logits_dtype": "float32",
+ "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643, 151645],
+ "max_length": 40960}
+ }
Qwen/Qwen3-4B/bert4torch_config.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "model": "qwen3",
+ "hidden_act": "silu",
+ "attention_probs_dropout_prob": 0.0,
+ "bos_token_id": 151643,
+ "eos_token_id": 151645,
+ "attention_head_size": 128,
+ "attention_key_size": 128,
+ "intermediate_size": 9728,
+ "initializer_range": 0.02,
+ "layer_norm_eps": 1e-06,
+ "hidden_size": 2560,
+ "num_attention_heads": 32,
+ "num_hidden_layers": 36,
+ "num_key_value_heads": 8,
+ "rope_theta": 1000000.0,
+ "tie_word_embeddings": true,
+ "torch_dtype": "bfloat16",
+ "_attn_implementation": "sdpa",
+ "vocab_size": 151936,
+ "segment_vocab_size": 0,
+ "skip_init": true,
+ "rope_rank": "updown",
+ "max_position_embeddings": 40960,
+ "sliding_window": null,
+ "max_window_layers": 36,
+ "convert_lm_logits_dtype": "float32",
+ "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643, 151645],
+ "max_length": 40960}
+ }
Qwen/Qwen3-8B/bert4torch_config.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "model": "qwen3",
+ "hidden_act": "silu",
+ "attention_probs_dropout_prob": 0.0,
+ "bos_token_id": 151643,
+ "eos_token_id": 151645,
+ "attention_head_size": 128,
+ "attention_key_size": 128,
+ "intermediate_size": 12288,
+ "initializer_range": 0.02,
+ "layer_norm_eps": 1e-06,
+ "hidden_size": 4096,
+ "num_attention_heads": 32,
+ "num_hidden_layers": 36,
+ "num_key_value_heads": 8,
+ "rope_theta": 1000000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "_attn_implementation": "sdpa",
+ "vocab_size": 151936,
+ "segment_vocab_size": 0,
+ "skip_init": true,
+ "rope_rank": "updown",
+ "max_position_embeddings": 40960,
+ "sliding_window": null,
+ "max_window_layers": 36,
+ "convert_lm_logits_dtype": "float32",
+ "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643, 151645],
+ "max_length": 40960}
+ }
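Across the six sizes, the configs differ only in the width/depth fields (hidden_size, num_attention_heads, num_hidden_layers, intermediate_size, max_window_layers) and in tie_word_embeddings (true for 0.6B/1.7B/4B, false for 8B/14B/32B); everything else, including the token ids, vocab size, and generation settings, is shared. A small sketch to confirm that from the files themselves (run from the repo root):

```python
# Sketch: list the fields that differ between two of the added configs.
import json

with open('Qwen/Qwen3-0.6B/bert4torch_config.json') as f:
    small = json.load(f)
with open('Qwen/Qwen3-32B/bert4torch_config.json') as f:
    large = json.load(f)

for key in sorted(small):
    if small[key] != large[key]:
        print(f'{key}: {small[key]} -> {large[key]}')
# Expected output covers exactly: hidden_size, intermediate_size,
# max_window_layers, num_attention_heads, num_hidden_layers,
# tie_word_embeddings
```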