Tongjilibo commited on
Commit
0dc73a9
·
1 Parent(s): d32cec4

format json格式

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. 01-ai/Yi-1.5-6B-Chat/bert4torch_config.json +30 -24
  2. 01-ai/Yi-1.5-6B/bert4torch_config.json +29 -23
  3. 01-ai/Yi-1.5-9B-32K/bert4torch_config.json +29 -23
  4. 01-ai/Yi-1.5-9B-Chat-16K/bert4torch_config.json +29 -24
  5. 01-ai/Yi-1.5-9B-Chat/bert4torch_config.json +30 -24
  6. 01-ai/Yi-1.5-9B/bert4torch_config.json +29 -23
  7. 01-ai/Yi-6B-200K/bert4torch_config.json +322 -316
  8. 01-ai/Yi-6B/bert4torch_config.json +322 -316
  9. 01-ai/Yi-9B-200K/bert4torch_config.json +466 -461
  10. 01-ai/Yi-9B/bert4torch_config.json +466 -460
  11. BAAI/bge-base-en-v1.5/bert4torch_config.json +224 -222
  12. BAAI/bge-base-zh-v1.5/bert4torch_config.json +224 -222
  13. BAAI/bge-large-en-v1.5/bert4torch_config.json +414 -412
  14. BAAI/bge-large-zh-v1.5/bert4torch_config.json +416 -414
  15. BAAI/bge-small-en-v1.5/bert4torch_config.json +222 -220
  16. BAAI/bge-small-zh-v1.5/bert4torch_config.json +222 -220
  17. BelleGroup/BELLE-LLaMA-7B-2M-enc/bert4torch_config.json +8 -2
  18. ClueAI/ChatYuan-large-v1/bert4torch_config.json +16 -16
  19. ClueAI/ChatYuan-large-v2/bert4torch_config.json +17 -17
  20. ClueAI/PromptCLUE-base-v1-5/bert4torch_config.json +15 -15
  21. ClueAI/PromptCLUE-base/bert4torch_config.json +17 -17
  22. FacebookAI/roberta-base/bert4torch_config.json +225 -226
  23. IDEA-CCNL/Erlangshen-DeBERTa-v2-320M-Chinese/bert4torch_config.json +24 -24
  24. IDEA-CCNL/Erlangshen-DeBERTa-v2-710M-Chinese/bert4torch_config.json +31 -31
  25. IDEA-CCNL/Erlangshen-DeBERTa-v2-97M-Chinese/bert4torch_config.json +23 -23
  26. IDEA-CCNL/Ziya-LLaMA-13B-v1.1/bert4torch_config.json +27 -21
  27. IDEA-CCNL/Ziya-LLaMA-13B-v1/bert4torch_config.json +27 -21
  28. OpenGVLab/InternVL2_5-1B/bert4torch_config.json +85 -78
  29. Qwen/Qwen-14B-Chat/bert4torch_config.json +39 -24
  30. Qwen/Qwen-14B/bert4torch_config.json +38 -25
  31. Qwen/Qwen-1_8B-Chat/bert4torch_config.json +40 -25
  32. Qwen/Qwen-1_8B/bert4torch_config.json +39 -26
  33. Qwen/Qwen-7B-Chat/bert4torch_config.json +40 -25
  34. Qwen/Qwen-7B/bert4torch_config.json +39 -26
  35. Qwen/Qwen1.5-0.5B-Chat/bert4torch_config.json +36 -28
  36. Qwen/Qwen1.5-0.5B/bert4torch_config.json +36 -29
  37. Qwen/Qwen1.5-1.8B-Chat/bert4torch_config.json +36 -28
  38. Qwen/Qwen1.5-1.8B/bert4torch_config.json +36 -29
  39. Qwen/Qwen1.5-14B-Chat/bert4torch_config.json +36 -28
  40. Qwen/Qwen1.5-14B/bert4torch_config.json +36 -29
  41. Qwen/Qwen1.5-7B-Chat/bert4torch_config.json +36 -28
  42. Qwen/Qwen1.5-7B/bert4torch_config.json +36 -29
  43. Qwen/Qwen2-0.5B-Instruct/bert4torch_config.json +36 -28
  44. Qwen/Qwen2-0.5B/bert4torch_config.json +36 -29
  45. Qwen/Qwen2-1.5B-Instruct/bert4torch_config.json +36 -28
  46. Qwen/Qwen2-1.5B/bert4torch_config.json +36 -29
  47. Qwen/Qwen2-7B-Instruct/bert4torch_config.json +36 -28
  48. Qwen/Qwen2-7B/bert4torch_config.json +36 -29
  49. Qwen/Qwen2-VL-2B-Instruct/bert4torch_config.json +30 -26
  50. Qwen/Qwen2-VL-7B-Instruct/bert4torch_config.json +67 -63
01-ai/Yi-1.5-6B-Chat/bert4torch_config.json CHANGED
@@ -1,24 +1,30 @@
1
- {
2
- "bos_token_id": 1,
3
- "eos_token_id": 2,
4
- "hidden_act": "silu",
5
- "hidden_size": 4096,
6
- "initializer_range": 0.02,
7
- "intermediate_size": 11008,
8
- "max_position_embeddings": 4096,
9
- "model": "llama",
10
- "template": "llama3",
11
- "num_attention_heads": 32,
12
- "num_hidden_layers": 32,
13
- "num_key_value_heads": 4,
14
- "pad_token_id": 0,
15
- "layer_norm_eps": 1e-06,
16
- "rope_theta": 5000000.0,
17
- "tie_word_embeddings": false,
18
- "torch_dtype": "bfloat16",
19
- "vocab_size": 64000,
20
- "skip_init": true,
21
- "rope_rank": "updown",
22
- "segment_vocab_size": 0,
23
- "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 4096, "eos_token_id": 7}
24
- }
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": 2,
4
+ "hidden_act": "silu",
5
+ "hidden_size": 4096,
6
+ "initializer_range": 0.02,
7
+ "intermediate_size": 11008,
8
+ "max_position_embeddings": 4096,
9
+ "model": "llama",
10
+ "template": "llama3",
11
+ "num_attention_heads": 32,
12
+ "num_hidden_layers": 32,
13
+ "num_key_value_heads": 4,
14
+ "pad_token_id": 0,
15
+ "layer_norm_eps": 1e-06,
16
+ "rope_theta": 5000000.0,
17
+ "tie_word_embeddings": false,
18
+ "torch_dtype": "bfloat16",
19
+ "vocab_size": 64000,
20
+ "skip_init": true,
21
+ "rope_rank": "updown",
22
+ "segment_vocab_size": 0,
23
+ "generation_config": {
24
+ "tokenizer_decode_config": {
25
+ "skip_special_tokens": true
26
+ },
27
+ "max_length": 4096,
28
+ "eos_token_id": 7
29
+ }
30
+ }
01-ai/Yi-1.5-6B/bert4torch_config.json CHANGED
@@ -1,23 +1,29 @@
1
- {
2
- "bos_token_id": 1,
3
- "eos_token_id": 2,
4
- "hidden_act": "silu",
5
- "hidden_size": 4096,
6
- "initializer_range": 0.02,
7
- "intermediate_size": 11008,
8
- "max_position_embeddings": 4096,
9
- "model": "llama",
10
- "num_attention_heads": 32,
11
- "num_hidden_layers": 32,
12
- "num_key_value_heads": 4,
13
- "pad_token_id": 0,
14
- "layer_norm_eps": 1e-06,
15
- "rope_theta": 5000000.0,
16
- "tie_word_embeddings": false,
17
- "torch_dtype": "bfloat16",
18
- "vocab_size": 64000,
19
- "skip_init": true,
20
- "rope_rank": "updown",
21
- "segment_vocab_size": 0,
22
- "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 4096, "eos_token_id": 2}
23
- }
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": 2,
4
+ "hidden_act": "silu",
5
+ "hidden_size": 4096,
6
+ "initializer_range": 0.02,
7
+ "intermediate_size": 11008,
8
+ "max_position_embeddings": 4096,
9
+ "model": "llama",
10
+ "num_attention_heads": 32,
11
+ "num_hidden_layers": 32,
12
+ "num_key_value_heads": 4,
13
+ "pad_token_id": 0,
14
+ "layer_norm_eps": 1e-06,
15
+ "rope_theta": 5000000.0,
16
+ "tie_word_embeddings": false,
17
+ "torch_dtype": "bfloat16",
18
+ "vocab_size": 64000,
19
+ "skip_init": true,
20
+ "rope_rank": "updown",
21
+ "segment_vocab_size": 0,
22
+ "generation_config": {
23
+ "tokenizer_decode_config": {
24
+ "skip_special_tokens": true
25
+ },
26
+ "max_length": 4096,
27
+ "eos_token_id": 2
28
+ }
29
+ }
01-ai/Yi-1.5-9B-32K/bert4torch_config.json CHANGED
@@ -1,23 +1,29 @@
1
- {
2
- "bos_token_id": 1,
3
- "eos_token_id": 2,
4
- "hidden_act": "silu",
5
- "hidden_size": 4096,
6
- "initializer_range": 0.02,
7
- "intermediate_size": 11008,
8
- "max_position_embeddings": 32768,
9
- "model": "llama",
10
- "num_attention_heads": 32,
11
- "num_hidden_layers": 48,
12
- "num_key_value_heads": 4,
13
- "pad_token_id": 0,
14
- "layer_norm_eps": 1e-06,
15
- "rope_theta": 5000000.0,
16
- "tie_word_embeddings": false,
17
- "torch_dtype": "bfloat16",
18
- "vocab_size": 64000,
19
- "skip_init": true,
20
- "rope_rank": "updown",
21
- "segment_vocab_size": 0,
22
- "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 32768, "eos_token_id": 2}
23
- }
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": 2,
4
+ "hidden_act": "silu",
5
+ "hidden_size": 4096,
6
+ "initializer_range": 0.02,
7
+ "intermediate_size": 11008,
8
+ "max_position_embeddings": 32768,
9
+ "model": "llama",
10
+ "num_attention_heads": 32,
11
+ "num_hidden_layers": 48,
12
+ "num_key_value_heads": 4,
13
+ "pad_token_id": 0,
14
+ "layer_norm_eps": 1e-06,
15
+ "rope_theta": 5000000.0,
16
+ "tie_word_embeddings": false,
17
+ "torch_dtype": "bfloat16",
18
+ "vocab_size": 64000,
19
+ "skip_init": true,
20
+ "rope_rank": "updown",
21
+ "segment_vocab_size": 0,
22
+ "generation_config": {
23
+ "tokenizer_decode_config": {
24
+ "skip_special_tokens": true
25
+ },
26
+ "max_length": 32768,
27
+ "eos_token_id": 2
28
+ }
29
+ }
01-ai/Yi-1.5-9B-Chat-16K/bert4torch_config.json CHANGED
@@ -1,25 +1,30 @@
1
  {
2
- "bos_token_id": 1,
3
- "eos_token_id": 2,
4
- "hidden_act": "silu",
5
- "hidden_size": 4096,
6
- "initializer_range": 0.02,
7
- "intermediate_size": 11008,
8
- "max_position_embeddings": 16384,
9
- "model": "llama",
10
- "template": "llama3",
11
- "num_attention_heads": 32,
12
- "num_hidden_layers": 48,
13
- "num_key_value_heads": 4,
14
- "pad_token_id": 0,
15
- "layer_norm_eps": 1e-06,
16
- "rope_theta": 5000000.0,
17
- "tie_word_embeddings": false,
18
- "torch_dtype": "bfloat16",
19
- "vocab_size": 64000,
20
- "skip_init": true,
21
- "rope_rank": "updown",
22
- "segment_vocab_size": 0,
23
- "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 16384, "eos_token_id": 7}
24
- }
25
-
 
 
 
 
 
 
1
  {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": 2,
4
+ "hidden_act": "silu",
5
+ "hidden_size": 4096,
6
+ "initializer_range": 0.02,
7
+ "intermediate_size": 11008,
8
+ "max_position_embeddings": 16384,
9
+ "model": "llama",
10
+ "template": "llama3",
11
+ "num_attention_heads": 32,
12
+ "num_hidden_layers": 48,
13
+ "num_key_value_heads": 4,
14
+ "pad_token_id": 0,
15
+ "layer_norm_eps": 1e-06,
16
+ "rope_theta": 5000000.0,
17
+ "tie_word_embeddings": false,
18
+ "torch_dtype": "bfloat16",
19
+ "vocab_size": 64000,
20
+ "skip_init": true,
21
+ "rope_rank": "updown",
22
+ "segment_vocab_size": 0,
23
+ "generation_config": {
24
+ "tokenizer_decode_config": {
25
+ "skip_special_tokens": true
26
+ },
27
+ "max_length": 16384,
28
+ "eos_token_id": 7
29
+ }
30
+ }
01-ai/Yi-1.5-9B-Chat/bert4torch_config.json CHANGED
@@ -1,24 +1,30 @@
1
- {
2
- "bos_token_id": 1,
3
- "eos_token_id": 2,
4
- "hidden_act": "silu",
5
- "hidden_size": 4096,
6
- "initializer_range": 0.02,
7
- "intermediate_size": 11008,
8
- "max_position_embeddings": 4096,
9
- "model": "llama",
10
- "template": "llama3",
11
- "num_attention_heads": 32,
12
- "num_hidden_layers": 48,
13
- "num_key_value_heads": 4,
14
- "pad_token_id": 0,
15
- "layer_norm_eps": 1e-06,
16
- "rope_theta": 5000000.0,
17
- "tie_word_embeddings": false,
18
- "torch_dtype": "bfloat16",
19
- "vocab_size": 64000,
20
- "skip_init": true,
21
- "rope_rank": "updown",
22
- "segment_vocab_size": 0,
23
- "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 4096, "eos_token_id": 7}
24
- }
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": 2,
4
+ "hidden_act": "silu",
5
+ "hidden_size": 4096,
6
+ "initializer_range": 0.02,
7
+ "intermediate_size": 11008,
8
+ "max_position_embeddings": 4096,
9
+ "model": "llama",
10
+ "template": "llama3",
11
+ "num_attention_heads": 32,
12
+ "num_hidden_layers": 48,
13
+ "num_key_value_heads": 4,
14
+ "pad_token_id": 0,
15
+ "layer_norm_eps": 1e-06,
16
+ "rope_theta": 5000000.0,
17
+ "tie_word_embeddings": false,
18
+ "torch_dtype": "bfloat16",
19
+ "vocab_size": 64000,
20
+ "skip_init": true,
21
+ "rope_rank": "updown",
22
+ "segment_vocab_size": 0,
23
+ "generation_config": {
24
+ "tokenizer_decode_config": {
25
+ "skip_special_tokens": true
26
+ },
27
+ "max_length": 4096,
28
+ "eos_token_id": 7
29
+ }
30
+ }
01-ai/Yi-1.5-9B/bert4torch_config.json CHANGED
@@ -1,23 +1,29 @@
1
- {
2
- "bos_token_id": 1,
3
- "eos_token_id": 2,
4
- "hidden_act": "silu",
5
- "hidden_size": 4096,
6
- "initializer_range": 0.02,
7
- "intermediate_size": 11008,
8
- "max_position_embeddings": 4096,
9
- "model": "llama",
10
- "num_attention_heads": 32,
11
- "num_hidden_layers": 48,
12
- "num_key_value_heads": 4,
13
- "pad_token_id": 0,
14
- "layer_norm_eps": 1e-06,
15
- "rope_theta": 5000000.0,
16
- "tie_word_embeddings": false,
17
- "torch_dtype": "bfloat16",
18
- "vocab_size": 64000,
19
- "skip_init": true,
20
- "rope_rank": "updown",
21
- "segment_vocab_size": 0,
22
- "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 4096, "eos_token_id": 2}
23
- }
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": 2,
4
+ "hidden_act": "silu",
5
+ "hidden_size": 4096,
6
+ "initializer_range": 0.02,
7
+ "intermediate_size": 11008,
8
+ "max_position_embeddings": 4096,
9
+ "model": "llama",
10
+ "num_attention_heads": 32,
11
+ "num_hidden_layers": 48,
12
+ "num_key_value_heads": 4,
13
+ "pad_token_id": 0,
14
+ "layer_norm_eps": 1e-06,
15
+ "rope_theta": 5000000.0,
16
+ "tie_word_embeddings": false,
17
+ "torch_dtype": "bfloat16",
18
+ "vocab_size": 64000,
19
+ "skip_init": true,
20
+ "rope_rank": "updown",
21
+ "segment_vocab_size": 0,
22
+ "generation_config": {
23
+ "tokenizer_decode_config": {
24
+ "skip_special_tokens": true
25
+ },
26
+ "max_length": 4096,
27
+ "eos_token_id": 2
28
+ }
29
+ }
01-ai/Yi-6B-200K/bert4torch_config.json CHANGED
@@ -1,316 +1,322 @@
1
- {
2
- "bos_token_id": 1,
3
- "eos_token_id": 2,
4
- "hidden_act": "silu",
5
- "hidden_size": 4096,
6
- "initializer_range": 0.02,
7
- "intermediate_size": 11008,
8
- "max_position_embeddings": 200000,
9
- "model": "llama",
10
- "num_attention_heads": 32,
11
- "num_hidden_layers": 32,
12
- "num_key_value_heads": 4,
13
- "pad_token_id": 0,
14
- "layer_norm_eps": 1e-05,
15
- "rope_theta": 10000000.0,
16
- "tie_word_embeddings": false,
17
- "torch_dtype": "bfloat16",
18
- "vocab_size": 64000,
19
- "skip_init": true,
20
- "rope_rank": "updown",
21
- "segment_vocab_size": 0,
22
- "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 200000, "eos_token_id": 2},
23
- "mapping": {
24
- "embeddings.word_embeddings.weight": "model.embed_tokens.weight",
25
- "LayerNormFinal.weight": "model.norm.weight",
26
- "lm_head.weight": "lm_head.weight",
27
- "decoderLayer.0.multiHeadAttention.q.weight": "model.layers.0.self_attn.q_proj.weight",
28
- "decoderLayer.0.multiHeadAttention.k.weight": "model.layers.0.self_attn.k_proj.weight",
29
- "decoderLayer.0.multiHeadAttention.v.weight": "model.layers.0.self_attn.v_proj.weight",
30
- "decoderLayer.0.multiHeadAttention.o.weight": "model.layers.0.self_attn.o_proj.weight",
31
- "decoderLayer.0.attnLayerNorm.weight": "model.layers.0.ln1.weight",
32
- "decoderLayer.0.feedForward.intermediateDense.weight": "model.layers.0.mlp.gate_proj.weight",
33
- "decoderLayer.0.feedForward.outputDense.weight": "model.layers.0.mlp.down_proj.weight",
34
- "decoderLayer.0.ffnLayerNorm.weight": "model.layers.0.ln2.weight",
35
- "decoderLayer.0.feedForward.intermediateDense2.weight": "model.layers.0.mlp.up_proj.weight",
36
- "decoderLayer.1.multiHeadAttention.q.weight": "model.layers.1.self_attn.q_proj.weight",
37
- "decoderLayer.1.multiHeadAttention.k.weight": "model.layers.1.self_attn.k_proj.weight",
38
- "decoderLayer.1.multiHeadAttention.v.weight": "model.layers.1.self_attn.v_proj.weight",
39
- "decoderLayer.1.multiHeadAttention.o.weight": "model.layers.1.self_attn.o_proj.weight",
40
- "decoderLayer.1.attnLayerNorm.weight": "model.layers.1.ln1.weight",
41
- "decoderLayer.1.feedForward.intermediateDense.weight": "model.layers.1.mlp.gate_proj.weight",
42
- "decoderLayer.1.feedForward.outputDense.weight": "model.layers.1.mlp.down_proj.weight",
43
- "decoderLayer.1.ffnLayerNorm.weight": "model.layers.1.ln2.weight",
44
- "decoderLayer.1.feedForward.intermediateDense2.weight": "model.layers.1.mlp.up_proj.weight",
45
- "decoderLayer.2.multiHeadAttention.q.weight": "model.layers.2.self_attn.q_proj.weight",
46
- "decoderLayer.2.multiHeadAttention.k.weight": "model.layers.2.self_attn.k_proj.weight",
47
- "decoderLayer.2.multiHeadAttention.v.weight": "model.layers.2.self_attn.v_proj.weight",
48
- "decoderLayer.2.multiHeadAttention.o.weight": "model.layers.2.self_attn.o_proj.weight",
49
- "decoderLayer.2.attnLayerNorm.weight": "model.layers.2.ln1.weight",
50
- "decoderLayer.2.feedForward.intermediateDense.weight": "model.layers.2.mlp.gate_proj.weight",
51
- "decoderLayer.2.feedForward.outputDense.weight": "model.layers.2.mlp.down_proj.weight",
52
- "decoderLayer.2.ffnLayerNorm.weight": "model.layers.2.ln2.weight",
53
- "decoderLayer.2.feedForward.intermediateDense2.weight": "model.layers.2.mlp.up_proj.weight",
54
- "decoderLayer.3.multiHeadAttention.q.weight": "model.layers.3.self_attn.q_proj.weight",
55
- "decoderLayer.3.multiHeadAttention.k.weight": "model.layers.3.self_attn.k_proj.weight",
56
- "decoderLayer.3.multiHeadAttention.v.weight": "model.layers.3.self_attn.v_proj.weight",
57
- "decoderLayer.3.multiHeadAttention.o.weight": "model.layers.3.self_attn.o_proj.weight",
58
- "decoderLayer.3.attnLayerNorm.weight": "model.layers.3.ln1.weight",
59
- "decoderLayer.3.feedForward.intermediateDense.weight": "model.layers.3.mlp.gate_proj.weight",
60
- "decoderLayer.3.feedForward.outputDense.weight": "model.layers.3.mlp.down_proj.weight",
61
- "decoderLayer.3.ffnLayerNorm.weight": "model.layers.3.ln2.weight",
62
- "decoderLayer.3.feedForward.intermediateDense2.weight": "model.layers.3.mlp.up_proj.weight",
63
- "decoderLayer.4.multiHeadAttention.q.weight": "model.layers.4.self_attn.q_proj.weight",
64
- "decoderLayer.4.multiHeadAttention.k.weight": "model.layers.4.self_attn.k_proj.weight",
65
- "decoderLayer.4.multiHeadAttention.v.weight": "model.layers.4.self_attn.v_proj.weight",
66
- "decoderLayer.4.multiHeadAttention.o.weight": "model.layers.4.self_attn.o_proj.weight",
67
- "decoderLayer.4.attnLayerNorm.weight": "model.layers.4.ln1.weight",
68
- "decoderLayer.4.feedForward.intermediateDense.weight": "model.layers.4.mlp.gate_proj.weight",
69
- "decoderLayer.4.feedForward.outputDense.weight": "model.layers.4.mlp.down_proj.weight",
70
- "decoderLayer.4.ffnLayerNorm.weight": "model.layers.4.ln2.weight",
71
- "decoderLayer.4.feedForward.intermediateDense2.weight": "model.layers.4.mlp.up_proj.weight",
72
- "decoderLayer.5.multiHeadAttention.q.weight": "model.layers.5.self_attn.q_proj.weight",
73
- "decoderLayer.5.multiHeadAttention.k.weight": "model.layers.5.self_attn.k_proj.weight",
74
- "decoderLayer.5.multiHeadAttention.v.weight": "model.layers.5.self_attn.v_proj.weight",
75
- "decoderLayer.5.multiHeadAttention.o.weight": "model.layers.5.self_attn.o_proj.weight",
76
- "decoderLayer.5.attnLayerNorm.weight": "model.layers.5.ln1.weight",
77
- "decoderLayer.5.feedForward.intermediateDense.weight": "model.layers.5.mlp.gate_proj.weight",
78
- "decoderLayer.5.feedForward.outputDense.weight": "model.layers.5.mlp.down_proj.weight",
79
- "decoderLayer.5.ffnLayerNorm.weight": "model.layers.5.ln2.weight",
80
- "decoderLayer.5.feedForward.intermediateDense2.weight": "model.layers.5.mlp.up_proj.weight",
81
- "decoderLayer.6.multiHeadAttention.q.weight": "model.layers.6.self_attn.q_proj.weight",
82
- "decoderLayer.6.multiHeadAttention.k.weight": "model.layers.6.self_attn.k_proj.weight",
83
- "decoderLayer.6.multiHeadAttention.v.weight": "model.layers.6.self_attn.v_proj.weight",
84
- "decoderLayer.6.multiHeadAttention.o.weight": "model.layers.6.self_attn.o_proj.weight",
85
- "decoderLayer.6.attnLayerNorm.weight": "model.layers.6.ln1.weight",
86
- "decoderLayer.6.feedForward.intermediateDense.weight": "model.layers.6.mlp.gate_proj.weight",
87
- "decoderLayer.6.feedForward.outputDense.weight": "model.layers.6.mlp.down_proj.weight",
88
- "decoderLayer.6.ffnLayerNorm.weight": "model.layers.6.ln2.weight",
89
- "decoderLayer.6.feedForward.intermediateDense2.weight": "model.layers.6.mlp.up_proj.weight",
90
- "decoderLayer.7.multiHeadAttention.q.weight": "model.layers.7.self_attn.q_proj.weight",
91
- "decoderLayer.7.multiHeadAttention.k.weight": "model.layers.7.self_attn.k_proj.weight",
92
- "decoderLayer.7.multiHeadAttention.v.weight": "model.layers.7.self_attn.v_proj.weight",
93
- "decoderLayer.7.multiHeadAttention.o.weight": "model.layers.7.self_attn.o_proj.weight",
94
- "decoderLayer.7.attnLayerNorm.weight": "model.layers.7.ln1.weight",
95
- "decoderLayer.7.feedForward.intermediateDense.weight": "model.layers.7.mlp.gate_proj.weight",
96
- "decoderLayer.7.feedForward.outputDense.weight": "model.layers.7.mlp.down_proj.weight",
97
- "decoderLayer.7.ffnLayerNorm.weight": "model.layers.7.ln2.weight",
98
- "decoderLayer.7.feedForward.intermediateDense2.weight": "model.layers.7.mlp.up_proj.weight",
99
- "decoderLayer.8.multiHeadAttention.q.weight": "model.layers.8.self_attn.q_proj.weight",
100
- "decoderLayer.8.multiHeadAttention.k.weight": "model.layers.8.self_attn.k_proj.weight",
101
- "decoderLayer.8.multiHeadAttention.v.weight": "model.layers.8.self_attn.v_proj.weight",
102
- "decoderLayer.8.multiHeadAttention.o.weight": "model.layers.8.self_attn.o_proj.weight",
103
- "decoderLayer.8.attnLayerNorm.weight": "model.layers.8.ln1.weight",
104
- "decoderLayer.8.feedForward.intermediateDense.weight": "model.layers.8.mlp.gate_proj.weight",
105
- "decoderLayer.8.feedForward.outputDense.weight": "model.layers.8.mlp.down_proj.weight",
106
- "decoderLayer.8.ffnLayerNorm.weight": "model.layers.8.ln2.weight",
107
- "decoderLayer.8.feedForward.intermediateDense2.weight": "model.layers.8.mlp.up_proj.weight",
108
- "decoderLayer.9.multiHeadAttention.q.weight": "model.layers.9.self_attn.q_proj.weight",
109
- "decoderLayer.9.multiHeadAttention.k.weight": "model.layers.9.self_attn.k_proj.weight",
110
- "decoderLayer.9.multiHeadAttention.v.weight": "model.layers.9.self_attn.v_proj.weight",
111
- "decoderLayer.9.multiHeadAttention.o.weight": "model.layers.9.self_attn.o_proj.weight",
112
- "decoderLayer.9.attnLayerNorm.weight": "model.layers.9.ln1.weight",
113
- "decoderLayer.9.feedForward.intermediateDense.weight": "model.layers.9.mlp.gate_proj.weight",
114
- "decoderLayer.9.feedForward.outputDense.weight": "model.layers.9.mlp.down_proj.weight",
115
- "decoderLayer.9.ffnLayerNorm.weight": "model.layers.9.ln2.weight",
116
- "decoderLayer.9.feedForward.intermediateDense2.weight": "model.layers.9.mlp.up_proj.weight",
117
- "decoderLayer.10.multiHeadAttention.q.weight": "model.layers.10.self_attn.q_proj.weight",
118
- "decoderLayer.10.multiHeadAttention.k.weight": "model.layers.10.self_attn.k_proj.weight",
119
- "decoderLayer.10.multiHeadAttention.v.weight": "model.layers.10.self_attn.v_proj.weight",
120
- "decoderLayer.10.multiHeadAttention.o.weight": "model.layers.10.self_attn.o_proj.weight",
121
- "decoderLayer.10.attnLayerNorm.weight": "model.layers.10.ln1.weight",
122
- "decoderLayer.10.feedForward.intermediateDense.weight": "model.layers.10.mlp.gate_proj.weight",
123
- "decoderLayer.10.feedForward.outputDense.weight": "model.layers.10.mlp.down_proj.weight",
124
- "decoderLayer.10.ffnLayerNorm.weight": "model.layers.10.ln2.weight",
125
- "decoderLayer.10.feedForward.intermediateDense2.weight": "model.layers.10.mlp.up_proj.weight",
126
- "decoderLayer.11.multiHeadAttention.q.weight": "model.layers.11.self_attn.q_proj.weight",
127
- "decoderLayer.11.multiHeadAttention.k.weight": "model.layers.11.self_attn.k_proj.weight",
128
- "decoderLayer.11.multiHeadAttention.v.weight": "model.layers.11.self_attn.v_proj.weight",
129
- "decoderLayer.11.multiHeadAttention.o.weight": "model.layers.11.self_attn.o_proj.weight",
130
- "decoderLayer.11.attnLayerNorm.weight": "model.layers.11.ln1.weight",
131
- "decoderLayer.11.feedForward.intermediateDense.weight": "model.layers.11.mlp.gate_proj.weight",
132
- "decoderLayer.11.feedForward.outputDense.weight": "model.layers.11.mlp.down_proj.weight",
133
- "decoderLayer.11.ffnLayerNorm.weight": "model.layers.11.ln2.weight",
134
- "decoderLayer.11.feedForward.intermediateDense2.weight": "model.layers.11.mlp.up_proj.weight",
135
- "decoderLayer.12.multiHeadAttention.q.weight": "model.layers.12.self_attn.q_proj.weight",
136
- "decoderLayer.12.multiHeadAttention.k.weight": "model.layers.12.self_attn.k_proj.weight",
137
- "decoderLayer.12.multiHeadAttention.v.weight": "model.layers.12.self_attn.v_proj.weight",
138
- "decoderLayer.12.multiHeadAttention.o.weight": "model.layers.12.self_attn.o_proj.weight",
139
- "decoderLayer.12.attnLayerNorm.weight": "model.layers.12.ln1.weight",
140
- "decoderLayer.12.feedForward.intermediateDense.weight": "model.layers.12.mlp.gate_proj.weight",
141
- "decoderLayer.12.feedForward.outputDense.weight": "model.layers.12.mlp.down_proj.weight",
142
- "decoderLayer.12.ffnLayerNorm.weight": "model.layers.12.ln2.weight",
143
- "decoderLayer.12.feedForward.intermediateDense2.weight": "model.layers.12.mlp.up_proj.weight",
144
- "decoderLayer.13.multiHeadAttention.q.weight": "model.layers.13.self_attn.q_proj.weight",
145
- "decoderLayer.13.multiHeadAttention.k.weight": "model.layers.13.self_attn.k_proj.weight",
146
- "decoderLayer.13.multiHeadAttention.v.weight": "model.layers.13.self_attn.v_proj.weight",
147
- "decoderLayer.13.multiHeadAttention.o.weight": "model.layers.13.self_attn.o_proj.weight",
148
- "decoderLayer.13.attnLayerNorm.weight": "model.layers.13.ln1.weight",
149
- "decoderLayer.13.feedForward.intermediateDense.weight": "model.layers.13.mlp.gate_proj.weight",
150
- "decoderLayer.13.feedForward.outputDense.weight": "model.layers.13.mlp.down_proj.weight",
151
- "decoderLayer.13.ffnLayerNorm.weight": "model.layers.13.ln2.weight",
152
- "decoderLayer.13.feedForward.intermediateDense2.weight": "model.layers.13.mlp.up_proj.weight",
153
- "decoderLayer.14.multiHeadAttention.q.weight": "model.layers.14.self_attn.q_proj.weight",
154
- "decoderLayer.14.multiHeadAttention.k.weight": "model.layers.14.self_attn.k_proj.weight",
155
- "decoderLayer.14.multiHeadAttention.v.weight": "model.layers.14.self_attn.v_proj.weight",
156
- "decoderLayer.14.multiHeadAttention.o.weight": "model.layers.14.self_attn.o_proj.weight",
157
- "decoderLayer.14.attnLayerNorm.weight": "model.layers.14.ln1.weight",
158
- "decoderLayer.14.feedForward.intermediateDense.weight": "model.layers.14.mlp.gate_proj.weight",
159
- "decoderLayer.14.feedForward.outputDense.weight": "model.layers.14.mlp.down_proj.weight",
160
- "decoderLayer.14.ffnLayerNorm.weight": "model.layers.14.ln2.weight",
161
- "decoderLayer.14.feedForward.intermediateDense2.weight": "model.layers.14.mlp.up_proj.weight",
162
- "decoderLayer.15.multiHeadAttention.q.weight": "model.layers.15.self_attn.q_proj.weight",
163
- "decoderLayer.15.multiHeadAttention.k.weight": "model.layers.15.self_attn.k_proj.weight",
164
- "decoderLayer.15.multiHeadAttention.v.weight": "model.layers.15.self_attn.v_proj.weight",
165
- "decoderLayer.15.multiHeadAttention.o.weight": "model.layers.15.self_attn.o_proj.weight",
166
- "decoderLayer.15.attnLayerNorm.weight": "model.layers.15.ln1.weight",
167
- "decoderLayer.15.feedForward.intermediateDense.weight": "model.layers.15.mlp.gate_proj.weight",
168
- "decoderLayer.15.feedForward.outputDense.weight": "model.layers.15.mlp.down_proj.weight",
169
- "decoderLayer.15.ffnLayerNorm.weight": "model.layers.15.ln2.weight",
170
- "decoderLayer.15.feedForward.intermediateDense2.weight": "model.layers.15.mlp.up_proj.weight",
171
- "decoderLayer.16.multiHeadAttention.q.weight": "model.layers.16.self_attn.q_proj.weight",
172
- "decoderLayer.16.multiHeadAttention.k.weight": "model.layers.16.self_attn.k_proj.weight",
173
- "decoderLayer.16.multiHeadAttention.v.weight": "model.layers.16.self_attn.v_proj.weight",
174
- "decoderLayer.16.multiHeadAttention.o.weight": "model.layers.16.self_attn.o_proj.weight",
175
- "decoderLayer.16.attnLayerNorm.weight": "model.layers.16.ln1.weight",
176
- "decoderLayer.16.feedForward.intermediateDense.weight": "model.layers.16.mlp.gate_proj.weight",
177
- "decoderLayer.16.feedForward.outputDense.weight": "model.layers.16.mlp.down_proj.weight",
178
- "decoderLayer.16.ffnLayerNorm.weight": "model.layers.16.ln2.weight",
179
- "decoderLayer.16.feedForward.intermediateDense2.weight": "model.layers.16.mlp.up_proj.weight",
180
- "decoderLayer.17.multiHeadAttention.q.weight": "model.layers.17.self_attn.q_proj.weight",
181
- "decoderLayer.17.multiHeadAttention.k.weight": "model.layers.17.self_attn.k_proj.weight",
182
- "decoderLayer.17.multiHeadAttention.v.weight": "model.layers.17.self_attn.v_proj.weight",
183
- "decoderLayer.17.multiHeadAttention.o.weight": "model.layers.17.self_attn.o_proj.weight",
184
- "decoderLayer.17.attnLayerNorm.weight": "model.layers.17.ln1.weight",
185
- "decoderLayer.17.feedForward.intermediateDense.weight": "model.layers.17.mlp.gate_proj.weight",
186
- "decoderLayer.17.feedForward.outputDense.weight": "model.layers.17.mlp.down_proj.weight",
187
- "decoderLayer.17.ffnLayerNorm.weight": "model.layers.17.ln2.weight",
188
- "decoderLayer.17.feedForward.intermediateDense2.weight": "model.layers.17.mlp.up_proj.weight",
189
- "decoderLayer.18.multiHeadAttention.q.weight": "model.layers.18.self_attn.q_proj.weight",
190
- "decoderLayer.18.multiHeadAttention.k.weight": "model.layers.18.self_attn.k_proj.weight",
191
- "decoderLayer.18.multiHeadAttention.v.weight": "model.layers.18.self_attn.v_proj.weight",
192
- "decoderLayer.18.multiHeadAttention.o.weight": "model.layers.18.self_attn.o_proj.weight",
193
- "decoderLayer.18.attnLayerNorm.weight": "model.layers.18.ln1.weight",
194
- "decoderLayer.18.feedForward.intermediateDense.weight": "model.layers.18.mlp.gate_proj.weight",
195
- "decoderLayer.18.feedForward.outputDense.weight": "model.layers.18.mlp.down_proj.weight",
196
- "decoderLayer.18.ffnLayerNorm.weight": "model.layers.18.ln2.weight",
197
- "decoderLayer.18.feedForward.intermediateDense2.weight": "model.layers.18.mlp.up_proj.weight",
198
- "decoderLayer.19.multiHeadAttention.q.weight": "model.layers.19.self_attn.q_proj.weight",
199
- "decoderLayer.19.multiHeadAttention.k.weight": "model.layers.19.self_attn.k_proj.weight",
200
- "decoderLayer.19.multiHeadAttention.v.weight": "model.layers.19.self_attn.v_proj.weight",
201
- "decoderLayer.19.multiHeadAttention.o.weight": "model.layers.19.self_attn.o_proj.weight",
202
- "decoderLayer.19.attnLayerNorm.weight": "model.layers.19.ln1.weight",
203
- "decoderLayer.19.feedForward.intermediateDense.weight": "model.layers.19.mlp.gate_proj.weight",
204
- "decoderLayer.19.feedForward.outputDense.weight": "model.layers.19.mlp.down_proj.weight",
205
- "decoderLayer.19.ffnLayerNorm.weight": "model.layers.19.ln2.weight",
206
- "decoderLayer.19.feedForward.intermediateDense2.weight": "model.layers.19.mlp.up_proj.weight",
207
- "decoderLayer.20.multiHeadAttention.q.weight": "model.layers.20.self_attn.q_proj.weight",
208
- "decoderLayer.20.multiHeadAttention.k.weight": "model.layers.20.self_attn.k_proj.weight",
209
- "decoderLayer.20.multiHeadAttention.v.weight": "model.layers.20.self_attn.v_proj.weight",
210
- "decoderLayer.20.multiHeadAttention.o.weight": "model.layers.20.self_attn.o_proj.weight",
211
- "decoderLayer.20.attnLayerNorm.weight": "model.layers.20.ln1.weight",
212
- "decoderLayer.20.feedForward.intermediateDense.weight": "model.layers.20.mlp.gate_proj.weight",
213
- "decoderLayer.20.feedForward.outputDense.weight": "model.layers.20.mlp.down_proj.weight",
214
- "decoderLayer.20.ffnLayerNorm.weight": "model.layers.20.ln2.weight",
215
- "decoderLayer.20.feedForward.intermediateDense2.weight": "model.layers.20.mlp.up_proj.weight",
216
- "decoderLayer.21.multiHeadAttention.q.weight": "model.layers.21.self_attn.q_proj.weight",
217
- "decoderLayer.21.multiHeadAttention.k.weight": "model.layers.21.self_attn.k_proj.weight",
218
- "decoderLayer.21.multiHeadAttention.v.weight": "model.layers.21.self_attn.v_proj.weight",
219
- "decoderLayer.21.multiHeadAttention.o.weight": "model.layers.21.self_attn.o_proj.weight",
220
- "decoderLayer.21.attnLayerNorm.weight": "model.layers.21.ln1.weight",
221
- "decoderLayer.21.feedForward.intermediateDense.weight": "model.layers.21.mlp.gate_proj.weight",
222
- "decoderLayer.21.feedForward.outputDense.weight": "model.layers.21.mlp.down_proj.weight",
223
- "decoderLayer.21.ffnLayerNorm.weight": "model.layers.21.ln2.weight",
224
- "decoderLayer.21.feedForward.intermediateDense2.weight": "model.layers.21.mlp.up_proj.weight",
225
- "decoderLayer.22.multiHeadAttention.q.weight": "model.layers.22.self_attn.q_proj.weight",
226
- "decoderLayer.22.multiHeadAttention.k.weight": "model.layers.22.self_attn.k_proj.weight",
227
- "decoderLayer.22.multiHeadAttention.v.weight": "model.layers.22.self_attn.v_proj.weight",
228
- "decoderLayer.22.multiHeadAttention.o.weight": "model.layers.22.self_attn.o_proj.weight",
229
- "decoderLayer.22.attnLayerNorm.weight": "model.layers.22.ln1.weight",
230
- "decoderLayer.22.feedForward.intermediateDense.weight": "model.layers.22.mlp.gate_proj.weight",
231
- "decoderLayer.22.feedForward.outputDense.weight": "model.layers.22.mlp.down_proj.weight",
232
- "decoderLayer.22.ffnLayerNorm.weight": "model.layers.22.ln2.weight",
233
- "decoderLayer.22.feedForward.intermediateDense2.weight": "model.layers.22.mlp.up_proj.weight",
234
- "decoderLayer.23.multiHeadAttention.q.weight": "model.layers.23.self_attn.q_proj.weight",
235
- "decoderLayer.23.multiHeadAttention.k.weight": "model.layers.23.self_attn.k_proj.weight",
236
- "decoderLayer.23.multiHeadAttention.v.weight": "model.layers.23.self_attn.v_proj.weight",
237
- "decoderLayer.23.multiHeadAttention.o.weight": "model.layers.23.self_attn.o_proj.weight",
238
- "decoderLayer.23.attnLayerNorm.weight": "model.layers.23.ln1.weight",
239
- "decoderLayer.23.feedForward.intermediateDense.weight": "model.layers.23.mlp.gate_proj.weight",
240
- "decoderLayer.23.feedForward.outputDense.weight": "model.layers.23.mlp.down_proj.weight",
241
- "decoderLayer.23.ffnLayerNorm.weight": "model.layers.23.ln2.weight",
242
- "decoderLayer.23.feedForward.intermediateDense2.weight": "model.layers.23.mlp.up_proj.weight",
243
- "decoderLayer.24.multiHeadAttention.q.weight": "model.layers.24.self_attn.q_proj.weight",
244
- "decoderLayer.24.multiHeadAttention.k.weight": "model.layers.24.self_attn.k_proj.weight",
245
- "decoderLayer.24.multiHeadAttention.v.weight": "model.layers.24.self_attn.v_proj.weight",
246
- "decoderLayer.24.multiHeadAttention.o.weight": "model.layers.24.self_attn.o_proj.weight",
247
- "decoderLayer.24.attnLayerNorm.weight": "model.layers.24.ln1.weight",
248
- "decoderLayer.24.feedForward.intermediateDense.weight": "model.layers.24.mlp.gate_proj.weight",
249
- "decoderLayer.24.feedForward.outputDense.weight": "model.layers.24.mlp.down_proj.weight",
250
- "decoderLayer.24.ffnLayerNorm.weight": "model.layers.24.ln2.weight",
251
- "decoderLayer.24.feedForward.intermediateDense2.weight": "model.layers.24.mlp.up_proj.weight",
252
- "decoderLayer.25.multiHeadAttention.q.weight": "model.layers.25.self_attn.q_proj.weight",
253
- "decoderLayer.25.multiHeadAttention.k.weight": "model.layers.25.self_attn.k_proj.weight",
254
- "decoderLayer.25.multiHeadAttention.v.weight": "model.layers.25.self_attn.v_proj.weight",
255
- "decoderLayer.25.multiHeadAttention.o.weight": "model.layers.25.self_attn.o_proj.weight",
256
- "decoderLayer.25.attnLayerNorm.weight": "model.layers.25.ln1.weight",
257
- "decoderLayer.25.feedForward.intermediateDense.weight": "model.layers.25.mlp.gate_proj.weight",
258
- "decoderLayer.25.feedForward.outputDense.weight": "model.layers.25.mlp.down_proj.weight",
259
- "decoderLayer.25.ffnLayerNorm.weight": "model.layers.25.ln2.weight",
260
- "decoderLayer.25.feedForward.intermediateDense2.weight": "model.layers.25.mlp.up_proj.weight",
261
- "decoderLayer.26.multiHeadAttention.q.weight": "model.layers.26.self_attn.q_proj.weight",
262
- "decoderLayer.26.multiHeadAttention.k.weight": "model.layers.26.self_attn.k_proj.weight",
263
- "decoderLayer.26.multiHeadAttention.v.weight": "model.layers.26.self_attn.v_proj.weight",
264
- "decoderLayer.26.multiHeadAttention.o.weight": "model.layers.26.self_attn.o_proj.weight",
265
- "decoderLayer.26.attnLayerNorm.weight": "model.layers.26.ln1.weight",
266
- "decoderLayer.26.feedForward.intermediateDense.weight": "model.layers.26.mlp.gate_proj.weight",
267
- "decoderLayer.26.feedForward.outputDense.weight": "model.layers.26.mlp.down_proj.weight",
268
- "decoderLayer.26.ffnLayerNorm.weight": "model.layers.26.ln2.weight",
269
- "decoderLayer.26.feedForward.intermediateDense2.weight": "model.layers.26.mlp.up_proj.weight",
270
- "decoderLayer.27.multiHeadAttention.q.weight": "model.layers.27.self_attn.q_proj.weight",
271
- "decoderLayer.27.multiHeadAttention.k.weight": "model.layers.27.self_attn.k_proj.weight",
272
- "decoderLayer.27.multiHeadAttention.v.weight": "model.layers.27.self_attn.v_proj.weight",
273
- "decoderLayer.27.multiHeadAttention.o.weight": "model.layers.27.self_attn.o_proj.weight",
274
- "decoderLayer.27.attnLayerNorm.weight": "model.layers.27.ln1.weight",
275
- "decoderLayer.27.feedForward.intermediateDense.weight": "model.layers.27.mlp.gate_proj.weight",
276
- "decoderLayer.27.feedForward.outputDense.weight": "model.layers.27.mlp.down_proj.weight",
277
- "decoderLayer.27.ffnLayerNorm.weight": "model.layers.27.ln2.weight",
278
- "decoderLayer.27.feedForward.intermediateDense2.weight": "model.layers.27.mlp.up_proj.weight",
279
- "decoderLayer.28.multiHeadAttention.q.weight": "model.layers.28.self_attn.q_proj.weight",
280
- "decoderLayer.28.multiHeadAttention.k.weight": "model.layers.28.self_attn.k_proj.weight",
281
- "decoderLayer.28.multiHeadAttention.v.weight": "model.layers.28.self_attn.v_proj.weight",
282
- "decoderLayer.28.multiHeadAttention.o.weight": "model.layers.28.self_attn.o_proj.weight",
283
- "decoderLayer.28.attnLayerNorm.weight": "model.layers.28.ln1.weight",
284
- "decoderLayer.28.feedForward.intermediateDense.weight": "model.layers.28.mlp.gate_proj.weight",
285
- "decoderLayer.28.feedForward.outputDense.weight": "model.layers.28.mlp.down_proj.weight",
286
- "decoderLayer.28.ffnLayerNorm.weight": "model.layers.28.ln2.weight",
287
- "decoderLayer.28.feedForward.intermediateDense2.weight": "model.layers.28.mlp.up_proj.weight",
288
- "decoderLayer.29.multiHeadAttention.q.weight": "model.layers.29.self_attn.q_proj.weight",
289
- "decoderLayer.29.multiHeadAttention.k.weight": "model.layers.29.self_attn.k_proj.weight",
290
- "decoderLayer.29.multiHeadAttention.v.weight": "model.layers.29.self_attn.v_proj.weight",
291
- "decoderLayer.29.multiHeadAttention.o.weight": "model.layers.29.self_attn.o_proj.weight",
292
- "decoderLayer.29.attnLayerNorm.weight": "model.layers.29.ln1.weight",
293
- "decoderLayer.29.feedForward.intermediateDense.weight": "model.layers.29.mlp.gate_proj.weight",
294
- "decoderLayer.29.feedForward.outputDense.weight": "model.layers.29.mlp.down_proj.weight",
295
- "decoderLayer.29.ffnLayerNorm.weight": "model.layers.29.ln2.weight",
296
- "decoderLayer.29.feedForward.intermediateDense2.weight": "model.layers.29.mlp.up_proj.weight",
297
- "decoderLayer.30.multiHeadAttention.q.weight": "model.layers.30.self_attn.q_proj.weight",
298
- "decoderLayer.30.multiHeadAttention.k.weight": "model.layers.30.self_attn.k_proj.weight",
299
- "decoderLayer.30.multiHeadAttention.v.weight": "model.layers.30.self_attn.v_proj.weight",
300
- "decoderLayer.30.multiHeadAttention.o.weight": "model.layers.30.self_attn.o_proj.weight",
301
- "decoderLayer.30.attnLayerNorm.weight": "model.layers.30.ln1.weight",
302
- "decoderLayer.30.feedForward.intermediateDense.weight": "model.layers.30.mlp.gate_proj.weight",
303
- "decoderLayer.30.feedForward.outputDense.weight": "model.layers.30.mlp.down_proj.weight",
304
- "decoderLayer.30.ffnLayerNorm.weight": "model.layers.30.ln2.weight",
305
- "decoderLayer.30.feedForward.intermediateDense2.weight": "model.layers.30.mlp.up_proj.weight",
306
- "decoderLayer.31.multiHeadAttention.q.weight": "model.layers.31.self_attn.q_proj.weight",
307
- "decoderLayer.31.multiHeadAttention.k.weight": "model.layers.31.self_attn.k_proj.weight",
308
- "decoderLayer.31.multiHeadAttention.v.weight": "model.layers.31.self_attn.v_proj.weight",
309
- "decoderLayer.31.multiHeadAttention.o.weight": "model.layers.31.self_attn.o_proj.weight",
310
- "decoderLayer.31.attnLayerNorm.weight": "model.layers.31.ln1.weight",
311
- "decoderLayer.31.feedForward.intermediateDense.weight": "model.layers.31.mlp.gate_proj.weight",
312
- "decoderLayer.31.feedForward.outputDense.weight": "model.layers.31.mlp.down_proj.weight",
313
- "decoderLayer.31.ffnLayerNorm.weight": "model.layers.31.ln2.weight",
314
- "decoderLayer.31.feedForward.intermediateDense2.weight": "model.layers.31.mlp.up_proj.weight"
315
- }
316
- }
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": 2,
4
+ "hidden_act": "silu",
5
+ "hidden_size": 4096,
6
+ "initializer_range": 0.02,
7
+ "intermediate_size": 11008,
8
+ "max_position_embeddings": 200000,
9
+ "model": "llama",
10
+ "num_attention_heads": 32,
11
+ "num_hidden_layers": 32,
12
+ "num_key_value_heads": 4,
13
+ "pad_token_id": 0,
14
+ "layer_norm_eps": 1e-05,
15
+ "rope_theta": 10000000.0,
16
+ "tie_word_embeddings": false,
17
+ "torch_dtype": "bfloat16",
18
+ "vocab_size": 64000,
19
+ "skip_init": true,
20
+ "rope_rank": "updown",
21
+ "segment_vocab_size": 0,
22
+ "generation_config": {
23
+ "tokenizer_decode_config": {
24
+ "skip_special_tokens": true
25
+ },
26
+ "max_length": 200000,
27
+ "eos_token_id": 2
28
+ },
29
+ "mapping": {
30
+ "embeddings.word_embeddings.weight": "model.embed_tokens.weight",
31
+ "LayerNormFinal.weight": "model.norm.weight",
32
+ "lm_head.weight": "lm_head.weight",
33
+ "decoderLayer.0.multiHeadAttention.q.weight": "model.layers.0.self_attn.q_proj.weight",
34
+ "decoderLayer.0.multiHeadAttention.k.weight": "model.layers.0.self_attn.k_proj.weight",
35
+ "decoderLayer.0.multiHeadAttention.v.weight": "model.layers.0.self_attn.v_proj.weight",
36
+ "decoderLayer.0.multiHeadAttention.o.weight": "model.layers.0.self_attn.o_proj.weight",
37
+ "decoderLayer.0.attnLayerNorm.weight": "model.layers.0.ln1.weight",
38
+ "decoderLayer.0.feedForward.intermediateDense.weight": "model.layers.0.mlp.gate_proj.weight",
39
+ "decoderLayer.0.feedForward.outputDense.weight": "model.layers.0.mlp.down_proj.weight",
40
+ "decoderLayer.0.ffnLayerNorm.weight": "model.layers.0.ln2.weight",
41
+ "decoderLayer.0.feedForward.intermediateDense2.weight": "model.layers.0.mlp.up_proj.weight",
42
+ "decoderLayer.1.multiHeadAttention.q.weight": "model.layers.1.self_attn.q_proj.weight",
43
+ "decoderLayer.1.multiHeadAttention.k.weight": "model.layers.1.self_attn.k_proj.weight",
44
+ "decoderLayer.1.multiHeadAttention.v.weight": "model.layers.1.self_attn.v_proj.weight",
45
+ "decoderLayer.1.multiHeadAttention.o.weight": "model.layers.1.self_attn.o_proj.weight",
46
+ "decoderLayer.1.attnLayerNorm.weight": "model.layers.1.ln1.weight",
47
+ "decoderLayer.1.feedForward.intermediateDense.weight": "model.layers.1.mlp.gate_proj.weight",
48
+ "decoderLayer.1.feedForward.outputDense.weight": "model.layers.1.mlp.down_proj.weight",
49
+ "decoderLayer.1.ffnLayerNorm.weight": "model.layers.1.ln2.weight",
50
+ "decoderLayer.1.feedForward.intermediateDense2.weight": "model.layers.1.mlp.up_proj.weight",
51
+ "decoderLayer.2.multiHeadAttention.q.weight": "model.layers.2.self_attn.q_proj.weight",
52
+ "decoderLayer.2.multiHeadAttention.k.weight": "model.layers.2.self_attn.k_proj.weight",
53
+ "decoderLayer.2.multiHeadAttention.v.weight": "model.layers.2.self_attn.v_proj.weight",
54
+ "decoderLayer.2.multiHeadAttention.o.weight": "model.layers.2.self_attn.o_proj.weight",
55
+ "decoderLayer.2.attnLayerNorm.weight": "model.layers.2.ln1.weight",
56
+ "decoderLayer.2.feedForward.intermediateDense.weight": "model.layers.2.mlp.gate_proj.weight",
57
+ "decoderLayer.2.feedForward.outputDense.weight": "model.layers.2.mlp.down_proj.weight",
58
+ "decoderLayer.2.ffnLayerNorm.weight": "model.layers.2.ln2.weight",
59
+ "decoderLayer.2.feedForward.intermediateDense2.weight": "model.layers.2.mlp.up_proj.weight",
60
+ "decoderLayer.3.multiHeadAttention.q.weight": "model.layers.3.self_attn.q_proj.weight",
61
+ "decoderLayer.3.multiHeadAttention.k.weight": "model.layers.3.self_attn.k_proj.weight",
62
+ "decoderLayer.3.multiHeadAttention.v.weight": "model.layers.3.self_attn.v_proj.weight",
63
+ "decoderLayer.3.multiHeadAttention.o.weight": "model.layers.3.self_attn.o_proj.weight",
64
+ "decoderLayer.3.attnLayerNorm.weight": "model.layers.3.ln1.weight",
65
+ "decoderLayer.3.feedForward.intermediateDense.weight": "model.layers.3.mlp.gate_proj.weight",
66
+ "decoderLayer.3.feedForward.outputDense.weight": "model.layers.3.mlp.down_proj.weight",
67
+ "decoderLayer.3.ffnLayerNorm.weight": "model.layers.3.ln2.weight",
68
+ "decoderLayer.3.feedForward.intermediateDense2.weight": "model.layers.3.mlp.up_proj.weight",
69
+ "decoderLayer.4.multiHeadAttention.q.weight": "model.layers.4.self_attn.q_proj.weight",
70
+ "decoderLayer.4.multiHeadAttention.k.weight": "model.layers.4.self_attn.k_proj.weight",
71
+ "decoderLayer.4.multiHeadAttention.v.weight": "model.layers.4.self_attn.v_proj.weight",
72
+ "decoderLayer.4.multiHeadAttention.o.weight": "model.layers.4.self_attn.o_proj.weight",
73
+ "decoderLayer.4.attnLayerNorm.weight": "model.layers.4.ln1.weight",
74
+ "decoderLayer.4.feedForward.intermediateDense.weight": "model.layers.4.mlp.gate_proj.weight",
75
+ "decoderLayer.4.feedForward.outputDense.weight": "model.layers.4.mlp.down_proj.weight",
76
+ "decoderLayer.4.ffnLayerNorm.weight": "model.layers.4.ln2.weight",
77
+ "decoderLayer.4.feedForward.intermediateDense2.weight": "model.layers.4.mlp.up_proj.weight",
78
+ "decoderLayer.5.multiHeadAttention.q.weight": "model.layers.5.self_attn.q_proj.weight",
79
+ "decoderLayer.5.multiHeadAttention.k.weight": "model.layers.5.self_attn.k_proj.weight",
80
+ "decoderLayer.5.multiHeadAttention.v.weight": "model.layers.5.self_attn.v_proj.weight",
81
+ "decoderLayer.5.multiHeadAttention.o.weight": "model.layers.5.self_attn.o_proj.weight",
82
+ "decoderLayer.5.attnLayerNorm.weight": "model.layers.5.ln1.weight",
83
+ "decoderLayer.5.feedForward.intermediateDense.weight": "model.layers.5.mlp.gate_proj.weight",
84
+ "decoderLayer.5.feedForward.outputDense.weight": "model.layers.5.mlp.down_proj.weight",
85
+ "decoderLayer.5.ffnLayerNorm.weight": "model.layers.5.ln2.weight",
86
+ "decoderLayer.5.feedForward.intermediateDense2.weight": "model.layers.5.mlp.up_proj.weight",
87
+ "decoderLayer.6.multiHeadAttention.q.weight": "model.layers.6.self_attn.q_proj.weight",
88
+ "decoderLayer.6.multiHeadAttention.k.weight": "model.layers.6.self_attn.k_proj.weight",
89
+ "decoderLayer.6.multiHeadAttention.v.weight": "model.layers.6.self_attn.v_proj.weight",
90
+ "decoderLayer.6.multiHeadAttention.o.weight": "model.layers.6.self_attn.o_proj.weight",
91
+ "decoderLayer.6.attnLayerNorm.weight": "model.layers.6.ln1.weight",
92
+ "decoderLayer.6.feedForward.intermediateDense.weight": "model.layers.6.mlp.gate_proj.weight",
93
+ "decoderLayer.6.feedForward.outputDense.weight": "model.layers.6.mlp.down_proj.weight",
94
+ "decoderLayer.6.ffnLayerNorm.weight": "model.layers.6.ln2.weight",
95
+ "decoderLayer.6.feedForward.intermediateDense2.weight": "model.layers.6.mlp.up_proj.weight",
96
+ "decoderLayer.7.multiHeadAttention.q.weight": "model.layers.7.self_attn.q_proj.weight",
97
+ "decoderLayer.7.multiHeadAttention.k.weight": "model.layers.7.self_attn.k_proj.weight",
98
+ "decoderLayer.7.multiHeadAttention.v.weight": "model.layers.7.self_attn.v_proj.weight",
99
+ "decoderLayer.7.multiHeadAttention.o.weight": "model.layers.7.self_attn.o_proj.weight",
100
+ "decoderLayer.7.attnLayerNorm.weight": "model.layers.7.ln1.weight",
101
+ "decoderLayer.7.feedForward.intermediateDense.weight": "model.layers.7.mlp.gate_proj.weight",
102
+ "decoderLayer.7.feedForward.outputDense.weight": "model.layers.7.mlp.down_proj.weight",
103
+ "decoderLayer.7.ffnLayerNorm.weight": "model.layers.7.ln2.weight",
104
+ "decoderLayer.7.feedForward.intermediateDense2.weight": "model.layers.7.mlp.up_proj.weight",
105
+ "decoderLayer.8.multiHeadAttention.q.weight": "model.layers.8.self_attn.q_proj.weight",
106
+ "decoderLayer.8.multiHeadAttention.k.weight": "model.layers.8.self_attn.k_proj.weight",
107
+ "decoderLayer.8.multiHeadAttention.v.weight": "model.layers.8.self_attn.v_proj.weight",
108
+ "decoderLayer.8.multiHeadAttention.o.weight": "model.layers.8.self_attn.o_proj.weight",
109
+ "decoderLayer.8.attnLayerNorm.weight": "model.layers.8.ln1.weight",
110
+ "decoderLayer.8.feedForward.intermediateDense.weight": "model.layers.8.mlp.gate_proj.weight",
111
+ "decoderLayer.8.feedForward.outputDense.weight": "model.layers.8.mlp.down_proj.weight",
112
+ "decoderLayer.8.ffnLayerNorm.weight": "model.layers.8.ln2.weight",
113
+ "decoderLayer.8.feedForward.intermediateDense2.weight": "model.layers.8.mlp.up_proj.weight",
114
+ "decoderLayer.9.multiHeadAttention.q.weight": "model.layers.9.self_attn.q_proj.weight",
115
+ "decoderLayer.9.multiHeadAttention.k.weight": "model.layers.9.self_attn.k_proj.weight",
116
+ "decoderLayer.9.multiHeadAttention.v.weight": "model.layers.9.self_attn.v_proj.weight",
117
+ "decoderLayer.9.multiHeadAttention.o.weight": "model.layers.9.self_attn.o_proj.weight",
118
+ "decoderLayer.9.attnLayerNorm.weight": "model.layers.9.ln1.weight",
119
+ "decoderLayer.9.feedForward.intermediateDense.weight": "model.layers.9.mlp.gate_proj.weight",
120
+ "decoderLayer.9.feedForward.outputDense.weight": "model.layers.9.mlp.down_proj.weight",
121
+ "decoderLayer.9.ffnLayerNorm.weight": "model.layers.9.ln2.weight",
122
+ "decoderLayer.9.feedForward.intermediateDense2.weight": "model.layers.9.mlp.up_proj.weight",
123
+ "decoderLayer.10.multiHeadAttention.q.weight": "model.layers.10.self_attn.q_proj.weight",
124
+ "decoderLayer.10.multiHeadAttention.k.weight": "model.layers.10.self_attn.k_proj.weight",
125
+ "decoderLayer.10.multiHeadAttention.v.weight": "model.layers.10.self_attn.v_proj.weight",
126
+ "decoderLayer.10.multiHeadAttention.o.weight": "model.layers.10.self_attn.o_proj.weight",
127
+ "decoderLayer.10.attnLayerNorm.weight": "model.layers.10.ln1.weight",
128
+ "decoderLayer.10.feedForward.intermediateDense.weight": "model.layers.10.mlp.gate_proj.weight",
129
+ "decoderLayer.10.feedForward.outputDense.weight": "model.layers.10.mlp.down_proj.weight",
130
+ "decoderLayer.10.ffnLayerNorm.weight": "model.layers.10.ln2.weight",
131
+ "decoderLayer.10.feedForward.intermediateDense2.weight": "model.layers.10.mlp.up_proj.weight",
132
+ "decoderLayer.11.multiHeadAttention.q.weight": "model.layers.11.self_attn.q_proj.weight",
133
+ "decoderLayer.11.multiHeadAttention.k.weight": "model.layers.11.self_attn.k_proj.weight",
134
+ "decoderLayer.11.multiHeadAttention.v.weight": "model.layers.11.self_attn.v_proj.weight",
135
+ "decoderLayer.11.multiHeadAttention.o.weight": "model.layers.11.self_attn.o_proj.weight",
136
+ "decoderLayer.11.attnLayerNorm.weight": "model.layers.11.ln1.weight",
137
+ "decoderLayer.11.feedForward.intermediateDense.weight": "model.layers.11.mlp.gate_proj.weight",
138
+ "decoderLayer.11.feedForward.outputDense.weight": "model.layers.11.mlp.down_proj.weight",
139
+ "decoderLayer.11.ffnLayerNorm.weight": "model.layers.11.ln2.weight",
140
+ "decoderLayer.11.feedForward.intermediateDense2.weight": "model.layers.11.mlp.up_proj.weight",
141
+ "decoderLayer.12.multiHeadAttention.q.weight": "model.layers.12.self_attn.q_proj.weight",
142
+ "decoderLayer.12.multiHeadAttention.k.weight": "model.layers.12.self_attn.k_proj.weight",
143
+ "decoderLayer.12.multiHeadAttention.v.weight": "model.layers.12.self_attn.v_proj.weight",
144
+ "decoderLayer.12.multiHeadAttention.o.weight": "model.layers.12.self_attn.o_proj.weight",
145
+ "decoderLayer.12.attnLayerNorm.weight": "model.layers.12.ln1.weight",
146
+ "decoderLayer.12.feedForward.intermediateDense.weight": "model.layers.12.mlp.gate_proj.weight",
147
+ "decoderLayer.12.feedForward.outputDense.weight": "model.layers.12.mlp.down_proj.weight",
148
+ "decoderLayer.12.ffnLayerNorm.weight": "model.layers.12.ln2.weight",
149
+ "decoderLayer.12.feedForward.intermediateDense2.weight": "model.layers.12.mlp.up_proj.weight",
150
+ "decoderLayer.13.multiHeadAttention.q.weight": "model.layers.13.self_attn.q_proj.weight",
151
+ "decoderLayer.13.multiHeadAttention.k.weight": "model.layers.13.self_attn.k_proj.weight",
152
+ "decoderLayer.13.multiHeadAttention.v.weight": "model.layers.13.self_attn.v_proj.weight",
153
+ "decoderLayer.13.multiHeadAttention.o.weight": "model.layers.13.self_attn.o_proj.weight",
154
+ "decoderLayer.13.attnLayerNorm.weight": "model.layers.13.ln1.weight",
155
+ "decoderLayer.13.feedForward.intermediateDense.weight": "model.layers.13.mlp.gate_proj.weight",
156
+ "decoderLayer.13.feedForward.outputDense.weight": "model.layers.13.mlp.down_proj.weight",
157
+ "decoderLayer.13.ffnLayerNorm.weight": "model.layers.13.ln2.weight",
158
+ "decoderLayer.13.feedForward.intermediateDense2.weight": "model.layers.13.mlp.up_proj.weight",
159
+ "decoderLayer.14.multiHeadAttention.q.weight": "model.layers.14.self_attn.q_proj.weight",
160
+ "decoderLayer.14.multiHeadAttention.k.weight": "model.layers.14.self_attn.k_proj.weight",
161
+ "decoderLayer.14.multiHeadAttention.v.weight": "model.layers.14.self_attn.v_proj.weight",
162
+ "decoderLayer.14.multiHeadAttention.o.weight": "model.layers.14.self_attn.o_proj.weight",
163
+ "decoderLayer.14.attnLayerNorm.weight": "model.layers.14.ln1.weight",
164
+ "decoderLayer.14.feedForward.intermediateDense.weight": "model.layers.14.mlp.gate_proj.weight",
165
+ "decoderLayer.14.feedForward.outputDense.weight": "model.layers.14.mlp.down_proj.weight",
166
+ "decoderLayer.14.ffnLayerNorm.weight": "model.layers.14.ln2.weight",
167
+ "decoderLayer.14.feedForward.intermediateDense2.weight": "model.layers.14.mlp.up_proj.weight",
168
+ "decoderLayer.15.multiHeadAttention.q.weight": "model.layers.15.self_attn.q_proj.weight",
169
+ "decoderLayer.15.multiHeadAttention.k.weight": "model.layers.15.self_attn.k_proj.weight",
170
+ "decoderLayer.15.multiHeadAttention.v.weight": "model.layers.15.self_attn.v_proj.weight",
171
+ "decoderLayer.15.multiHeadAttention.o.weight": "model.layers.15.self_attn.o_proj.weight",
172
+ "decoderLayer.15.attnLayerNorm.weight": "model.layers.15.ln1.weight",
173
+ "decoderLayer.15.feedForward.intermediateDense.weight": "model.layers.15.mlp.gate_proj.weight",
174
+ "decoderLayer.15.feedForward.outputDense.weight": "model.layers.15.mlp.down_proj.weight",
175
+ "decoderLayer.15.ffnLayerNorm.weight": "model.layers.15.ln2.weight",
176
+ "decoderLayer.15.feedForward.intermediateDense2.weight": "model.layers.15.mlp.up_proj.weight",
177
+ "decoderLayer.16.multiHeadAttention.q.weight": "model.layers.16.self_attn.q_proj.weight",
178
+ "decoderLayer.16.multiHeadAttention.k.weight": "model.layers.16.self_attn.k_proj.weight",
179
+ "decoderLayer.16.multiHeadAttention.v.weight": "model.layers.16.self_attn.v_proj.weight",
180
+ "decoderLayer.16.multiHeadAttention.o.weight": "model.layers.16.self_attn.o_proj.weight",
181
+ "decoderLayer.16.attnLayerNorm.weight": "model.layers.16.ln1.weight",
182
+ "decoderLayer.16.feedForward.intermediateDense.weight": "model.layers.16.mlp.gate_proj.weight",
183
+ "decoderLayer.16.feedForward.outputDense.weight": "model.layers.16.mlp.down_proj.weight",
184
+ "decoderLayer.16.ffnLayerNorm.weight": "model.layers.16.ln2.weight",
185
+ "decoderLayer.16.feedForward.intermediateDense2.weight": "model.layers.16.mlp.up_proj.weight",
186
+ "decoderLayer.17.multiHeadAttention.q.weight": "model.layers.17.self_attn.q_proj.weight",
187
+ "decoderLayer.17.multiHeadAttention.k.weight": "model.layers.17.self_attn.k_proj.weight",
188
+ "decoderLayer.17.multiHeadAttention.v.weight": "model.layers.17.self_attn.v_proj.weight",
189
+ "decoderLayer.17.multiHeadAttention.o.weight": "model.layers.17.self_attn.o_proj.weight",
190
+ "decoderLayer.17.attnLayerNorm.weight": "model.layers.17.ln1.weight",
191
+ "decoderLayer.17.feedForward.intermediateDense.weight": "model.layers.17.mlp.gate_proj.weight",
192
+ "decoderLayer.17.feedForward.outputDense.weight": "model.layers.17.mlp.down_proj.weight",
193
+ "decoderLayer.17.ffnLayerNorm.weight": "model.layers.17.ln2.weight",
194
+ "decoderLayer.17.feedForward.intermediateDense2.weight": "model.layers.17.mlp.up_proj.weight",
195
+ "decoderLayer.18.multiHeadAttention.q.weight": "model.layers.18.self_attn.q_proj.weight",
196
+ "decoderLayer.18.multiHeadAttention.k.weight": "model.layers.18.self_attn.k_proj.weight",
197
+ "decoderLayer.18.multiHeadAttention.v.weight": "model.layers.18.self_attn.v_proj.weight",
198
+ "decoderLayer.18.multiHeadAttention.o.weight": "model.layers.18.self_attn.o_proj.weight",
199
+ "decoderLayer.18.attnLayerNorm.weight": "model.layers.18.ln1.weight",
200
+ "decoderLayer.18.feedForward.intermediateDense.weight": "model.layers.18.mlp.gate_proj.weight",
201
+ "decoderLayer.18.feedForward.outputDense.weight": "model.layers.18.mlp.down_proj.weight",
202
+ "decoderLayer.18.ffnLayerNorm.weight": "model.layers.18.ln2.weight",
203
+ "decoderLayer.18.feedForward.intermediateDense2.weight": "model.layers.18.mlp.up_proj.weight",
204
+ "decoderLayer.19.multiHeadAttention.q.weight": "model.layers.19.self_attn.q_proj.weight",
205
+ "decoderLayer.19.multiHeadAttention.k.weight": "model.layers.19.self_attn.k_proj.weight",
206
+ "decoderLayer.19.multiHeadAttention.v.weight": "model.layers.19.self_attn.v_proj.weight",
207
+ "decoderLayer.19.multiHeadAttention.o.weight": "model.layers.19.self_attn.o_proj.weight",
208
+ "decoderLayer.19.attnLayerNorm.weight": "model.layers.19.ln1.weight",
209
+ "decoderLayer.19.feedForward.intermediateDense.weight": "model.layers.19.mlp.gate_proj.weight",
210
+ "decoderLayer.19.feedForward.outputDense.weight": "model.layers.19.mlp.down_proj.weight",
211
+ "decoderLayer.19.ffnLayerNorm.weight": "model.layers.19.ln2.weight",
212
+ "decoderLayer.19.feedForward.intermediateDense2.weight": "model.layers.19.mlp.up_proj.weight",
213
+ "decoderLayer.20.multiHeadAttention.q.weight": "model.layers.20.self_attn.q_proj.weight",
214
+ "decoderLayer.20.multiHeadAttention.k.weight": "model.layers.20.self_attn.k_proj.weight",
215
+ "decoderLayer.20.multiHeadAttention.v.weight": "model.layers.20.self_attn.v_proj.weight",
216
+ "decoderLayer.20.multiHeadAttention.o.weight": "model.layers.20.self_attn.o_proj.weight",
217
+ "decoderLayer.20.attnLayerNorm.weight": "model.layers.20.ln1.weight",
218
+ "decoderLayer.20.feedForward.intermediateDense.weight": "model.layers.20.mlp.gate_proj.weight",
219
+ "decoderLayer.20.feedForward.outputDense.weight": "model.layers.20.mlp.down_proj.weight",
220
+ "decoderLayer.20.ffnLayerNorm.weight": "model.layers.20.ln2.weight",
221
+ "decoderLayer.20.feedForward.intermediateDense2.weight": "model.layers.20.mlp.up_proj.weight",
222
+ "decoderLayer.21.multiHeadAttention.q.weight": "model.layers.21.self_attn.q_proj.weight",
223
+ "decoderLayer.21.multiHeadAttention.k.weight": "model.layers.21.self_attn.k_proj.weight",
224
+ "decoderLayer.21.multiHeadAttention.v.weight": "model.layers.21.self_attn.v_proj.weight",
225
+ "decoderLayer.21.multiHeadAttention.o.weight": "model.layers.21.self_attn.o_proj.weight",
226
+ "decoderLayer.21.attnLayerNorm.weight": "model.layers.21.ln1.weight",
227
+ "decoderLayer.21.feedForward.intermediateDense.weight": "model.layers.21.mlp.gate_proj.weight",
228
+ "decoderLayer.21.feedForward.outputDense.weight": "model.layers.21.mlp.down_proj.weight",
229
+ "decoderLayer.21.ffnLayerNorm.weight": "model.layers.21.ln2.weight",
230
+ "decoderLayer.21.feedForward.intermediateDense2.weight": "model.layers.21.mlp.up_proj.weight",
231
+ "decoderLayer.22.multiHeadAttention.q.weight": "model.layers.22.self_attn.q_proj.weight",
232
+ "decoderLayer.22.multiHeadAttention.k.weight": "model.layers.22.self_attn.k_proj.weight",
233
+ "decoderLayer.22.multiHeadAttention.v.weight": "model.layers.22.self_attn.v_proj.weight",
234
+ "decoderLayer.22.multiHeadAttention.o.weight": "model.layers.22.self_attn.o_proj.weight",
235
+ "decoderLayer.22.attnLayerNorm.weight": "model.layers.22.ln1.weight",
236
+ "decoderLayer.22.feedForward.intermediateDense.weight": "model.layers.22.mlp.gate_proj.weight",
237
+ "decoderLayer.22.feedForward.outputDense.weight": "model.layers.22.mlp.down_proj.weight",
238
+ "decoderLayer.22.ffnLayerNorm.weight": "model.layers.22.ln2.weight",
239
+ "decoderLayer.22.feedForward.intermediateDense2.weight": "model.layers.22.mlp.up_proj.weight",
240
+ "decoderLayer.23.multiHeadAttention.q.weight": "model.layers.23.self_attn.q_proj.weight",
241
+ "decoderLayer.23.multiHeadAttention.k.weight": "model.layers.23.self_attn.k_proj.weight",
242
+ "decoderLayer.23.multiHeadAttention.v.weight": "model.layers.23.self_attn.v_proj.weight",
243
+ "decoderLayer.23.multiHeadAttention.o.weight": "model.layers.23.self_attn.o_proj.weight",
244
+ "decoderLayer.23.attnLayerNorm.weight": "model.layers.23.ln1.weight",
245
+ "decoderLayer.23.feedForward.intermediateDense.weight": "model.layers.23.mlp.gate_proj.weight",
246
+ "decoderLayer.23.feedForward.outputDense.weight": "model.layers.23.mlp.down_proj.weight",
247
+ "decoderLayer.23.ffnLayerNorm.weight": "model.layers.23.ln2.weight",
248
+ "decoderLayer.23.feedForward.intermediateDense2.weight": "model.layers.23.mlp.up_proj.weight",
249
+ "decoderLayer.24.multiHeadAttention.q.weight": "model.layers.24.self_attn.q_proj.weight",
250
+ "decoderLayer.24.multiHeadAttention.k.weight": "model.layers.24.self_attn.k_proj.weight",
251
+ "decoderLayer.24.multiHeadAttention.v.weight": "model.layers.24.self_attn.v_proj.weight",
252
+ "decoderLayer.24.multiHeadAttention.o.weight": "model.layers.24.self_attn.o_proj.weight",
253
+ "decoderLayer.24.attnLayerNorm.weight": "model.layers.24.ln1.weight",
254
+ "decoderLayer.24.feedForward.intermediateDense.weight": "model.layers.24.mlp.gate_proj.weight",
255
+ "decoderLayer.24.feedForward.outputDense.weight": "model.layers.24.mlp.down_proj.weight",
256
+ "decoderLayer.24.ffnLayerNorm.weight": "model.layers.24.ln2.weight",
257
+ "decoderLayer.24.feedForward.intermediateDense2.weight": "model.layers.24.mlp.up_proj.weight",
258
+ "decoderLayer.25.multiHeadAttention.q.weight": "model.layers.25.self_attn.q_proj.weight",
259
+ "decoderLayer.25.multiHeadAttention.k.weight": "model.layers.25.self_attn.k_proj.weight",
260
+ "decoderLayer.25.multiHeadAttention.v.weight": "model.layers.25.self_attn.v_proj.weight",
261
+ "decoderLayer.25.multiHeadAttention.o.weight": "model.layers.25.self_attn.o_proj.weight",
262
+ "decoderLayer.25.attnLayerNorm.weight": "model.layers.25.ln1.weight",
263
+ "decoderLayer.25.feedForward.intermediateDense.weight": "model.layers.25.mlp.gate_proj.weight",
264
+ "decoderLayer.25.feedForward.outputDense.weight": "model.layers.25.mlp.down_proj.weight",
265
+ "decoderLayer.25.ffnLayerNorm.weight": "model.layers.25.ln2.weight",
266
+ "decoderLayer.25.feedForward.intermediateDense2.weight": "model.layers.25.mlp.up_proj.weight",
267
+ "decoderLayer.26.multiHeadAttention.q.weight": "model.layers.26.self_attn.q_proj.weight",
268
+ "decoderLayer.26.multiHeadAttention.k.weight": "model.layers.26.self_attn.k_proj.weight",
269
+ "decoderLayer.26.multiHeadAttention.v.weight": "model.layers.26.self_attn.v_proj.weight",
270
+ "decoderLayer.26.multiHeadAttention.o.weight": "model.layers.26.self_attn.o_proj.weight",
271
+ "decoderLayer.26.attnLayerNorm.weight": "model.layers.26.ln1.weight",
272
+ "decoderLayer.26.feedForward.intermediateDense.weight": "model.layers.26.mlp.gate_proj.weight",
273
+ "decoderLayer.26.feedForward.outputDense.weight": "model.layers.26.mlp.down_proj.weight",
274
+ "decoderLayer.26.ffnLayerNorm.weight": "model.layers.26.ln2.weight",
275
+ "decoderLayer.26.feedForward.intermediateDense2.weight": "model.layers.26.mlp.up_proj.weight",
276
+ "decoderLayer.27.multiHeadAttention.q.weight": "model.layers.27.self_attn.q_proj.weight",
277
+ "decoderLayer.27.multiHeadAttention.k.weight": "model.layers.27.self_attn.k_proj.weight",
278
+ "decoderLayer.27.multiHeadAttention.v.weight": "model.layers.27.self_attn.v_proj.weight",
279
+ "decoderLayer.27.multiHeadAttention.o.weight": "model.layers.27.self_attn.o_proj.weight",
280
+ "decoderLayer.27.attnLayerNorm.weight": "model.layers.27.ln1.weight",
281
+ "decoderLayer.27.feedForward.intermediateDense.weight": "model.layers.27.mlp.gate_proj.weight",
282
+ "decoderLayer.27.feedForward.outputDense.weight": "model.layers.27.mlp.down_proj.weight",
283
+ "decoderLayer.27.ffnLayerNorm.weight": "model.layers.27.ln2.weight",
284
+ "decoderLayer.27.feedForward.intermediateDense2.weight": "model.layers.27.mlp.up_proj.weight",
285
+ "decoderLayer.28.multiHeadAttention.q.weight": "model.layers.28.self_attn.q_proj.weight",
286
+ "decoderLayer.28.multiHeadAttention.k.weight": "model.layers.28.self_attn.k_proj.weight",
287
+ "decoderLayer.28.multiHeadAttention.v.weight": "model.layers.28.self_attn.v_proj.weight",
288
+ "decoderLayer.28.multiHeadAttention.o.weight": "model.layers.28.self_attn.o_proj.weight",
289
+ "decoderLayer.28.attnLayerNorm.weight": "model.layers.28.ln1.weight",
290
+ "decoderLayer.28.feedForward.intermediateDense.weight": "model.layers.28.mlp.gate_proj.weight",
291
+ "decoderLayer.28.feedForward.outputDense.weight": "model.layers.28.mlp.down_proj.weight",
292
+ "decoderLayer.28.ffnLayerNorm.weight": "model.layers.28.ln2.weight",
293
+ "decoderLayer.28.feedForward.intermediateDense2.weight": "model.layers.28.mlp.up_proj.weight",
294
+ "decoderLayer.29.multiHeadAttention.q.weight": "model.layers.29.self_attn.q_proj.weight",
295
+ "decoderLayer.29.multiHeadAttention.k.weight": "model.layers.29.self_attn.k_proj.weight",
296
+ "decoderLayer.29.multiHeadAttention.v.weight": "model.layers.29.self_attn.v_proj.weight",
297
+ "decoderLayer.29.multiHeadAttention.o.weight": "model.layers.29.self_attn.o_proj.weight",
298
+ "decoderLayer.29.attnLayerNorm.weight": "model.layers.29.ln1.weight",
299
+ "decoderLayer.29.feedForward.intermediateDense.weight": "model.layers.29.mlp.gate_proj.weight",
300
+ "decoderLayer.29.feedForward.outputDense.weight": "model.layers.29.mlp.down_proj.weight",
301
+ "decoderLayer.29.ffnLayerNorm.weight": "model.layers.29.ln2.weight",
302
+ "decoderLayer.29.feedForward.intermediateDense2.weight": "model.layers.29.mlp.up_proj.weight",
303
+ "decoderLayer.30.multiHeadAttention.q.weight": "model.layers.30.self_attn.q_proj.weight",
304
+ "decoderLayer.30.multiHeadAttention.k.weight": "model.layers.30.self_attn.k_proj.weight",
305
+ "decoderLayer.30.multiHeadAttention.v.weight": "model.layers.30.self_attn.v_proj.weight",
306
+ "decoderLayer.30.multiHeadAttention.o.weight": "model.layers.30.self_attn.o_proj.weight",
307
+ "decoderLayer.30.attnLayerNorm.weight": "model.layers.30.ln1.weight",
308
+ "decoderLayer.30.feedForward.intermediateDense.weight": "model.layers.30.mlp.gate_proj.weight",
309
+ "decoderLayer.30.feedForward.outputDense.weight": "model.layers.30.mlp.down_proj.weight",
310
+ "decoderLayer.30.ffnLayerNorm.weight": "model.layers.30.ln2.weight",
311
+ "decoderLayer.30.feedForward.intermediateDense2.weight": "model.layers.30.mlp.up_proj.weight",
312
+ "decoderLayer.31.multiHeadAttention.q.weight": "model.layers.31.self_attn.q_proj.weight",
313
+ "decoderLayer.31.multiHeadAttention.k.weight": "model.layers.31.self_attn.k_proj.weight",
314
+ "decoderLayer.31.multiHeadAttention.v.weight": "model.layers.31.self_attn.v_proj.weight",
315
+ "decoderLayer.31.multiHeadAttention.o.weight": "model.layers.31.self_attn.o_proj.weight",
316
+ "decoderLayer.31.attnLayerNorm.weight": "model.layers.31.ln1.weight",
317
+ "decoderLayer.31.feedForward.intermediateDense.weight": "model.layers.31.mlp.gate_proj.weight",
318
+ "decoderLayer.31.feedForward.outputDense.weight": "model.layers.31.mlp.down_proj.weight",
319
+ "decoderLayer.31.ffnLayerNorm.weight": "model.layers.31.ln2.weight",
320
+ "decoderLayer.31.feedForward.intermediateDense2.weight": "model.layers.31.mlp.up_proj.weight"
321
+ }
322
+ }
01-ai/Yi-6B/bert4torch_config.json CHANGED
@@ -1,316 +1,322 @@
1
- {
2
- "bos_token_id": 1,
3
- "eos_token_id": 2,
4
- "hidden_act": "silu",
5
- "hidden_size": 4096,
6
- "initializer_range": 0.02,
7
- "intermediate_size": 11008,
8
- "max_position_embeddings": 4096,
9
- "model": "llama",
10
- "num_attention_heads": 32,
11
- "num_hidden_layers": 32,
12
- "num_key_value_heads": 4,
13
- "pad_token_id": 0,
14
- "layer_norm_eps": 1e-05,
15
- "rope_theta": 5000000.0,
16
- "tie_word_embeddings": false,
17
- "torch_dtype": "bfloat16",
18
- "vocab_size": 64000,
19
- "skip_init": true,
20
- "rope_rank": "updown",
21
- "segment_vocab_size": 0,
22
- "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 4096, "eos_token_id": 2},
23
- "mapping": {
24
- "embeddings.word_embeddings.weight": "model.embed_tokens.weight",
25
- "LayerNormFinal.weight": "model.norm.weight",
26
- "lm_head.weight": "lm_head.weight",
27
- "decoderLayer.0.multiHeadAttention.q.weight": "model.layers.0.self_attn.q_proj.weight",
28
- "decoderLayer.0.multiHeadAttention.k.weight": "model.layers.0.self_attn.k_proj.weight",
29
- "decoderLayer.0.multiHeadAttention.v.weight": "model.layers.0.self_attn.v_proj.weight",
30
- "decoderLayer.0.multiHeadAttention.o.weight": "model.layers.0.self_attn.o_proj.weight",
31
- "decoderLayer.0.attnLayerNorm.weight": "model.layers.0.ln1.weight",
32
- "decoderLayer.0.feedForward.intermediateDense.weight": "model.layers.0.mlp.gate_proj.weight",
33
- "decoderLayer.0.feedForward.outputDense.weight": "model.layers.0.mlp.down_proj.weight",
34
- "decoderLayer.0.ffnLayerNorm.weight": "model.layers.0.ln2.weight",
35
- "decoderLayer.0.feedForward.intermediateDense2.weight": "model.layers.0.mlp.up_proj.weight",
36
- "decoderLayer.1.multiHeadAttention.q.weight": "model.layers.1.self_attn.q_proj.weight",
37
- "decoderLayer.1.multiHeadAttention.k.weight": "model.layers.1.self_attn.k_proj.weight",
38
- "decoderLayer.1.multiHeadAttention.v.weight": "model.layers.1.self_attn.v_proj.weight",
39
- "decoderLayer.1.multiHeadAttention.o.weight": "model.layers.1.self_attn.o_proj.weight",
40
- "decoderLayer.1.attnLayerNorm.weight": "model.layers.1.ln1.weight",
41
- "decoderLayer.1.feedForward.intermediateDense.weight": "model.layers.1.mlp.gate_proj.weight",
42
- "decoderLayer.1.feedForward.outputDense.weight": "model.layers.1.mlp.down_proj.weight",
43
- "decoderLayer.1.ffnLayerNorm.weight": "model.layers.1.ln2.weight",
44
- "decoderLayer.1.feedForward.intermediateDense2.weight": "model.layers.1.mlp.up_proj.weight",
45
- "decoderLayer.2.multiHeadAttention.q.weight": "model.layers.2.self_attn.q_proj.weight",
46
- "decoderLayer.2.multiHeadAttention.k.weight": "model.layers.2.self_attn.k_proj.weight",
47
- "decoderLayer.2.multiHeadAttention.v.weight": "model.layers.2.self_attn.v_proj.weight",
48
- "decoderLayer.2.multiHeadAttention.o.weight": "model.layers.2.self_attn.o_proj.weight",
49
- "decoderLayer.2.attnLayerNorm.weight": "model.layers.2.ln1.weight",
50
- "decoderLayer.2.feedForward.intermediateDense.weight": "model.layers.2.mlp.gate_proj.weight",
51
- "decoderLayer.2.feedForward.outputDense.weight": "model.layers.2.mlp.down_proj.weight",
52
- "decoderLayer.2.ffnLayerNorm.weight": "model.layers.2.ln2.weight",
53
- "decoderLayer.2.feedForward.intermediateDense2.weight": "model.layers.2.mlp.up_proj.weight",
54
- "decoderLayer.3.multiHeadAttention.q.weight": "model.layers.3.self_attn.q_proj.weight",
55
- "decoderLayer.3.multiHeadAttention.k.weight": "model.layers.3.self_attn.k_proj.weight",
56
- "decoderLayer.3.multiHeadAttention.v.weight": "model.layers.3.self_attn.v_proj.weight",
57
- "decoderLayer.3.multiHeadAttention.o.weight": "model.layers.3.self_attn.o_proj.weight",
58
- "decoderLayer.3.attnLayerNorm.weight": "model.layers.3.ln1.weight",
59
- "decoderLayer.3.feedForward.intermediateDense.weight": "model.layers.3.mlp.gate_proj.weight",
60
- "decoderLayer.3.feedForward.outputDense.weight": "model.layers.3.mlp.down_proj.weight",
61
- "decoderLayer.3.ffnLayerNorm.weight": "model.layers.3.ln2.weight",
62
- "decoderLayer.3.feedForward.intermediateDense2.weight": "model.layers.3.mlp.up_proj.weight",
63
- "decoderLayer.4.multiHeadAttention.q.weight": "model.layers.4.self_attn.q_proj.weight",
64
- "decoderLayer.4.multiHeadAttention.k.weight": "model.layers.4.self_attn.k_proj.weight",
65
- "decoderLayer.4.multiHeadAttention.v.weight": "model.layers.4.self_attn.v_proj.weight",
66
- "decoderLayer.4.multiHeadAttention.o.weight": "model.layers.4.self_attn.o_proj.weight",
67
- "decoderLayer.4.attnLayerNorm.weight": "model.layers.4.ln1.weight",
68
- "decoderLayer.4.feedForward.intermediateDense.weight": "model.layers.4.mlp.gate_proj.weight",
69
- "decoderLayer.4.feedForward.outputDense.weight": "model.layers.4.mlp.down_proj.weight",
70
- "decoderLayer.4.ffnLayerNorm.weight": "model.layers.4.ln2.weight",
71
- "decoderLayer.4.feedForward.intermediateDense2.weight": "model.layers.4.mlp.up_proj.weight",
72
- "decoderLayer.5.multiHeadAttention.q.weight": "model.layers.5.self_attn.q_proj.weight",
73
- "decoderLayer.5.multiHeadAttention.k.weight": "model.layers.5.self_attn.k_proj.weight",
74
- "decoderLayer.5.multiHeadAttention.v.weight": "model.layers.5.self_attn.v_proj.weight",
75
- "decoderLayer.5.multiHeadAttention.o.weight": "model.layers.5.self_attn.o_proj.weight",
76
- "decoderLayer.5.attnLayerNorm.weight": "model.layers.5.ln1.weight",
77
- "decoderLayer.5.feedForward.intermediateDense.weight": "model.layers.5.mlp.gate_proj.weight",
78
- "decoderLayer.5.feedForward.outputDense.weight": "model.layers.5.mlp.down_proj.weight",
79
- "decoderLayer.5.ffnLayerNorm.weight": "model.layers.5.ln2.weight",
80
- "decoderLayer.5.feedForward.intermediateDense2.weight": "model.layers.5.mlp.up_proj.weight",
81
- "decoderLayer.6.multiHeadAttention.q.weight": "model.layers.6.self_attn.q_proj.weight",
82
- "decoderLayer.6.multiHeadAttention.k.weight": "model.layers.6.self_attn.k_proj.weight",
83
- "decoderLayer.6.multiHeadAttention.v.weight": "model.layers.6.self_attn.v_proj.weight",
84
- "decoderLayer.6.multiHeadAttention.o.weight": "model.layers.6.self_attn.o_proj.weight",
85
- "decoderLayer.6.attnLayerNorm.weight": "model.layers.6.ln1.weight",
86
- "decoderLayer.6.feedForward.intermediateDense.weight": "model.layers.6.mlp.gate_proj.weight",
87
- "decoderLayer.6.feedForward.outputDense.weight": "model.layers.6.mlp.down_proj.weight",
88
- "decoderLayer.6.ffnLayerNorm.weight": "model.layers.6.ln2.weight",
89
- "decoderLayer.6.feedForward.intermediateDense2.weight": "model.layers.6.mlp.up_proj.weight",
90
- "decoderLayer.7.multiHeadAttention.q.weight": "model.layers.7.self_attn.q_proj.weight",
91
- "decoderLayer.7.multiHeadAttention.k.weight": "model.layers.7.self_attn.k_proj.weight",
92
- "decoderLayer.7.multiHeadAttention.v.weight": "model.layers.7.self_attn.v_proj.weight",
93
- "decoderLayer.7.multiHeadAttention.o.weight": "model.layers.7.self_attn.o_proj.weight",
94
- "decoderLayer.7.attnLayerNorm.weight": "model.layers.7.ln1.weight",
95
- "decoderLayer.7.feedForward.intermediateDense.weight": "model.layers.7.mlp.gate_proj.weight",
96
- "decoderLayer.7.feedForward.outputDense.weight": "model.layers.7.mlp.down_proj.weight",
97
- "decoderLayer.7.ffnLayerNorm.weight": "model.layers.7.ln2.weight",
98
- "decoderLayer.7.feedForward.intermediateDense2.weight": "model.layers.7.mlp.up_proj.weight",
99
- "decoderLayer.8.multiHeadAttention.q.weight": "model.layers.8.self_attn.q_proj.weight",
100
- "decoderLayer.8.multiHeadAttention.k.weight": "model.layers.8.self_attn.k_proj.weight",
101
- "decoderLayer.8.multiHeadAttention.v.weight": "model.layers.8.self_attn.v_proj.weight",
102
- "decoderLayer.8.multiHeadAttention.o.weight": "model.layers.8.self_attn.o_proj.weight",
103
- "decoderLayer.8.attnLayerNorm.weight": "model.layers.8.ln1.weight",
104
- "decoderLayer.8.feedForward.intermediateDense.weight": "model.layers.8.mlp.gate_proj.weight",
105
- "decoderLayer.8.feedForward.outputDense.weight": "model.layers.8.mlp.down_proj.weight",
106
- "decoderLayer.8.ffnLayerNorm.weight": "model.layers.8.ln2.weight",
107
- "decoderLayer.8.feedForward.intermediateDense2.weight": "model.layers.8.mlp.up_proj.weight",
108
- "decoderLayer.9.multiHeadAttention.q.weight": "model.layers.9.self_attn.q_proj.weight",
109
- "decoderLayer.9.multiHeadAttention.k.weight": "model.layers.9.self_attn.k_proj.weight",
110
- "decoderLayer.9.multiHeadAttention.v.weight": "model.layers.9.self_attn.v_proj.weight",
111
- "decoderLayer.9.multiHeadAttention.o.weight": "model.layers.9.self_attn.o_proj.weight",
112
- "decoderLayer.9.attnLayerNorm.weight": "model.layers.9.ln1.weight",
113
- "decoderLayer.9.feedForward.intermediateDense.weight": "model.layers.9.mlp.gate_proj.weight",
114
- "decoderLayer.9.feedForward.outputDense.weight": "model.layers.9.mlp.down_proj.weight",
115
- "decoderLayer.9.ffnLayerNorm.weight": "model.layers.9.ln2.weight",
116
- "decoderLayer.9.feedForward.intermediateDense2.weight": "model.layers.9.mlp.up_proj.weight",
117
- "decoderLayer.10.multiHeadAttention.q.weight": "model.layers.10.self_attn.q_proj.weight",
118
- "decoderLayer.10.multiHeadAttention.k.weight": "model.layers.10.self_attn.k_proj.weight",
119
- "decoderLayer.10.multiHeadAttention.v.weight": "model.layers.10.self_attn.v_proj.weight",
120
- "decoderLayer.10.multiHeadAttention.o.weight": "model.layers.10.self_attn.o_proj.weight",
121
- "decoderLayer.10.attnLayerNorm.weight": "model.layers.10.ln1.weight",
122
- "decoderLayer.10.feedForward.intermediateDense.weight": "model.layers.10.mlp.gate_proj.weight",
123
- "decoderLayer.10.feedForward.outputDense.weight": "model.layers.10.mlp.down_proj.weight",
124
- "decoderLayer.10.ffnLayerNorm.weight": "model.layers.10.ln2.weight",
125
- "decoderLayer.10.feedForward.intermediateDense2.weight": "model.layers.10.mlp.up_proj.weight",
126
- "decoderLayer.11.multiHeadAttention.q.weight": "model.layers.11.self_attn.q_proj.weight",
127
- "decoderLayer.11.multiHeadAttention.k.weight": "model.layers.11.self_attn.k_proj.weight",
128
- "decoderLayer.11.multiHeadAttention.v.weight": "model.layers.11.self_attn.v_proj.weight",
129
- "decoderLayer.11.multiHeadAttention.o.weight": "model.layers.11.self_attn.o_proj.weight",
130
- "decoderLayer.11.attnLayerNorm.weight": "model.layers.11.ln1.weight",
131
- "decoderLayer.11.feedForward.intermediateDense.weight": "model.layers.11.mlp.gate_proj.weight",
132
- "decoderLayer.11.feedForward.outputDense.weight": "model.layers.11.mlp.down_proj.weight",
133
- "decoderLayer.11.ffnLayerNorm.weight": "model.layers.11.ln2.weight",
134
- "decoderLayer.11.feedForward.intermediateDense2.weight": "model.layers.11.mlp.up_proj.weight",
135
- "decoderLayer.12.multiHeadAttention.q.weight": "model.layers.12.self_attn.q_proj.weight",
136
- "decoderLayer.12.multiHeadAttention.k.weight": "model.layers.12.self_attn.k_proj.weight",
137
- "decoderLayer.12.multiHeadAttention.v.weight": "model.layers.12.self_attn.v_proj.weight",
138
- "decoderLayer.12.multiHeadAttention.o.weight": "model.layers.12.self_attn.o_proj.weight",
139
- "decoderLayer.12.attnLayerNorm.weight": "model.layers.12.ln1.weight",
140
- "decoderLayer.12.feedForward.intermediateDense.weight": "model.layers.12.mlp.gate_proj.weight",
141
- "decoderLayer.12.feedForward.outputDense.weight": "model.layers.12.mlp.down_proj.weight",
142
- "decoderLayer.12.ffnLayerNorm.weight": "model.layers.12.ln2.weight",
143
- "decoderLayer.12.feedForward.intermediateDense2.weight": "model.layers.12.mlp.up_proj.weight",
144
- "decoderLayer.13.multiHeadAttention.q.weight": "model.layers.13.self_attn.q_proj.weight",
145
- "decoderLayer.13.multiHeadAttention.k.weight": "model.layers.13.self_attn.k_proj.weight",
146
- "decoderLayer.13.multiHeadAttention.v.weight": "model.layers.13.self_attn.v_proj.weight",
147
- "decoderLayer.13.multiHeadAttention.o.weight": "model.layers.13.self_attn.o_proj.weight",
148
- "decoderLayer.13.attnLayerNorm.weight": "model.layers.13.ln1.weight",
149
- "decoderLayer.13.feedForward.intermediateDense.weight": "model.layers.13.mlp.gate_proj.weight",
150
- "decoderLayer.13.feedForward.outputDense.weight": "model.layers.13.mlp.down_proj.weight",
151
- "decoderLayer.13.ffnLayerNorm.weight": "model.layers.13.ln2.weight",
152
- "decoderLayer.13.feedForward.intermediateDense2.weight": "model.layers.13.mlp.up_proj.weight",
153
- "decoderLayer.14.multiHeadAttention.q.weight": "model.layers.14.self_attn.q_proj.weight",
154
- "decoderLayer.14.multiHeadAttention.k.weight": "model.layers.14.self_attn.k_proj.weight",
155
- "decoderLayer.14.multiHeadAttention.v.weight": "model.layers.14.self_attn.v_proj.weight",
156
- "decoderLayer.14.multiHeadAttention.o.weight": "model.layers.14.self_attn.o_proj.weight",
157
- "decoderLayer.14.attnLayerNorm.weight": "model.layers.14.ln1.weight",
158
- "decoderLayer.14.feedForward.intermediateDense.weight": "model.layers.14.mlp.gate_proj.weight",
159
- "decoderLayer.14.feedForward.outputDense.weight": "model.layers.14.mlp.down_proj.weight",
160
- "decoderLayer.14.ffnLayerNorm.weight": "model.layers.14.ln2.weight",
161
- "decoderLayer.14.feedForward.intermediateDense2.weight": "model.layers.14.mlp.up_proj.weight",
162
- "decoderLayer.15.multiHeadAttention.q.weight": "model.layers.15.self_attn.q_proj.weight",
163
- "decoderLayer.15.multiHeadAttention.k.weight": "model.layers.15.self_attn.k_proj.weight",
164
- "decoderLayer.15.multiHeadAttention.v.weight": "model.layers.15.self_attn.v_proj.weight",
165
- "decoderLayer.15.multiHeadAttention.o.weight": "model.layers.15.self_attn.o_proj.weight",
166
- "decoderLayer.15.attnLayerNorm.weight": "model.layers.15.ln1.weight",
167
- "decoderLayer.15.feedForward.intermediateDense.weight": "model.layers.15.mlp.gate_proj.weight",
168
- "decoderLayer.15.feedForward.outputDense.weight": "model.layers.15.mlp.down_proj.weight",
169
- "decoderLayer.15.ffnLayerNorm.weight": "model.layers.15.ln2.weight",
170
- "decoderLayer.15.feedForward.intermediateDense2.weight": "model.layers.15.mlp.up_proj.weight",
171
- "decoderLayer.16.multiHeadAttention.q.weight": "model.layers.16.self_attn.q_proj.weight",
172
- "decoderLayer.16.multiHeadAttention.k.weight": "model.layers.16.self_attn.k_proj.weight",
173
- "decoderLayer.16.multiHeadAttention.v.weight": "model.layers.16.self_attn.v_proj.weight",
174
- "decoderLayer.16.multiHeadAttention.o.weight": "model.layers.16.self_attn.o_proj.weight",
175
- "decoderLayer.16.attnLayerNorm.weight": "model.layers.16.ln1.weight",
176
- "decoderLayer.16.feedForward.intermediateDense.weight": "model.layers.16.mlp.gate_proj.weight",
177
- "decoderLayer.16.feedForward.outputDense.weight": "model.layers.16.mlp.down_proj.weight",
178
- "decoderLayer.16.ffnLayerNorm.weight": "model.layers.16.ln2.weight",
179
- "decoderLayer.16.feedForward.intermediateDense2.weight": "model.layers.16.mlp.up_proj.weight",
180
- "decoderLayer.17.multiHeadAttention.q.weight": "model.layers.17.self_attn.q_proj.weight",
181
- "decoderLayer.17.multiHeadAttention.k.weight": "model.layers.17.self_attn.k_proj.weight",
182
- "decoderLayer.17.multiHeadAttention.v.weight": "model.layers.17.self_attn.v_proj.weight",
183
- "decoderLayer.17.multiHeadAttention.o.weight": "model.layers.17.self_attn.o_proj.weight",
184
- "decoderLayer.17.attnLayerNorm.weight": "model.layers.17.ln1.weight",
185
- "decoderLayer.17.feedForward.intermediateDense.weight": "model.layers.17.mlp.gate_proj.weight",
186
- "decoderLayer.17.feedForward.outputDense.weight": "model.layers.17.mlp.down_proj.weight",
187
- "decoderLayer.17.ffnLayerNorm.weight": "model.layers.17.ln2.weight",
188
- "decoderLayer.17.feedForward.intermediateDense2.weight": "model.layers.17.mlp.up_proj.weight",
189
- "decoderLayer.18.multiHeadAttention.q.weight": "model.layers.18.self_attn.q_proj.weight",
190
- "decoderLayer.18.multiHeadAttention.k.weight": "model.layers.18.self_attn.k_proj.weight",
191
- "decoderLayer.18.multiHeadAttention.v.weight": "model.layers.18.self_attn.v_proj.weight",
192
- "decoderLayer.18.multiHeadAttention.o.weight": "model.layers.18.self_attn.o_proj.weight",
193
- "decoderLayer.18.attnLayerNorm.weight": "model.layers.18.ln1.weight",
194
- "decoderLayer.18.feedForward.intermediateDense.weight": "model.layers.18.mlp.gate_proj.weight",
195
- "decoderLayer.18.feedForward.outputDense.weight": "model.layers.18.mlp.down_proj.weight",
196
- "decoderLayer.18.ffnLayerNorm.weight": "model.layers.18.ln2.weight",
197
- "decoderLayer.18.feedForward.intermediateDense2.weight": "model.layers.18.mlp.up_proj.weight",
198
- "decoderLayer.19.multiHeadAttention.q.weight": "model.layers.19.self_attn.q_proj.weight",
199
- "decoderLayer.19.multiHeadAttention.k.weight": "model.layers.19.self_attn.k_proj.weight",
200
- "decoderLayer.19.multiHeadAttention.v.weight": "model.layers.19.self_attn.v_proj.weight",
201
- "decoderLayer.19.multiHeadAttention.o.weight": "model.layers.19.self_attn.o_proj.weight",
202
- "decoderLayer.19.attnLayerNorm.weight": "model.layers.19.ln1.weight",
203
- "decoderLayer.19.feedForward.intermediateDense.weight": "model.layers.19.mlp.gate_proj.weight",
204
- "decoderLayer.19.feedForward.outputDense.weight": "model.layers.19.mlp.down_proj.weight",
205
- "decoderLayer.19.ffnLayerNorm.weight": "model.layers.19.ln2.weight",
206
- "decoderLayer.19.feedForward.intermediateDense2.weight": "model.layers.19.mlp.up_proj.weight",
207
- "decoderLayer.20.multiHeadAttention.q.weight": "model.layers.20.self_attn.q_proj.weight",
208
- "decoderLayer.20.multiHeadAttention.k.weight": "model.layers.20.self_attn.k_proj.weight",
209
- "decoderLayer.20.multiHeadAttention.v.weight": "model.layers.20.self_attn.v_proj.weight",
210
- "decoderLayer.20.multiHeadAttention.o.weight": "model.layers.20.self_attn.o_proj.weight",
211
- "decoderLayer.20.attnLayerNorm.weight": "model.layers.20.ln1.weight",
212
- "decoderLayer.20.feedForward.intermediateDense.weight": "model.layers.20.mlp.gate_proj.weight",
213
- "decoderLayer.20.feedForward.outputDense.weight": "model.layers.20.mlp.down_proj.weight",
214
- "decoderLayer.20.ffnLayerNorm.weight": "model.layers.20.ln2.weight",
215
- "decoderLayer.20.feedForward.intermediateDense2.weight": "model.layers.20.mlp.up_proj.weight",
216
- "decoderLayer.21.multiHeadAttention.q.weight": "model.layers.21.self_attn.q_proj.weight",
217
- "decoderLayer.21.multiHeadAttention.k.weight": "model.layers.21.self_attn.k_proj.weight",
218
- "decoderLayer.21.multiHeadAttention.v.weight": "model.layers.21.self_attn.v_proj.weight",
219
- "decoderLayer.21.multiHeadAttention.o.weight": "model.layers.21.self_attn.o_proj.weight",
220
- "decoderLayer.21.attnLayerNorm.weight": "model.layers.21.ln1.weight",
221
- "decoderLayer.21.feedForward.intermediateDense.weight": "model.layers.21.mlp.gate_proj.weight",
222
- "decoderLayer.21.feedForward.outputDense.weight": "model.layers.21.mlp.down_proj.weight",
223
- "decoderLayer.21.ffnLayerNorm.weight": "model.layers.21.ln2.weight",
224
- "decoderLayer.21.feedForward.intermediateDense2.weight": "model.layers.21.mlp.up_proj.weight",
225
- "decoderLayer.22.multiHeadAttention.q.weight": "model.layers.22.self_attn.q_proj.weight",
226
- "decoderLayer.22.multiHeadAttention.k.weight": "model.layers.22.self_attn.k_proj.weight",
227
- "decoderLayer.22.multiHeadAttention.v.weight": "model.layers.22.self_attn.v_proj.weight",
228
- "decoderLayer.22.multiHeadAttention.o.weight": "model.layers.22.self_attn.o_proj.weight",
229
- "decoderLayer.22.attnLayerNorm.weight": "model.layers.22.ln1.weight",
230
- "decoderLayer.22.feedForward.intermediateDense.weight": "model.layers.22.mlp.gate_proj.weight",
231
- "decoderLayer.22.feedForward.outputDense.weight": "model.layers.22.mlp.down_proj.weight",
232
- "decoderLayer.22.ffnLayerNorm.weight": "model.layers.22.ln2.weight",
233
- "decoderLayer.22.feedForward.intermediateDense2.weight": "model.layers.22.mlp.up_proj.weight",
234
- "decoderLayer.23.multiHeadAttention.q.weight": "model.layers.23.self_attn.q_proj.weight",
235
- "decoderLayer.23.multiHeadAttention.k.weight": "model.layers.23.self_attn.k_proj.weight",
236
- "decoderLayer.23.multiHeadAttention.v.weight": "model.layers.23.self_attn.v_proj.weight",
237
- "decoderLayer.23.multiHeadAttention.o.weight": "model.layers.23.self_attn.o_proj.weight",
238
- "decoderLayer.23.attnLayerNorm.weight": "model.layers.23.ln1.weight",
239
- "decoderLayer.23.feedForward.intermediateDense.weight": "model.layers.23.mlp.gate_proj.weight",
240
- "decoderLayer.23.feedForward.outputDense.weight": "model.layers.23.mlp.down_proj.weight",
241
- "decoderLayer.23.ffnLayerNorm.weight": "model.layers.23.ln2.weight",
242
- "decoderLayer.23.feedForward.intermediateDense2.weight": "model.layers.23.mlp.up_proj.weight",
243
- "decoderLayer.24.multiHeadAttention.q.weight": "model.layers.24.self_attn.q_proj.weight",
244
- "decoderLayer.24.multiHeadAttention.k.weight": "model.layers.24.self_attn.k_proj.weight",
245
- "decoderLayer.24.multiHeadAttention.v.weight": "model.layers.24.self_attn.v_proj.weight",
246
- "decoderLayer.24.multiHeadAttention.o.weight": "model.layers.24.self_attn.o_proj.weight",
247
- "decoderLayer.24.attnLayerNorm.weight": "model.layers.24.ln1.weight",
248
- "decoderLayer.24.feedForward.intermediateDense.weight": "model.layers.24.mlp.gate_proj.weight",
249
- "decoderLayer.24.feedForward.outputDense.weight": "model.layers.24.mlp.down_proj.weight",
250
- "decoderLayer.24.ffnLayerNorm.weight": "model.layers.24.ln2.weight",
251
- "decoderLayer.24.feedForward.intermediateDense2.weight": "model.layers.24.mlp.up_proj.weight",
252
- "decoderLayer.25.multiHeadAttention.q.weight": "model.layers.25.self_attn.q_proj.weight",
253
- "decoderLayer.25.multiHeadAttention.k.weight": "model.layers.25.self_attn.k_proj.weight",
254
- "decoderLayer.25.multiHeadAttention.v.weight": "model.layers.25.self_attn.v_proj.weight",
255
- "decoderLayer.25.multiHeadAttention.o.weight": "model.layers.25.self_attn.o_proj.weight",
256
- "decoderLayer.25.attnLayerNorm.weight": "model.layers.25.ln1.weight",
257
- "decoderLayer.25.feedForward.intermediateDense.weight": "model.layers.25.mlp.gate_proj.weight",
258
- "decoderLayer.25.feedForward.outputDense.weight": "model.layers.25.mlp.down_proj.weight",
259
- "decoderLayer.25.ffnLayerNorm.weight": "model.layers.25.ln2.weight",
260
- "decoderLayer.25.feedForward.intermediateDense2.weight": "model.layers.25.mlp.up_proj.weight",
261
- "decoderLayer.26.multiHeadAttention.q.weight": "model.layers.26.self_attn.q_proj.weight",
262
- "decoderLayer.26.multiHeadAttention.k.weight": "model.layers.26.self_attn.k_proj.weight",
263
- "decoderLayer.26.multiHeadAttention.v.weight": "model.layers.26.self_attn.v_proj.weight",
264
- "decoderLayer.26.multiHeadAttention.o.weight": "model.layers.26.self_attn.o_proj.weight",
265
- "decoderLayer.26.attnLayerNorm.weight": "model.layers.26.ln1.weight",
266
- "decoderLayer.26.feedForward.intermediateDense.weight": "model.layers.26.mlp.gate_proj.weight",
267
- "decoderLayer.26.feedForward.outputDense.weight": "model.layers.26.mlp.down_proj.weight",
268
- "decoderLayer.26.ffnLayerNorm.weight": "model.layers.26.ln2.weight",
269
- "decoderLayer.26.feedForward.intermediateDense2.weight": "model.layers.26.mlp.up_proj.weight",
270
- "decoderLayer.27.multiHeadAttention.q.weight": "model.layers.27.self_attn.q_proj.weight",
271
- "decoderLayer.27.multiHeadAttention.k.weight": "model.layers.27.self_attn.k_proj.weight",
272
- "decoderLayer.27.multiHeadAttention.v.weight": "model.layers.27.self_attn.v_proj.weight",
273
- "decoderLayer.27.multiHeadAttention.o.weight": "model.layers.27.self_attn.o_proj.weight",
274
- "decoderLayer.27.attnLayerNorm.weight": "model.layers.27.ln1.weight",
275
- "decoderLayer.27.feedForward.intermediateDense.weight": "model.layers.27.mlp.gate_proj.weight",
276
- "decoderLayer.27.feedForward.outputDense.weight": "model.layers.27.mlp.down_proj.weight",
277
- "decoderLayer.27.ffnLayerNorm.weight": "model.layers.27.ln2.weight",
278
- "decoderLayer.27.feedForward.intermediateDense2.weight": "model.layers.27.mlp.up_proj.weight",
279
- "decoderLayer.28.multiHeadAttention.q.weight": "model.layers.28.self_attn.q_proj.weight",
280
- "decoderLayer.28.multiHeadAttention.k.weight": "model.layers.28.self_attn.k_proj.weight",
281
- "decoderLayer.28.multiHeadAttention.v.weight": "model.layers.28.self_attn.v_proj.weight",
282
- "decoderLayer.28.multiHeadAttention.o.weight": "model.layers.28.self_attn.o_proj.weight",
283
- "decoderLayer.28.attnLayerNorm.weight": "model.layers.28.ln1.weight",
284
- "decoderLayer.28.feedForward.intermediateDense.weight": "model.layers.28.mlp.gate_proj.weight",
285
- "decoderLayer.28.feedForward.outputDense.weight": "model.layers.28.mlp.down_proj.weight",
286
- "decoderLayer.28.ffnLayerNorm.weight": "model.layers.28.ln2.weight",
287
- "decoderLayer.28.feedForward.intermediateDense2.weight": "model.layers.28.mlp.up_proj.weight",
288
- "decoderLayer.29.multiHeadAttention.q.weight": "model.layers.29.self_attn.q_proj.weight",
289
- "decoderLayer.29.multiHeadAttention.k.weight": "model.layers.29.self_attn.k_proj.weight",
290
- "decoderLayer.29.multiHeadAttention.v.weight": "model.layers.29.self_attn.v_proj.weight",
291
- "decoderLayer.29.multiHeadAttention.o.weight": "model.layers.29.self_attn.o_proj.weight",
292
- "decoderLayer.29.attnLayerNorm.weight": "model.layers.29.ln1.weight",
293
- "decoderLayer.29.feedForward.intermediateDense.weight": "model.layers.29.mlp.gate_proj.weight",
294
- "decoderLayer.29.feedForward.outputDense.weight": "model.layers.29.mlp.down_proj.weight",
295
- "decoderLayer.29.ffnLayerNorm.weight": "model.layers.29.ln2.weight",
296
- "decoderLayer.29.feedForward.intermediateDense2.weight": "model.layers.29.mlp.up_proj.weight",
297
- "decoderLayer.30.multiHeadAttention.q.weight": "model.layers.30.self_attn.q_proj.weight",
298
- "decoderLayer.30.multiHeadAttention.k.weight": "model.layers.30.self_attn.k_proj.weight",
299
- "decoderLayer.30.multiHeadAttention.v.weight": "model.layers.30.self_attn.v_proj.weight",
300
- "decoderLayer.30.multiHeadAttention.o.weight": "model.layers.30.self_attn.o_proj.weight",
301
- "decoderLayer.30.attnLayerNorm.weight": "model.layers.30.ln1.weight",
302
- "decoderLayer.30.feedForward.intermediateDense.weight": "model.layers.30.mlp.gate_proj.weight",
303
- "decoderLayer.30.feedForward.outputDense.weight": "model.layers.30.mlp.down_proj.weight",
304
- "decoderLayer.30.ffnLayerNorm.weight": "model.layers.30.ln2.weight",
305
- "decoderLayer.30.feedForward.intermediateDense2.weight": "model.layers.30.mlp.up_proj.weight",
306
- "decoderLayer.31.multiHeadAttention.q.weight": "model.layers.31.self_attn.q_proj.weight",
307
- "decoderLayer.31.multiHeadAttention.k.weight": "model.layers.31.self_attn.k_proj.weight",
308
- "decoderLayer.31.multiHeadAttention.v.weight": "model.layers.31.self_attn.v_proj.weight",
309
- "decoderLayer.31.multiHeadAttention.o.weight": "model.layers.31.self_attn.o_proj.weight",
310
- "decoderLayer.31.attnLayerNorm.weight": "model.layers.31.ln1.weight",
311
- "decoderLayer.31.feedForward.intermediateDense.weight": "model.layers.31.mlp.gate_proj.weight",
312
- "decoderLayer.31.feedForward.outputDense.weight": "model.layers.31.mlp.down_proj.weight",
313
- "decoderLayer.31.ffnLayerNorm.weight": "model.layers.31.ln2.weight",
314
- "decoderLayer.31.feedForward.intermediateDense2.weight": "model.layers.31.mlp.up_proj.weight"
315
- }
316
- }
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": 2,
4
+ "hidden_act": "silu",
5
+ "hidden_size": 4096,
6
+ "initializer_range": 0.02,
7
+ "intermediate_size": 11008,
8
+ "max_position_embeddings": 4096,
9
+ "model": "llama",
10
+ "num_attention_heads": 32,
11
+ "num_hidden_layers": 32,
12
+ "num_key_value_heads": 4,
13
+ "pad_token_id": 0,
14
+ "layer_norm_eps": 1e-05,
15
+ "rope_theta": 5000000.0,
16
+ "tie_word_embeddings": false,
17
+ "torch_dtype": "bfloat16",
18
+ "vocab_size": 64000,
19
+ "skip_init": true,
20
+ "rope_rank": "updown",
21
+ "segment_vocab_size": 0,
22
+ "generation_config": {
23
+ "tokenizer_decode_config": {
24
+ "skip_special_tokens": true
25
+ },
26
+ "max_length": 4096,
27
+ "eos_token_id": 2
28
+ },
29
+ "mapping": {
30
+ "embeddings.word_embeddings.weight": "model.embed_tokens.weight",
31
+ "LayerNormFinal.weight": "model.norm.weight",
32
+ "lm_head.weight": "lm_head.weight",
33
+ "decoderLayer.0.multiHeadAttention.q.weight": "model.layers.0.self_attn.q_proj.weight",
34
+ "decoderLayer.0.multiHeadAttention.k.weight": "model.layers.0.self_attn.k_proj.weight",
35
+ "decoderLayer.0.multiHeadAttention.v.weight": "model.layers.0.self_attn.v_proj.weight",
36
+ "decoderLayer.0.multiHeadAttention.o.weight": "model.layers.0.self_attn.o_proj.weight",
37
+ "decoderLayer.0.attnLayerNorm.weight": "model.layers.0.ln1.weight",
38
+ "decoderLayer.0.feedForward.intermediateDense.weight": "model.layers.0.mlp.gate_proj.weight",
39
+ "decoderLayer.0.feedForward.outputDense.weight": "model.layers.0.mlp.down_proj.weight",
40
+ "decoderLayer.0.ffnLayerNorm.weight": "model.layers.0.ln2.weight",
41
+ "decoderLayer.0.feedForward.intermediateDense2.weight": "model.layers.0.mlp.up_proj.weight",
42
+ "decoderLayer.1.multiHeadAttention.q.weight": "model.layers.1.self_attn.q_proj.weight",
43
+ "decoderLayer.1.multiHeadAttention.k.weight": "model.layers.1.self_attn.k_proj.weight",
44
+ "decoderLayer.1.multiHeadAttention.v.weight": "model.layers.1.self_attn.v_proj.weight",
45
+ "decoderLayer.1.multiHeadAttention.o.weight": "model.layers.1.self_attn.o_proj.weight",
46
+ "decoderLayer.1.attnLayerNorm.weight": "model.layers.1.ln1.weight",
47
+ "decoderLayer.1.feedForward.intermediateDense.weight": "model.layers.1.mlp.gate_proj.weight",
48
+ "decoderLayer.1.feedForward.outputDense.weight": "model.layers.1.mlp.down_proj.weight",
49
+ "decoderLayer.1.ffnLayerNorm.weight": "model.layers.1.ln2.weight",
50
+ "decoderLayer.1.feedForward.intermediateDense2.weight": "model.layers.1.mlp.up_proj.weight",
51
+ "decoderLayer.2.multiHeadAttention.q.weight": "model.layers.2.self_attn.q_proj.weight",
52
+ "decoderLayer.2.multiHeadAttention.k.weight": "model.layers.2.self_attn.k_proj.weight",
53
+ "decoderLayer.2.multiHeadAttention.v.weight": "model.layers.2.self_attn.v_proj.weight",
54
+ "decoderLayer.2.multiHeadAttention.o.weight": "model.layers.2.self_attn.o_proj.weight",
55
+ "decoderLayer.2.attnLayerNorm.weight": "model.layers.2.ln1.weight",
56
+ "decoderLayer.2.feedForward.intermediateDense.weight": "model.layers.2.mlp.gate_proj.weight",
57
+ "decoderLayer.2.feedForward.outputDense.weight": "model.layers.2.mlp.down_proj.weight",
58
+ "decoderLayer.2.ffnLayerNorm.weight": "model.layers.2.ln2.weight",
59
+ "decoderLayer.2.feedForward.intermediateDense2.weight": "model.layers.2.mlp.up_proj.weight",
60
+ "decoderLayer.3.multiHeadAttention.q.weight": "model.layers.3.self_attn.q_proj.weight",
61
+ "decoderLayer.3.multiHeadAttention.k.weight": "model.layers.3.self_attn.k_proj.weight",
62
+ "decoderLayer.3.multiHeadAttention.v.weight": "model.layers.3.self_attn.v_proj.weight",
63
+ "decoderLayer.3.multiHeadAttention.o.weight": "model.layers.3.self_attn.o_proj.weight",
64
+ "decoderLayer.3.attnLayerNorm.weight": "model.layers.3.ln1.weight",
65
+ "decoderLayer.3.feedForward.intermediateDense.weight": "model.layers.3.mlp.gate_proj.weight",
66
+ "decoderLayer.3.feedForward.outputDense.weight": "model.layers.3.mlp.down_proj.weight",
67
+ "decoderLayer.3.ffnLayerNorm.weight": "model.layers.3.ln2.weight",
68
+ "decoderLayer.3.feedForward.intermediateDense2.weight": "model.layers.3.mlp.up_proj.weight",
69
+ "decoderLayer.4.multiHeadAttention.q.weight": "model.layers.4.self_attn.q_proj.weight",
70
+ "decoderLayer.4.multiHeadAttention.k.weight": "model.layers.4.self_attn.k_proj.weight",
71
+ "decoderLayer.4.multiHeadAttention.v.weight": "model.layers.4.self_attn.v_proj.weight",
72
+ "decoderLayer.4.multiHeadAttention.o.weight": "model.layers.4.self_attn.o_proj.weight",
73
+ "decoderLayer.4.attnLayerNorm.weight": "model.layers.4.ln1.weight",
74
+ "decoderLayer.4.feedForward.intermediateDense.weight": "model.layers.4.mlp.gate_proj.weight",
75
+ "decoderLayer.4.feedForward.outputDense.weight": "model.layers.4.mlp.down_proj.weight",
76
+ "decoderLayer.4.ffnLayerNorm.weight": "model.layers.4.ln2.weight",
77
+ "decoderLayer.4.feedForward.intermediateDense2.weight": "model.layers.4.mlp.up_proj.weight",
78
+ "decoderLayer.5.multiHeadAttention.q.weight": "model.layers.5.self_attn.q_proj.weight",
79
+ "decoderLayer.5.multiHeadAttention.k.weight": "model.layers.5.self_attn.k_proj.weight",
80
+ "decoderLayer.5.multiHeadAttention.v.weight": "model.layers.5.self_attn.v_proj.weight",
81
+ "decoderLayer.5.multiHeadAttention.o.weight": "model.layers.5.self_attn.o_proj.weight",
82
+ "decoderLayer.5.attnLayerNorm.weight": "model.layers.5.ln1.weight",
83
+ "decoderLayer.5.feedForward.intermediateDense.weight": "model.layers.5.mlp.gate_proj.weight",
84
+ "decoderLayer.5.feedForward.outputDense.weight": "model.layers.5.mlp.down_proj.weight",
85
+ "decoderLayer.5.ffnLayerNorm.weight": "model.layers.5.ln2.weight",
86
+ "decoderLayer.5.feedForward.intermediateDense2.weight": "model.layers.5.mlp.up_proj.weight",
87
+ "decoderLayer.6.multiHeadAttention.q.weight": "model.layers.6.self_attn.q_proj.weight",
88
+ "decoderLayer.6.multiHeadAttention.k.weight": "model.layers.6.self_attn.k_proj.weight",
89
+ "decoderLayer.6.multiHeadAttention.v.weight": "model.layers.6.self_attn.v_proj.weight",
90
+ "decoderLayer.6.multiHeadAttention.o.weight": "model.layers.6.self_attn.o_proj.weight",
91
+ "decoderLayer.6.attnLayerNorm.weight": "model.layers.6.ln1.weight",
92
+ "decoderLayer.6.feedForward.intermediateDense.weight": "model.layers.6.mlp.gate_proj.weight",
93
+ "decoderLayer.6.feedForward.outputDense.weight": "model.layers.6.mlp.down_proj.weight",
94
+ "decoderLayer.6.ffnLayerNorm.weight": "model.layers.6.ln2.weight",
95
+ "decoderLayer.6.feedForward.intermediateDense2.weight": "model.layers.6.mlp.up_proj.weight",
96
+ "decoderLayer.7.multiHeadAttention.q.weight": "model.layers.7.self_attn.q_proj.weight",
97
+ "decoderLayer.7.multiHeadAttention.k.weight": "model.layers.7.self_attn.k_proj.weight",
98
+ "decoderLayer.7.multiHeadAttention.v.weight": "model.layers.7.self_attn.v_proj.weight",
99
+ "decoderLayer.7.multiHeadAttention.o.weight": "model.layers.7.self_attn.o_proj.weight",
100
+ "decoderLayer.7.attnLayerNorm.weight": "model.layers.7.ln1.weight",
101
+ "decoderLayer.7.feedForward.intermediateDense.weight": "model.layers.7.mlp.gate_proj.weight",
102
+ "decoderLayer.7.feedForward.outputDense.weight": "model.layers.7.mlp.down_proj.weight",
103
+ "decoderLayer.7.ffnLayerNorm.weight": "model.layers.7.ln2.weight",
104
+ "decoderLayer.7.feedForward.intermediateDense2.weight": "model.layers.7.mlp.up_proj.weight",
105
+ "decoderLayer.8.multiHeadAttention.q.weight": "model.layers.8.self_attn.q_proj.weight",
106
+ "decoderLayer.8.multiHeadAttention.k.weight": "model.layers.8.self_attn.k_proj.weight",
107
+ "decoderLayer.8.multiHeadAttention.v.weight": "model.layers.8.self_attn.v_proj.weight",
108
+ "decoderLayer.8.multiHeadAttention.o.weight": "model.layers.8.self_attn.o_proj.weight",
109
+ "decoderLayer.8.attnLayerNorm.weight": "model.layers.8.ln1.weight",
110
+ "decoderLayer.8.feedForward.intermediateDense.weight": "model.layers.8.mlp.gate_proj.weight",
111
+ "decoderLayer.8.feedForward.outputDense.weight": "model.layers.8.mlp.down_proj.weight",
112
+ "decoderLayer.8.ffnLayerNorm.weight": "model.layers.8.ln2.weight",
113
+ "decoderLayer.8.feedForward.intermediateDense2.weight": "model.layers.8.mlp.up_proj.weight",
114
+ "decoderLayer.9.multiHeadAttention.q.weight": "model.layers.9.self_attn.q_proj.weight",
115
+ "decoderLayer.9.multiHeadAttention.k.weight": "model.layers.9.self_attn.k_proj.weight",
116
+ "decoderLayer.9.multiHeadAttention.v.weight": "model.layers.9.self_attn.v_proj.weight",
117
+ "decoderLayer.9.multiHeadAttention.o.weight": "model.layers.9.self_attn.o_proj.weight",
118
+ "decoderLayer.9.attnLayerNorm.weight": "model.layers.9.ln1.weight",
119
+ "decoderLayer.9.feedForward.intermediateDense.weight": "model.layers.9.mlp.gate_proj.weight",
120
+ "decoderLayer.9.feedForward.outputDense.weight": "model.layers.9.mlp.down_proj.weight",
121
+ "decoderLayer.9.ffnLayerNorm.weight": "model.layers.9.ln2.weight",
122
+ "decoderLayer.9.feedForward.intermediateDense2.weight": "model.layers.9.mlp.up_proj.weight",
123
+ "decoderLayer.10.multiHeadAttention.q.weight": "model.layers.10.self_attn.q_proj.weight",
124
+ "decoderLayer.10.multiHeadAttention.k.weight": "model.layers.10.self_attn.k_proj.weight",
125
+ "decoderLayer.10.multiHeadAttention.v.weight": "model.layers.10.self_attn.v_proj.weight",
126
+ "decoderLayer.10.multiHeadAttention.o.weight": "model.layers.10.self_attn.o_proj.weight",
127
+ "decoderLayer.10.attnLayerNorm.weight": "model.layers.10.ln1.weight",
128
+ "decoderLayer.10.feedForward.intermediateDense.weight": "model.layers.10.mlp.gate_proj.weight",
129
+ "decoderLayer.10.feedForward.outputDense.weight": "model.layers.10.mlp.down_proj.weight",
130
+ "decoderLayer.10.ffnLayerNorm.weight": "model.layers.10.ln2.weight",
131
+ "decoderLayer.10.feedForward.intermediateDense2.weight": "model.layers.10.mlp.up_proj.weight",
132
+ "decoderLayer.11.multiHeadAttention.q.weight": "model.layers.11.self_attn.q_proj.weight",
133
+ "decoderLayer.11.multiHeadAttention.k.weight": "model.layers.11.self_attn.k_proj.weight",
134
+ "decoderLayer.11.multiHeadAttention.v.weight": "model.layers.11.self_attn.v_proj.weight",
135
+ "decoderLayer.11.multiHeadAttention.o.weight": "model.layers.11.self_attn.o_proj.weight",
136
+ "decoderLayer.11.attnLayerNorm.weight": "model.layers.11.ln1.weight",
137
+ "decoderLayer.11.feedForward.intermediateDense.weight": "model.layers.11.mlp.gate_proj.weight",
138
+ "decoderLayer.11.feedForward.outputDense.weight": "model.layers.11.mlp.down_proj.weight",
139
+ "decoderLayer.11.ffnLayerNorm.weight": "model.layers.11.ln2.weight",
140
+ "decoderLayer.11.feedForward.intermediateDense2.weight": "model.layers.11.mlp.up_proj.weight",
141
+ "decoderLayer.12.multiHeadAttention.q.weight": "model.layers.12.self_attn.q_proj.weight",
142
+ "decoderLayer.12.multiHeadAttention.k.weight": "model.layers.12.self_attn.k_proj.weight",
143
+ "decoderLayer.12.multiHeadAttention.v.weight": "model.layers.12.self_attn.v_proj.weight",
144
+ "decoderLayer.12.multiHeadAttention.o.weight": "model.layers.12.self_attn.o_proj.weight",
145
+ "decoderLayer.12.attnLayerNorm.weight": "model.layers.12.ln1.weight",
146
+ "decoderLayer.12.feedForward.intermediateDense.weight": "model.layers.12.mlp.gate_proj.weight",
147
+ "decoderLayer.12.feedForward.outputDense.weight": "model.layers.12.mlp.down_proj.weight",
148
+ "decoderLayer.12.ffnLayerNorm.weight": "model.layers.12.ln2.weight",
149
+ "decoderLayer.12.feedForward.intermediateDense2.weight": "model.layers.12.mlp.up_proj.weight",
150
+ "decoderLayer.13.multiHeadAttention.q.weight": "model.layers.13.self_attn.q_proj.weight",
151
+ "decoderLayer.13.multiHeadAttention.k.weight": "model.layers.13.self_attn.k_proj.weight",
152
+ "decoderLayer.13.multiHeadAttention.v.weight": "model.layers.13.self_attn.v_proj.weight",
153
+ "decoderLayer.13.multiHeadAttention.o.weight": "model.layers.13.self_attn.o_proj.weight",
154
+ "decoderLayer.13.attnLayerNorm.weight": "model.layers.13.ln1.weight",
155
+ "decoderLayer.13.feedForward.intermediateDense.weight": "model.layers.13.mlp.gate_proj.weight",
156
+ "decoderLayer.13.feedForward.outputDense.weight": "model.layers.13.mlp.down_proj.weight",
157
+ "decoderLayer.13.ffnLayerNorm.weight": "model.layers.13.ln2.weight",
158
+ "decoderLayer.13.feedForward.intermediateDense2.weight": "model.layers.13.mlp.up_proj.weight",
159
+ "decoderLayer.14.multiHeadAttention.q.weight": "model.layers.14.self_attn.q_proj.weight",
160
+ "decoderLayer.14.multiHeadAttention.k.weight": "model.layers.14.self_attn.k_proj.weight",
161
+ "decoderLayer.14.multiHeadAttention.v.weight": "model.layers.14.self_attn.v_proj.weight",
162
+ "decoderLayer.14.multiHeadAttention.o.weight": "model.layers.14.self_attn.o_proj.weight",
163
+ "decoderLayer.14.attnLayerNorm.weight": "model.layers.14.ln1.weight",
164
+ "decoderLayer.14.feedForward.intermediateDense.weight": "model.layers.14.mlp.gate_proj.weight",
165
+ "decoderLayer.14.feedForward.outputDense.weight": "model.layers.14.mlp.down_proj.weight",
166
+ "decoderLayer.14.ffnLayerNorm.weight": "model.layers.14.ln2.weight",
167
+ "decoderLayer.14.feedForward.intermediateDense2.weight": "model.layers.14.mlp.up_proj.weight",
168
+ "decoderLayer.15.multiHeadAttention.q.weight": "model.layers.15.self_attn.q_proj.weight",
169
+ "decoderLayer.15.multiHeadAttention.k.weight": "model.layers.15.self_attn.k_proj.weight",
170
+ "decoderLayer.15.multiHeadAttention.v.weight": "model.layers.15.self_attn.v_proj.weight",
171
+ "decoderLayer.15.multiHeadAttention.o.weight": "model.layers.15.self_attn.o_proj.weight",
172
+ "decoderLayer.15.attnLayerNorm.weight": "model.layers.15.ln1.weight",
173
+ "decoderLayer.15.feedForward.intermediateDense.weight": "model.layers.15.mlp.gate_proj.weight",
174
+ "decoderLayer.15.feedForward.outputDense.weight": "model.layers.15.mlp.down_proj.weight",
175
+ "decoderLayer.15.ffnLayerNorm.weight": "model.layers.15.ln2.weight",
176
+ "decoderLayer.15.feedForward.intermediateDense2.weight": "model.layers.15.mlp.up_proj.weight",
177
+ "decoderLayer.16.multiHeadAttention.q.weight": "model.layers.16.self_attn.q_proj.weight",
178
+ "decoderLayer.16.multiHeadAttention.k.weight": "model.layers.16.self_attn.k_proj.weight",
179
+ "decoderLayer.16.multiHeadAttention.v.weight": "model.layers.16.self_attn.v_proj.weight",
180
+ "decoderLayer.16.multiHeadAttention.o.weight": "model.layers.16.self_attn.o_proj.weight",
181
+ "decoderLayer.16.attnLayerNorm.weight": "model.layers.16.ln1.weight",
182
+ "decoderLayer.16.feedForward.intermediateDense.weight": "model.layers.16.mlp.gate_proj.weight",
183
+ "decoderLayer.16.feedForward.outputDense.weight": "model.layers.16.mlp.down_proj.weight",
184
+ "decoderLayer.16.ffnLayerNorm.weight": "model.layers.16.ln2.weight",
185
+ "decoderLayer.16.feedForward.intermediateDense2.weight": "model.layers.16.mlp.up_proj.weight",
186
+ "decoderLayer.17.multiHeadAttention.q.weight": "model.layers.17.self_attn.q_proj.weight",
187
+ "decoderLayer.17.multiHeadAttention.k.weight": "model.layers.17.self_attn.k_proj.weight",
188
+ "decoderLayer.17.multiHeadAttention.v.weight": "model.layers.17.self_attn.v_proj.weight",
189
+ "decoderLayer.17.multiHeadAttention.o.weight": "model.layers.17.self_attn.o_proj.weight",
190
+ "decoderLayer.17.attnLayerNorm.weight": "model.layers.17.ln1.weight",
191
+ "decoderLayer.17.feedForward.intermediateDense.weight": "model.layers.17.mlp.gate_proj.weight",
192
+ "decoderLayer.17.feedForward.outputDense.weight": "model.layers.17.mlp.down_proj.weight",
193
+ "decoderLayer.17.ffnLayerNorm.weight": "model.layers.17.ln2.weight",
194
+ "decoderLayer.17.feedForward.intermediateDense2.weight": "model.layers.17.mlp.up_proj.weight",
195
+ "decoderLayer.18.multiHeadAttention.q.weight": "model.layers.18.self_attn.q_proj.weight",
196
+ "decoderLayer.18.multiHeadAttention.k.weight": "model.layers.18.self_attn.k_proj.weight",
197
+ "decoderLayer.18.multiHeadAttention.v.weight": "model.layers.18.self_attn.v_proj.weight",
198
+ "decoderLayer.18.multiHeadAttention.o.weight": "model.layers.18.self_attn.o_proj.weight",
199
+ "decoderLayer.18.attnLayerNorm.weight": "model.layers.18.ln1.weight",
200
+ "decoderLayer.18.feedForward.intermediateDense.weight": "model.layers.18.mlp.gate_proj.weight",
201
+ "decoderLayer.18.feedForward.outputDense.weight": "model.layers.18.mlp.down_proj.weight",
202
+ "decoderLayer.18.ffnLayerNorm.weight": "model.layers.18.ln2.weight",
203
+ "decoderLayer.18.feedForward.intermediateDense2.weight": "model.layers.18.mlp.up_proj.weight",
204
+ "decoderLayer.19.multiHeadAttention.q.weight": "model.layers.19.self_attn.q_proj.weight",
205
+ "decoderLayer.19.multiHeadAttention.k.weight": "model.layers.19.self_attn.k_proj.weight",
206
+ "decoderLayer.19.multiHeadAttention.v.weight": "model.layers.19.self_attn.v_proj.weight",
207
+ "decoderLayer.19.multiHeadAttention.o.weight": "model.layers.19.self_attn.o_proj.weight",
208
+ "decoderLayer.19.attnLayerNorm.weight": "model.layers.19.ln1.weight",
209
+ "decoderLayer.19.feedForward.intermediateDense.weight": "model.layers.19.mlp.gate_proj.weight",
210
+ "decoderLayer.19.feedForward.outputDense.weight": "model.layers.19.mlp.down_proj.weight",
211
+ "decoderLayer.19.ffnLayerNorm.weight": "model.layers.19.ln2.weight",
212
+ "decoderLayer.19.feedForward.intermediateDense2.weight": "model.layers.19.mlp.up_proj.weight",
213
+ "decoderLayer.20.multiHeadAttention.q.weight": "model.layers.20.self_attn.q_proj.weight",
214
+ "decoderLayer.20.multiHeadAttention.k.weight": "model.layers.20.self_attn.k_proj.weight",
215
+ "decoderLayer.20.multiHeadAttention.v.weight": "model.layers.20.self_attn.v_proj.weight",
216
+ "decoderLayer.20.multiHeadAttention.o.weight": "model.layers.20.self_attn.o_proj.weight",
217
+ "decoderLayer.20.attnLayerNorm.weight": "model.layers.20.ln1.weight",
218
+ "decoderLayer.20.feedForward.intermediateDense.weight": "model.layers.20.mlp.gate_proj.weight",
219
+ "decoderLayer.20.feedForward.outputDense.weight": "model.layers.20.mlp.down_proj.weight",
220
+ "decoderLayer.20.ffnLayerNorm.weight": "model.layers.20.ln2.weight",
221
+ "decoderLayer.20.feedForward.intermediateDense2.weight": "model.layers.20.mlp.up_proj.weight",
222
+ "decoderLayer.21.multiHeadAttention.q.weight": "model.layers.21.self_attn.q_proj.weight",
223
+ "decoderLayer.21.multiHeadAttention.k.weight": "model.layers.21.self_attn.k_proj.weight",
224
+ "decoderLayer.21.multiHeadAttention.v.weight": "model.layers.21.self_attn.v_proj.weight",
225
+ "decoderLayer.21.multiHeadAttention.o.weight": "model.layers.21.self_attn.o_proj.weight",
226
+ "decoderLayer.21.attnLayerNorm.weight": "model.layers.21.ln1.weight",
227
+ "decoderLayer.21.feedForward.intermediateDense.weight": "model.layers.21.mlp.gate_proj.weight",
228
+ "decoderLayer.21.feedForward.outputDense.weight": "model.layers.21.mlp.down_proj.weight",
229
+ "decoderLayer.21.ffnLayerNorm.weight": "model.layers.21.ln2.weight",
230
+ "decoderLayer.21.feedForward.intermediateDense2.weight": "model.layers.21.mlp.up_proj.weight",
231
+ "decoderLayer.22.multiHeadAttention.q.weight": "model.layers.22.self_attn.q_proj.weight",
232
+ "decoderLayer.22.multiHeadAttention.k.weight": "model.layers.22.self_attn.k_proj.weight",
233
+ "decoderLayer.22.multiHeadAttention.v.weight": "model.layers.22.self_attn.v_proj.weight",
234
+ "decoderLayer.22.multiHeadAttention.o.weight": "model.layers.22.self_attn.o_proj.weight",
235
+ "decoderLayer.22.attnLayerNorm.weight": "model.layers.22.ln1.weight",
236
+ "decoderLayer.22.feedForward.intermediateDense.weight": "model.layers.22.mlp.gate_proj.weight",
237
+ "decoderLayer.22.feedForward.outputDense.weight": "model.layers.22.mlp.down_proj.weight",
238
+ "decoderLayer.22.ffnLayerNorm.weight": "model.layers.22.ln2.weight",
239
+ "decoderLayer.22.feedForward.intermediateDense2.weight": "model.layers.22.mlp.up_proj.weight",
240
+ "decoderLayer.23.multiHeadAttention.q.weight": "model.layers.23.self_attn.q_proj.weight",
241
+ "decoderLayer.23.multiHeadAttention.k.weight": "model.layers.23.self_attn.k_proj.weight",
242
+ "decoderLayer.23.multiHeadAttention.v.weight": "model.layers.23.self_attn.v_proj.weight",
243
+ "decoderLayer.23.multiHeadAttention.o.weight": "model.layers.23.self_attn.o_proj.weight",
244
+ "decoderLayer.23.attnLayerNorm.weight": "model.layers.23.ln1.weight",
245
+ "decoderLayer.23.feedForward.intermediateDense.weight": "model.layers.23.mlp.gate_proj.weight",
246
+ "decoderLayer.23.feedForward.outputDense.weight": "model.layers.23.mlp.down_proj.weight",
247
+ "decoderLayer.23.ffnLayerNorm.weight": "model.layers.23.ln2.weight",
248
+ "decoderLayer.23.feedForward.intermediateDense2.weight": "model.layers.23.mlp.up_proj.weight",
249
+ "decoderLayer.24.multiHeadAttention.q.weight": "model.layers.24.self_attn.q_proj.weight",
250
+ "decoderLayer.24.multiHeadAttention.k.weight": "model.layers.24.self_attn.k_proj.weight",
251
+ "decoderLayer.24.multiHeadAttention.v.weight": "model.layers.24.self_attn.v_proj.weight",
252
+ "decoderLayer.24.multiHeadAttention.o.weight": "model.layers.24.self_attn.o_proj.weight",
253
+ "decoderLayer.24.attnLayerNorm.weight": "model.layers.24.ln1.weight",
254
+ "decoderLayer.24.feedForward.intermediateDense.weight": "model.layers.24.mlp.gate_proj.weight",
255
+ "decoderLayer.24.feedForward.outputDense.weight": "model.layers.24.mlp.down_proj.weight",
256
+ "decoderLayer.24.ffnLayerNorm.weight": "model.layers.24.ln2.weight",
257
+ "decoderLayer.24.feedForward.intermediateDense2.weight": "model.layers.24.mlp.up_proj.weight",
258
+ "decoderLayer.25.multiHeadAttention.q.weight": "model.layers.25.self_attn.q_proj.weight",
259
+ "decoderLayer.25.multiHeadAttention.k.weight": "model.layers.25.self_attn.k_proj.weight",
260
+ "decoderLayer.25.multiHeadAttention.v.weight": "model.layers.25.self_attn.v_proj.weight",
261
+ "decoderLayer.25.multiHeadAttention.o.weight": "model.layers.25.self_attn.o_proj.weight",
262
+ "decoderLayer.25.attnLayerNorm.weight": "model.layers.25.ln1.weight",
263
+ "decoderLayer.25.feedForward.intermediateDense.weight": "model.layers.25.mlp.gate_proj.weight",
264
+ "decoderLayer.25.feedForward.outputDense.weight": "model.layers.25.mlp.down_proj.weight",
265
+ "decoderLayer.25.ffnLayerNorm.weight": "model.layers.25.ln2.weight",
266
+ "decoderLayer.25.feedForward.intermediateDense2.weight": "model.layers.25.mlp.up_proj.weight",
267
+ "decoderLayer.26.multiHeadAttention.q.weight": "model.layers.26.self_attn.q_proj.weight",
268
+ "decoderLayer.26.multiHeadAttention.k.weight": "model.layers.26.self_attn.k_proj.weight",
269
+ "decoderLayer.26.multiHeadAttention.v.weight": "model.layers.26.self_attn.v_proj.weight",
270
+ "decoderLayer.26.multiHeadAttention.o.weight": "model.layers.26.self_attn.o_proj.weight",
271
+ "decoderLayer.26.attnLayerNorm.weight": "model.layers.26.ln1.weight",
272
+ "decoderLayer.26.feedForward.intermediateDense.weight": "model.layers.26.mlp.gate_proj.weight",
273
+ "decoderLayer.26.feedForward.outputDense.weight": "model.layers.26.mlp.down_proj.weight",
274
+ "decoderLayer.26.ffnLayerNorm.weight": "model.layers.26.ln2.weight",
275
+ "decoderLayer.26.feedForward.intermediateDense2.weight": "model.layers.26.mlp.up_proj.weight",
276
+ "decoderLayer.27.multiHeadAttention.q.weight": "model.layers.27.self_attn.q_proj.weight",
277
+ "decoderLayer.27.multiHeadAttention.k.weight": "model.layers.27.self_attn.k_proj.weight",
278
+ "decoderLayer.27.multiHeadAttention.v.weight": "model.layers.27.self_attn.v_proj.weight",
279
+ "decoderLayer.27.multiHeadAttention.o.weight": "model.layers.27.self_attn.o_proj.weight",
280
+ "decoderLayer.27.attnLayerNorm.weight": "model.layers.27.ln1.weight",
281
+ "decoderLayer.27.feedForward.intermediateDense.weight": "model.layers.27.mlp.gate_proj.weight",
282
+ "decoderLayer.27.feedForward.outputDense.weight": "model.layers.27.mlp.down_proj.weight",
283
+ "decoderLayer.27.ffnLayerNorm.weight": "model.layers.27.ln2.weight",
284
+ "decoderLayer.27.feedForward.intermediateDense2.weight": "model.layers.27.mlp.up_proj.weight",
285
+ "decoderLayer.28.multiHeadAttention.q.weight": "model.layers.28.self_attn.q_proj.weight",
286
+ "decoderLayer.28.multiHeadAttention.k.weight": "model.layers.28.self_attn.k_proj.weight",
287
+ "decoderLayer.28.multiHeadAttention.v.weight": "model.layers.28.self_attn.v_proj.weight",
288
+ "decoderLayer.28.multiHeadAttention.o.weight": "model.layers.28.self_attn.o_proj.weight",
289
+ "decoderLayer.28.attnLayerNorm.weight": "model.layers.28.ln1.weight",
290
+ "decoderLayer.28.feedForward.intermediateDense.weight": "model.layers.28.mlp.gate_proj.weight",
291
+ "decoderLayer.28.feedForward.outputDense.weight": "model.layers.28.mlp.down_proj.weight",
292
+ "decoderLayer.28.ffnLayerNorm.weight": "model.layers.28.ln2.weight",
293
+ "decoderLayer.28.feedForward.intermediateDense2.weight": "model.layers.28.mlp.up_proj.weight",
294
+ "decoderLayer.29.multiHeadAttention.q.weight": "model.layers.29.self_attn.q_proj.weight",
295
+ "decoderLayer.29.multiHeadAttention.k.weight": "model.layers.29.self_attn.k_proj.weight",
296
+ "decoderLayer.29.multiHeadAttention.v.weight": "model.layers.29.self_attn.v_proj.weight",
297
+ "decoderLayer.29.multiHeadAttention.o.weight": "model.layers.29.self_attn.o_proj.weight",
298
+ "decoderLayer.29.attnLayerNorm.weight": "model.layers.29.ln1.weight",
299
+ "decoderLayer.29.feedForward.intermediateDense.weight": "model.layers.29.mlp.gate_proj.weight",
300
+ "decoderLayer.29.feedForward.outputDense.weight": "model.layers.29.mlp.down_proj.weight",
301
+ "decoderLayer.29.ffnLayerNorm.weight": "model.layers.29.ln2.weight",
302
+ "decoderLayer.29.feedForward.intermediateDense2.weight": "model.layers.29.mlp.up_proj.weight",
303
+ "decoderLayer.30.multiHeadAttention.q.weight": "model.layers.30.self_attn.q_proj.weight",
304
+ "decoderLayer.30.multiHeadAttention.k.weight": "model.layers.30.self_attn.k_proj.weight",
305
+ "decoderLayer.30.multiHeadAttention.v.weight": "model.layers.30.self_attn.v_proj.weight",
306
+ "decoderLayer.30.multiHeadAttention.o.weight": "model.layers.30.self_attn.o_proj.weight",
307
+ "decoderLayer.30.attnLayerNorm.weight": "model.layers.30.ln1.weight",
308
+ "decoderLayer.30.feedForward.intermediateDense.weight": "model.layers.30.mlp.gate_proj.weight",
309
+ "decoderLayer.30.feedForward.outputDense.weight": "model.layers.30.mlp.down_proj.weight",
310
+ "decoderLayer.30.ffnLayerNorm.weight": "model.layers.30.ln2.weight",
311
+ "decoderLayer.30.feedForward.intermediateDense2.weight": "model.layers.30.mlp.up_proj.weight",
312
+ "decoderLayer.31.multiHeadAttention.q.weight": "model.layers.31.self_attn.q_proj.weight",
313
+ "decoderLayer.31.multiHeadAttention.k.weight": "model.layers.31.self_attn.k_proj.weight",
314
+ "decoderLayer.31.multiHeadAttention.v.weight": "model.layers.31.self_attn.v_proj.weight",
315
+ "decoderLayer.31.multiHeadAttention.o.weight": "model.layers.31.self_attn.o_proj.weight",
316
+ "decoderLayer.31.attnLayerNorm.weight": "model.layers.31.ln1.weight",
317
+ "decoderLayer.31.feedForward.intermediateDense.weight": "model.layers.31.mlp.gate_proj.weight",
318
+ "decoderLayer.31.feedForward.outputDense.weight": "model.layers.31.mlp.down_proj.weight",
319
+ "decoderLayer.31.ffnLayerNorm.weight": "model.layers.31.ln2.weight",
320
+ "decoderLayer.31.feedForward.intermediateDense2.weight": "model.layers.31.mlp.up_proj.weight"
321
+ }
322
+ }
01-ai/Yi-9B-200K/bert4torch_config.json CHANGED
@@ -1,461 +1,466 @@
1
- {
2
- "bos_token_id": 1,
3
- "eos_token_id": 2,
4
- "hidden_act": "silu",
5
- "hidden_size": 4096,
6
- "initializer_range": 0.02,
7
- "intermediate_size": 11008,
8
- "max_position_embeddings": 262144,
9
- "model": "llama",
10
- "num_attention_heads": 32,
11
- "num_hidden_layers": 48,
12
- "num_key_value_heads": 4,
13
- "pad_token_id": 0,
14
- "layer_norm_eps": 1e-06,
15
- "rope_theta": 10000000.0,
16
- "tie_word_embeddings": false,
17
- "torch_dtype": "bfloat16",
18
- "vocab_size": 64000,
19
- "skip_init": true,
20
- "rope_rank": "updown",
21
- "segment_vocab_size": 0,
22
- "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 262144, "eos_token_id": 2},
23
- "mapping": {
24
- "embeddings.word_embeddings.weight": "model.embed_tokens.weight",
25
- "LayerNormFinal.weight": "model.norm.weight",
26
- "lm_head.weight": "lm_head.weight",
27
- "decoderLayer.0.multiHeadAttention.q.weight": "model.layers.0.self_attn.q_proj.weight",
28
- "decoderLayer.0.multiHeadAttention.k.weight": "model.layers.0.self_attn.k_proj.weight",
29
- "decoderLayer.0.multiHeadAttention.v.weight": "model.layers.0.self_attn.v_proj.weight",
30
- "decoderLayer.0.multiHeadAttention.o.weight": "model.layers.0.self_attn.o_proj.weight",
31
- "decoderLayer.0.attnLayerNorm.weight": "model.layers.0.ln1.weight",
32
- "decoderLayer.0.feedForward.intermediateDense.weight": "model.layers.0.mlp.gate_proj.weight",
33
- "decoderLayer.0.feedForward.outputDense.weight": "model.layers.0.mlp.down_proj.weight",
34
- "decoderLayer.0.ffnLayerNorm.weight": "model.layers.0.ln2.weight",
35
- "decoderLayer.0.feedForward.intermediateDense2.weight": "model.layers.0.mlp.up_proj.weight",
36
- "decoderLayer.1.multiHeadAttention.q.weight": "model.layers.1.self_attn.q_proj.weight",
37
- "decoderLayer.1.multiHeadAttention.k.weight": "model.layers.1.self_attn.k_proj.weight",
38
- "decoderLayer.1.multiHeadAttention.v.weight": "model.layers.1.self_attn.v_proj.weight",
39
- "decoderLayer.1.multiHeadAttention.o.weight": "model.layers.1.self_attn.o_proj.weight",
40
- "decoderLayer.1.attnLayerNorm.weight": "model.layers.1.ln1.weight",
41
- "decoderLayer.1.feedForward.intermediateDense.weight": "model.layers.1.mlp.gate_proj.weight",
42
- "decoderLayer.1.feedForward.outputDense.weight": "model.layers.1.mlp.down_proj.weight",
43
- "decoderLayer.1.ffnLayerNorm.weight": "model.layers.1.ln2.weight",
44
- "decoderLayer.1.feedForward.intermediateDense2.weight": "model.layers.1.mlp.up_proj.weight",
45
- "decoderLayer.2.multiHeadAttention.q.weight": "model.layers.2.self_attn.q_proj.weight",
46
- "decoderLayer.2.multiHeadAttention.k.weight": "model.layers.2.self_attn.k_proj.weight",
47
- "decoderLayer.2.multiHeadAttention.v.weight": "model.layers.2.self_attn.v_proj.weight",
48
- "decoderLayer.2.multiHeadAttention.o.weight": "model.layers.2.self_attn.o_proj.weight",
49
- "decoderLayer.2.attnLayerNorm.weight": "model.layers.2.ln1.weight",
50
- "decoderLayer.2.feedForward.intermediateDense.weight": "model.layers.2.mlp.gate_proj.weight",
51
- "decoderLayer.2.feedForward.outputDense.weight": "model.layers.2.mlp.down_proj.weight",
52
- "decoderLayer.2.ffnLayerNorm.weight": "model.layers.2.ln2.weight",
53
- "decoderLayer.2.feedForward.intermediateDense2.weight": "model.layers.2.mlp.up_proj.weight",
54
- "decoderLayer.3.multiHeadAttention.q.weight": "model.layers.3.self_attn.q_proj.weight",
55
- "decoderLayer.3.multiHeadAttention.k.weight": "model.layers.3.self_attn.k_proj.weight",
56
- "decoderLayer.3.multiHeadAttention.v.weight": "model.layers.3.self_attn.v_proj.weight",
57
- "decoderLayer.3.multiHeadAttention.o.weight": "model.layers.3.self_attn.o_proj.weight",
58
- "decoderLayer.3.attnLayerNorm.weight": "model.layers.3.ln1.weight",
59
- "decoderLayer.3.feedForward.intermediateDense.weight": "model.layers.3.mlp.gate_proj.weight",
60
- "decoderLayer.3.feedForward.outputDense.weight": "model.layers.3.mlp.down_proj.weight",
61
- "decoderLayer.3.ffnLayerNorm.weight": "model.layers.3.ln2.weight",
62
- "decoderLayer.3.feedForward.intermediateDense2.weight": "model.layers.3.mlp.up_proj.weight",
63
- "decoderLayer.4.multiHeadAttention.q.weight": "model.layers.4.self_attn.q_proj.weight",
64
- "decoderLayer.4.multiHeadAttention.k.weight": "model.layers.4.self_attn.k_proj.weight",
65
- "decoderLayer.4.multiHeadAttention.v.weight": "model.layers.4.self_attn.v_proj.weight",
66
- "decoderLayer.4.multiHeadAttention.o.weight": "model.layers.4.self_attn.o_proj.weight",
67
- "decoderLayer.4.attnLayerNorm.weight": "model.layers.4.ln1.weight",
68
- "decoderLayer.4.feedForward.intermediateDense.weight": "model.layers.4.mlp.gate_proj.weight",
69
- "decoderLayer.4.feedForward.outputDense.weight": "model.layers.4.mlp.down_proj.weight",
70
- "decoderLayer.4.ffnLayerNorm.weight": "model.layers.4.ln2.weight",
71
- "decoderLayer.4.feedForward.intermediateDense2.weight": "model.layers.4.mlp.up_proj.weight",
72
- "decoderLayer.5.multiHeadAttention.q.weight": "model.layers.5.self_attn.q_proj.weight",
73
- "decoderLayer.5.multiHeadAttention.k.weight": "model.layers.5.self_attn.k_proj.weight",
74
- "decoderLayer.5.multiHeadAttention.v.weight": "model.layers.5.self_attn.v_proj.weight",
75
- "decoderLayer.5.multiHeadAttention.o.weight": "model.layers.5.self_attn.o_proj.weight",
76
- "decoderLayer.5.attnLayerNorm.weight": "model.layers.5.ln1.weight",
77
- "decoderLayer.5.feedForward.intermediateDense.weight": "model.layers.5.mlp.gate_proj.weight",
78
- "decoderLayer.5.feedForward.outputDense.weight": "model.layers.5.mlp.down_proj.weight",
79
- "decoderLayer.5.ffnLayerNorm.weight": "model.layers.5.ln2.weight",
80
- "decoderLayer.5.feedForward.intermediateDense2.weight": "model.layers.5.mlp.up_proj.weight",
81
- "decoderLayer.6.multiHeadAttention.q.weight": "model.layers.6.self_attn.q_proj.weight",
82
- "decoderLayer.6.multiHeadAttention.k.weight": "model.layers.6.self_attn.k_proj.weight",
83
- "decoderLayer.6.multiHeadAttention.v.weight": "model.layers.6.self_attn.v_proj.weight",
84
- "decoderLayer.6.multiHeadAttention.o.weight": "model.layers.6.self_attn.o_proj.weight",
85
- "decoderLayer.6.attnLayerNorm.weight": "model.layers.6.ln1.weight",
86
- "decoderLayer.6.feedForward.intermediateDense.weight": "model.layers.6.mlp.gate_proj.weight",
87
- "decoderLayer.6.feedForward.outputDense.weight": "model.layers.6.mlp.down_proj.weight",
88
- "decoderLayer.6.ffnLayerNorm.weight": "model.layers.6.ln2.weight",
89
- "decoderLayer.6.feedForward.intermediateDense2.weight": "model.layers.6.mlp.up_proj.weight",
90
- "decoderLayer.7.multiHeadAttention.q.weight": "model.layers.7.self_attn.q_proj.weight",
91
- "decoderLayer.7.multiHeadAttention.k.weight": "model.layers.7.self_attn.k_proj.weight",
92
- "decoderLayer.7.multiHeadAttention.v.weight": "model.layers.7.self_attn.v_proj.weight",
93
- "decoderLayer.7.multiHeadAttention.o.weight": "model.layers.7.self_attn.o_proj.weight",
94
- "decoderLayer.7.attnLayerNorm.weight": "model.layers.7.ln1.weight",
95
- "decoderLayer.7.feedForward.intermediateDense.weight": "model.layers.7.mlp.gate_proj.weight",
96
- "decoderLayer.7.feedForward.outputDense.weight": "model.layers.7.mlp.down_proj.weight",
97
- "decoderLayer.7.ffnLayerNorm.weight": "model.layers.7.ln2.weight",
98
- "decoderLayer.7.feedForward.intermediateDense2.weight": "model.layers.7.mlp.up_proj.weight",
99
- "decoderLayer.8.multiHeadAttention.q.weight": "model.layers.8.self_attn.q_proj.weight",
100
- "decoderLayer.8.multiHeadAttention.k.weight": "model.layers.8.self_attn.k_proj.weight",
101
- "decoderLayer.8.multiHeadAttention.v.weight": "model.layers.8.self_attn.v_proj.weight",
102
- "decoderLayer.8.multiHeadAttention.o.weight": "model.layers.8.self_attn.o_proj.weight",
103
- "decoderLayer.8.attnLayerNorm.weight": "model.layers.8.ln1.weight",
104
- "decoderLayer.8.feedForward.intermediateDense.weight": "model.layers.8.mlp.gate_proj.weight",
105
- "decoderLayer.8.feedForward.outputDense.weight": "model.layers.8.mlp.down_proj.weight",
106
- "decoderLayer.8.ffnLayerNorm.weight": "model.layers.8.ln2.weight",
107
- "decoderLayer.8.feedForward.intermediateDense2.weight": "model.layers.8.mlp.up_proj.weight",
108
- "decoderLayer.9.multiHeadAttention.q.weight": "model.layers.9.self_attn.q_proj.weight",
109
- "decoderLayer.9.multiHeadAttention.k.weight": "model.layers.9.self_attn.k_proj.weight",
110
- "decoderLayer.9.multiHeadAttention.v.weight": "model.layers.9.self_attn.v_proj.weight",
111
- "decoderLayer.9.multiHeadAttention.o.weight": "model.layers.9.self_attn.o_proj.weight",
112
- "decoderLayer.9.attnLayerNorm.weight": "model.layers.9.ln1.weight",
113
- "decoderLayer.9.feedForward.intermediateDense.weight": "model.layers.9.mlp.gate_proj.weight",
114
- "decoderLayer.9.feedForward.outputDense.weight": "model.layers.9.mlp.down_proj.weight",
115
- "decoderLayer.9.ffnLayerNorm.weight": "model.layers.9.ln2.weight",
116
- "decoderLayer.9.feedForward.intermediateDense2.weight": "model.layers.9.mlp.up_proj.weight",
117
- "decoderLayer.10.multiHeadAttention.q.weight": "model.layers.10.self_attn.q_proj.weight",
118
- "decoderLayer.10.multiHeadAttention.k.weight": "model.layers.10.self_attn.k_proj.weight",
119
- "decoderLayer.10.multiHeadAttention.v.weight": "model.layers.10.self_attn.v_proj.weight",
120
- "decoderLayer.10.multiHeadAttention.o.weight": "model.layers.10.self_attn.o_proj.weight",
121
- "decoderLayer.10.attnLayerNorm.weight": "model.layers.10.ln1.weight",
122
- "decoderLayer.10.feedForward.intermediateDense.weight": "model.layers.10.mlp.gate_proj.weight",
123
- "decoderLayer.10.feedForward.outputDense.weight": "model.layers.10.mlp.down_proj.weight",
124
- "decoderLayer.10.ffnLayerNorm.weight": "model.layers.10.ln2.weight",
125
- "decoderLayer.10.feedForward.intermediateDense2.weight": "model.layers.10.mlp.up_proj.weight",
126
- "decoderLayer.11.multiHeadAttention.q.weight": "model.layers.11.self_attn.q_proj.weight",
127
- "decoderLayer.11.multiHeadAttention.k.weight": "model.layers.11.self_attn.k_proj.weight",
128
- "decoderLayer.11.multiHeadAttention.v.weight": "model.layers.11.self_attn.v_proj.weight",
129
- "decoderLayer.11.multiHeadAttention.o.weight": "model.layers.11.self_attn.o_proj.weight",
130
- "decoderLayer.11.attnLayerNorm.weight": "model.layers.11.ln1.weight",
131
- "decoderLayer.11.feedForward.intermediateDense.weight": "model.layers.11.mlp.gate_proj.weight",
132
- "decoderLayer.11.feedForward.outputDense.weight": "model.layers.11.mlp.down_proj.weight",
133
- "decoderLayer.11.ffnLayerNorm.weight": "model.layers.11.ln2.weight",
134
- "decoderLayer.11.feedForward.intermediateDense2.weight": "model.layers.11.mlp.up_proj.weight",
135
- "decoderLayer.12.multiHeadAttention.q.weight": "model.layers.12.self_attn.q_proj.weight",
136
- "decoderLayer.12.multiHeadAttention.k.weight": "model.layers.12.self_attn.k_proj.weight",
137
- "decoderLayer.12.multiHeadAttention.v.weight": "model.layers.12.self_attn.v_proj.weight",
138
- "decoderLayer.12.multiHeadAttention.o.weight": "model.layers.12.self_attn.o_proj.weight",
139
- "decoderLayer.12.attnLayerNorm.weight": "model.layers.12.ln1.weight",
140
- "decoderLayer.12.feedForward.intermediateDense.weight": "model.layers.12.mlp.gate_proj.weight",
141
- "decoderLayer.12.feedForward.outputDense.weight": "model.layers.12.mlp.down_proj.weight",
142
- "decoderLayer.12.ffnLayerNorm.weight": "model.layers.12.ln2.weight",
143
- "decoderLayer.12.feedForward.intermediateDense2.weight": "model.layers.12.mlp.up_proj.weight",
144
- "decoderLayer.13.multiHeadAttention.q.weight": "model.layers.13.self_attn.q_proj.weight",
145
- "decoderLayer.13.multiHeadAttention.k.weight": "model.layers.13.self_attn.k_proj.weight",
146
- "decoderLayer.13.multiHeadAttention.v.weight": "model.layers.13.self_attn.v_proj.weight",
147
- "decoderLayer.13.multiHeadAttention.o.weight": "model.layers.13.self_attn.o_proj.weight",
148
- "decoderLayer.13.attnLayerNorm.weight": "model.layers.13.ln1.weight",
149
- "decoderLayer.13.feedForward.intermediateDense.weight": "model.layers.13.mlp.gate_proj.weight",
150
- "decoderLayer.13.feedForward.outputDense.weight": "model.layers.13.mlp.down_proj.weight",
151
- "decoderLayer.13.ffnLayerNorm.weight": "model.layers.13.ln2.weight",
152
- "decoderLayer.13.feedForward.intermediateDense2.weight": "model.layers.13.mlp.up_proj.weight",
153
- "decoderLayer.14.multiHeadAttention.q.weight": "model.layers.14.self_attn.q_proj.weight",
154
- "decoderLayer.14.multiHeadAttention.k.weight": "model.layers.14.self_attn.k_proj.weight",
155
- "decoderLayer.14.multiHeadAttention.v.weight": "model.layers.14.self_attn.v_proj.weight",
156
- "decoderLayer.14.multiHeadAttention.o.weight": "model.layers.14.self_attn.o_proj.weight",
157
- "decoderLayer.14.attnLayerNorm.weight": "model.layers.14.ln1.weight",
158
- "decoderLayer.14.feedForward.intermediateDense.weight": "model.layers.14.mlp.gate_proj.weight",
159
- "decoderLayer.14.feedForward.outputDense.weight": "model.layers.14.mlp.down_proj.weight",
160
- "decoderLayer.14.ffnLayerNorm.weight": "model.layers.14.ln2.weight",
161
- "decoderLayer.14.feedForward.intermediateDense2.weight": "model.layers.14.mlp.up_proj.weight",
162
- "decoderLayer.15.multiHeadAttention.q.weight": "model.layers.15.self_attn.q_proj.weight",
163
- "decoderLayer.15.multiHeadAttention.k.weight": "model.layers.15.self_attn.k_proj.weight",
164
- "decoderLayer.15.multiHeadAttention.v.weight": "model.layers.15.self_attn.v_proj.weight",
165
- "decoderLayer.15.multiHeadAttention.o.weight": "model.layers.15.self_attn.o_proj.weight",
166
- "decoderLayer.15.attnLayerNorm.weight": "model.layers.15.ln1.weight",
167
- "decoderLayer.15.feedForward.intermediateDense.weight": "model.layers.15.mlp.gate_proj.weight",
168
- "decoderLayer.15.feedForward.outputDense.weight": "model.layers.15.mlp.down_proj.weight",
169
- "decoderLayer.15.ffnLayerNorm.weight": "model.layers.15.ln2.weight",
170
- "decoderLayer.15.feedForward.intermediateDense2.weight": "model.layers.15.mlp.up_proj.weight",
171
- "decoderLayer.16.multiHeadAttention.q.weight": "model.layers.16.self_attn.q_proj.weight",
172
- "decoderLayer.16.multiHeadAttention.k.weight": "model.layers.16.self_attn.k_proj.weight",
173
- "decoderLayer.16.multiHeadAttention.v.weight": "model.layers.16.self_attn.v_proj.weight",
174
- "decoderLayer.16.multiHeadAttention.o.weight": "model.layers.16.self_attn.o_proj.weight",
175
- "decoderLayer.16.attnLayerNorm.weight": "model.layers.16.ln1.weight",
176
- "decoderLayer.16.feedForward.intermediateDense.weight": "model.layers.16.mlp.gate_proj.weight",
177
- "decoderLayer.16.feedForward.outputDense.weight": "model.layers.16.mlp.down_proj.weight",
178
- "decoderLayer.16.ffnLayerNorm.weight": "model.layers.16.ln2.weight",
179
- "decoderLayer.16.feedForward.intermediateDense2.weight": "model.layers.16.mlp.up_proj.weight",
180
- "decoderLayer.17.multiHeadAttention.q.weight": "model.layers.17.self_attn.q_proj.weight",
181
- "decoderLayer.17.multiHeadAttention.k.weight": "model.layers.17.self_attn.k_proj.weight",
182
- "decoderLayer.17.multiHeadAttention.v.weight": "model.layers.17.self_attn.v_proj.weight",
183
- "decoderLayer.17.multiHeadAttention.o.weight": "model.layers.17.self_attn.o_proj.weight",
184
- "decoderLayer.17.attnLayerNorm.weight": "model.layers.17.ln1.weight",
185
- "decoderLayer.17.feedForward.intermediateDense.weight": "model.layers.17.mlp.gate_proj.weight",
186
- "decoderLayer.17.feedForward.outputDense.weight": "model.layers.17.mlp.down_proj.weight",
187
- "decoderLayer.17.ffnLayerNorm.weight": "model.layers.17.ln2.weight",
188
- "decoderLayer.17.feedForward.intermediateDense2.weight": "model.layers.17.mlp.up_proj.weight",
189
- "decoderLayer.18.multiHeadAttention.q.weight": "model.layers.18.self_attn.q_proj.weight",
190
- "decoderLayer.18.multiHeadAttention.k.weight": "model.layers.18.self_attn.k_proj.weight",
191
- "decoderLayer.18.multiHeadAttention.v.weight": "model.layers.18.self_attn.v_proj.weight",
192
- "decoderLayer.18.multiHeadAttention.o.weight": "model.layers.18.self_attn.o_proj.weight",
193
- "decoderLayer.18.attnLayerNorm.weight": "model.layers.18.ln1.weight",
194
- "decoderLayer.18.feedForward.intermediateDense.weight": "model.layers.18.mlp.gate_proj.weight",
195
- "decoderLayer.18.feedForward.outputDense.weight": "model.layers.18.mlp.down_proj.weight",
196
- "decoderLayer.18.ffnLayerNorm.weight": "model.layers.18.ln2.weight",
197
- "decoderLayer.18.feedForward.intermediateDense2.weight": "model.layers.18.mlp.up_proj.weight",
198
- "decoderLayer.19.multiHeadAttention.q.weight": "model.layers.19.self_attn.q_proj.weight",
199
- "decoderLayer.19.multiHeadAttention.k.weight": "model.layers.19.self_attn.k_proj.weight",
200
- "decoderLayer.19.multiHeadAttention.v.weight": "model.layers.19.self_attn.v_proj.weight",
201
- "decoderLayer.19.multiHeadAttention.o.weight": "model.layers.19.self_attn.o_proj.weight",
202
- "decoderLayer.19.attnLayerNorm.weight": "model.layers.19.ln1.weight",
203
- "decoderLayer.19.feedForward.intermediateDense.weight": "model.layers.19.mlp.gate_proj.weight",
204
- "decoderLayer.19.feedForward.outputDense.weight": "model.layers.19.mlp.down_proj.weight",
205
- "decoderLayer.19.ffnLayerNorm.weight": "model.layers.19.ln2.weight",
206
- "decoderLayer.19.feedForward.intermediateDense2.weight": "model.layers.19.mlp.up_proj.weight",
207
- "decoderLayer.20.multiHeadAttention.q.weight": "model.layers.20.self_attn.q_proj.weight",
208
- "decoderLayer.20.multiHeadAttention.k.weight": "model.layers.20.self_attn.k_proj.weight",
209
- "decoderLayer.20.multiHeadAttention.v.weight": "model.layers.20.self_attn.v_proj.weight",
210
- "decoderLayer.20.multiHeadAttention.o.weight": "model.layers.20.self_attn.o_proj.weight",
211
- "decoderLayer.20.attnLayerNorm.weight": "model.layers.20.ln1.weight",
212
- "decoderLayer.20.feedForward.intermediateDense.weight": "model.layers.20.mlp.gate_proj.weight",
213
- "decoderLayer.20.feedForward.outputDense.weight": "model.layers.20.mlp.down_proj.weight",
214
- "decoderLayer.20.ffnLayerNorm.weight": "model.layers.20.ln2.weight",
215
- "decoderLayer.20.feedForward.intermediateDense2.weight": "model.layers.20.mlp.up_proj.weight",
216
- "decoderLayer.21.multiHeadAttention.q.weight": "model.layers.21.self_attn.q_proj.weight",
217
- "decoderLayer.21.multiHeadAttention.k.weight": "model.layers.21.self_attn.k_proj.weight",
218
- "decoderLayer.21.multiHeadAttention.v.weight": "model.layers.21.self_attn.v_proj.weight",
219
- "decoderLayer.21.multiHeadAttention.o.weight": "model.layers.21.self_attn.o_proj.weight",
220
- "decoderLayer.21.attnLayerNorm.weight": "model.layers.21.ln1.weight",
221
- "decoderLayer.21.feedForward.intermediateDense.weight": "model.layers.21.mlp.gate_proj.weight",
222
- "decoderLayer.21.feedForward.outputDense.weight": "model.layers.21.mlp.down_proj.weight",
223
- "decoderLayer.21.ffnLayerNorm.weight": "model.layers.21.ln2.weight",
224
- "decoderLayer.21.feedForward.intermediateDense2.weight": "model.layers.21.mlp.up_proj.weight",
225
- "decoderLayer.22.multiHeadAttention.q.weight": "model.layers.22.self_attn.q_proj.weight",
226
- "decoderLayer.22.multiHeadAttention.k.weight": "model.layers.22.self_attn.k_proj.weight",
227
- "decoderLayer.22.multiHeadAttention.v.weight": "model.layers.22.self_attn.v_proj.weight",
228
- "decoderLayer.22.multiHeadAttention.o.weight": "model.layers.22.self_attn.o_proj.weight",
229
- "decoderLayer.22.attnLayerNorm.weight": "model.layers.22.ln1.weight",
230
- "decoderLayer.22.feedForward.intermediateDense.weight": "model.layers.22.mlp.gate_proj.weight",
231
- "decoderLayer.22.feedForward.outputDense.weight": "model.layers.22.mlp.down_proj.weight",
232
- "decoderLayer.22.ffnLayerNorm.weight": "model.layers.22.ln2.weight",
233
- "decoderLayer.22.feedForward.intermediateDense2.weight": "model.layers.22.mlp.up_proj.weight",
234
- "decoderLayer.23.multiHeadAttention.q.weight": "model.layers.23.self_attn.q_proj.weight",
235
- "decoderLayer.23.multiHeadAttention.k.weight": "model.layers.23.self_attn.k_proj.weight",
236
- "decoderLayer.23.multiHeadAttention.v.weight": "model.layers.23.self_attn.v_proj.weight",
237
- "decoderLayer.23.multiHeadAttention.o.weight": "model.layers.23.self_attn.o_proj.weight",
238
- "decoderLayer.23.attnLayerNorm.weight": "model.layers.23.ln1.weight",
239
- "decoderLayer.23.feedForward.intermediateDense.weight": "model.layers.23.mlp.gate_proj.weight",
240
- "decoderLayer.23.feedForward.outputDense.weight": "model.layers.23.mlp.down_proj.weight",
241
- "decoderLayer.23.ffnLayerNorm.weight": "model.layers.23.ln2.weight",
242
- "decoderLayer.23.feedForward.intermediateDense2.weight": "model.layers.23.mlp.up_proj.weight",
243
- "decoderLayer.24.multiHeadAttention.q.weight": "model.layers.24.self_attn.q_proj.weight",
244
- "decoderLayer.24.multiHeadAttention.k.weight": "model.layers.24.self_attn.k_proj.weight",
245
- "decoderLayer.24.multiHeadAttention.v.weight": "model.layers.24.self_attn.v_proj.weight",
246
- "decoderLayer.24.multiHeadAttention.o.weight": "model.layers.24.self_attn.o_proj.weight",
247
- "decoderLayer.24.attnLayerNorm.weight": "model.layers.24.ln1.weight",
248
- "decoderLayer.24.feedForward.intermediateDense.weight": "model.layers.24.mlp.gate_proj.weight",
249
- "decoderLayer.24.feedForward.outputDense.weight": "model.layers.24.mlp.down_proj.weight",
250
- "decoderLayer.24.ffnLayerNorm.weight": "model.layers.24.ln2.weight",
251
- "decoderLayer.24.feedForward.intermediateDense2.weight": "model.layers.24.mlp.up_proj.weight",
252
- "decoderLayer.25.multiHeadAttention.q.weight": "model.layers.25.self_attn.q_proj.weight",
253
- "decoderLayer.25.multiHeadAttention.k.weight": "model.layers.25.self_attn.k_proj.weight",
254
- "decoderLayer.25.multiHeadAttention.v.weight": "model.layers.25.self_attn.v_proj.weight",
255
- "decoderLayer.25.multiHeadAttention.o.weight": "model.layers.25.self_attn.o_proj.weight",
256
- "decoderLayer.25.attnLayerNorm.weight": "model.layers.25.ln1.weight",
257
- "decoderLayer.25.feedForward.intermediateDense.weight": "model.layers.25.mlp.gate_proj.weight",
258
- "decoderLayer.25.feedForward.outputDense.weight": "model.layers.25.mlp.down_proj.weight",
259
- "decoderLayer.25.ffnLayerNorm.weight": "model.layers.25.ln2.weight",
260
- "decoderLayer.25.feedForward.intermediateDense2.weight": "model.layers.25.mlp.up_proj.weight",
261
- "decoderLayer.26.multiHeadAttention.q.weight": "model.layers.26.self_attn.q_proj.weight",
262
- "decoderLayer.26.multiHeadAttention.k.weight": "model.layers.26.self_attn.k_proj.weight",
263
- "decoderLayer.26.multiHeadAttention.v.weight": "model.layers.26.self_attn.v_proj.weight",
264
- "decoderLayer.26.multiHeadAttention.o.weight": "model.layers.26.self_attn.o_proj.weight",
265
- "decoderLayer.26.attnLayerNorm.weight": "model.layers.26.ln1.weight",
266
- "decoderLayer.26.feedForward.intermediateDense.weight": "model.layers.26.mlp.gate_proj.weight",
267
- "decoderLayer.26.feedForward.outputDense.weight": "model.layers.26.mlp.down_proj.weight",
268
- "decoderLayer.26.ffnLayerNorm.weight": "model.layers.26.ln2.weight",
269
- "decoderLayer.26.feedForward.intermediateDense2.weight": "model.layers.26.mlp.up_proj.weight",
270
- "decoderLayer.27.multiHeadAttention.q.weight": "model.layers.27.self_attn.q_proj.weight",
271
- "decoderLayer.27.multiHeadAttention.k.weight": "model.layers.27.self_attn.k_proj.weight",
272
- "decoderLayer.27.multiHeadAttention.v.weight": "model.layers.27.self_attn.v_proj.weight",
273
- "decoderLayer.27.multiHeadAttention.o.weight": "model.layers.27.self_attn.o_proj.weight",
274
- "decoderLayer.27.attnLayerNorm.weight": "model.layers.27.ln1.weight",
275
- "decoderLayer.27.feedForward.intermediateDense.weight": "model.layers.27.mlp.gate_proj.weight",
276
- "decoderLayer.27.feedForward.outputDense.weight": "model.layers.27.mlp.down_proj.weight",
277
- "decoderLayer.27.ffnLayerNorm.weight": "model.layers.27.ln2.weight",
278
- "decoderLayer.27.feedForward.intermediateDense2.weight": "model.layers.27.mlp.up_proj.weight",
279
- "decoderLayer.28.multiHeadAttention.q.weight": "model.layers.28.self_attn.q_proj.weight",
280
- "decoderLayer.28.multiHeadAttention.k.weight": "model.layers.28.self_attn.k_proj.weight",
281
- "decoderLayer.28.multiHeadAttention.v.weight": "model.layers.28.self_attn.v_proj.weight",
282
- "decoderLayer.28.multiHeadAttention.o.weight": "model.layers.28.self_attn.o_proj.weight",
283
- "decoderLayer.28.attnLayerNorm.weight": "model.layers.28.ln1.weight",
284
- "decoderLayer.28.feedForward.intermediateDense.weight": "model.layers.28.mlp.gate_proj.weight",
285
- "decoderLayer.28.feedForward.outputDense.weight": "model.layers.28.mlp.down_proj.weight",
286
- "decoderLayer.28.ffnLayerNorm.weight": "model.layers.28.ln2.weight",
287
- "decoderLayer.28.feedForward.intermediateDense2.weight": "model.layers.28.mlp.up_proj.weight",
288
- "decoderLayer.29.multiHeadAttention.q.weight": "model.layers.29.self_attn.q_proj.weight",
289
- "decoderLayer.29.multiHeadAttention.k.weight": "model.layers.29.self_attn.k_proj.weight",
290
- "decoderLayer.29.multiHeadAttention.v.weight": "model.layers.29.self_attn.v_proj.weight",
291
- "decoderLayer.29.multiHeadAttention.o.weight": "model.layers.29.self_attn.o_proj.weight",
292
- "decoderLayer.29.attnLayerNorm.weight": "model.layers.29.ln1.weight",
293
- "decoderLayer.29.feedForward.intermediateDense.weight": "model.layers.29.mlp.gate_proj.weight",
294
- "decoderLayer.29.feedForward.outputDense.weight": "model.layers.29.mlp.down_proj.weight",
295
- "decoderLayer.29.ffnLayerNorm.weight": "model.layers.29.ln2.weight",
296
- "decoderLayer.29.feedForward.intermediateDense2.weight": "model.layers.29.mlp.up_proj.weight",
297
- "decoderLayer.30.multiHeadAttention.q.weight": "model.layers.30.self_attn.q_proj.weight",
298
- "decoderLayer.30.multiHeadAttention.k.weight": "model.layers.30.self_attn.k_proj.weight",
299
- "decoderLayer.30.multiHeadAttention.v.weight": "model.layers.30.self_attn.v_proj.weight",
300
- "decoderLayer.30.multiHeadAttention.o.weight": "model.layers.30.self_attn.o_proj.weight",
301
- "decoderLayer.30.attnLayerNorm.weight": "model.layers.30.ln1.weight",
302
- "decoderLayer.30.feedForward.intermediateDense.weight": "model.layers.30.mlp.gate_proj.weight",
303
- "decoderLayer.30.feedForward.outputDense.weight": "model.layers.30.mlp.down_proj.weight",
304
- "decoderLayer.30.ffnLayerNorm.weight": "model.layers.30.ln2.weight",
305
- "decoderLayer.30.feedForward.intermediateDense2.weight": "model.layers.30.mlp.up_proj.weight",
306
- "decoderLayer.31.multiHeadAttention.q.weight": "model.layers.31.self_attn.q_proj.weight",
307
- "decoderLayer.31.multiHeadAttention.k.weight": "model.layers.31.self_attn.k_proj.weight",
308
- "decoderLayer.31.multiHeadAttention.v.weight": "model.layers.31.self_attn.v_proj.weight",
309
- "decoderLayer.31.multiHeadAttention.o.weight": "model.layers.31.self_attn.o_proj.weight",
310
- "decoderLayer.31.attnLayerNorm.weight": "model.layers.31.ln1.weight",
311
- "decoderLayer.31.feedForward.intermediateDense.weight": "model.layers.31.mlp.gate_proj.weight",
312
- "decoderLayer.31.feedForward.outputDense.weight": "model.layers.31.mlp.down_proj.weight",
313
- "decoderLayer.31.ffnLayerNorm.weight": "model.layers.31.ln2.weight",
314
- "decoderLayer.31.feedForward.intermediateDense2.weight": "model.layers.31.mlp.up_proj.weight",
315
- "decoderLayer.32.multiHeadAttention.q.weight": "model.layers.32.self_attn.q_proj.weight",
316
- "decoderLayer.32.multiHeadAttention.k.weight": "model.layers.32.self_attn.k_proj.weight",
317
- "decoderLayer.32.multiHeadAttention.v.weight": "model.layers.32.self_attn.v_proj.weight",
318
- "decoderLayer.32.multiHeadAttention.o.weight": "model.layers.32.self_attn.o_proj.weight",
319
- "decoderLayer.32.attnLayerNorm.weight": "model.layers.32.ln1.weight",
320
- "decoderLayer.32.feedForward.intermediateDense.weight": "model.layers.32.mlp.gate_proj.weight",
321
- "decoderLayer.32.feedForward.outputDense.weight": "model.layers.32.mlp.down_proj.weight",
322
- "decoderLayer.32.ffnLayerNorm.weight": "model.layers.32.ln2.weight",
323
- "decoderLayer.32.feedForward.intermediateDense2.weight": "model.layers.32.mlp.up_proj.weight",
324
- "decoderLayer.33.multiHeadAttention.q.weight": "model.layers.33.self_attn.q_proj.weight",
325
- "decoderLayer.33.multiHeadAttention.k.weight": "model.layers.33.self_attn.k_proj.weight",
326
- "decoderLayer.33.multiHeadAttention.v.weight": "model.layers.33.self_attn.v_proj.weight",
327
- "decoderLayer.33.multiHeadAttention.o.weight": "model.layers.33.self_attn.o_proj.weight",
328
- "decoderLayer.33.attnLayerNorm.weight": "model.layers.33.ln1.weight",
329
- "decoderLayer.33.feedForward.intermediateDense.weight": "model.layers.33.mlp.gate_proj.weight",
330
- "decoderLayer.33.feedForward.outputDense.weight": "model.layers.33.mlp.down_proj.weight",
331
- "decoderLayer.33.ffnLayerNorm.weight": "model.layers.33.ln2.weight",
332
- "decoderLayer.33.feedForward.intermediateDense2.weight": "model.layers.33.mlp.up_proj.weight",
333
- "decoderLayer.34.multiHeadAttention.q.weight": "model.layers.34.self_attn.q_proj.weight",
334
- "decoderLayer.34.multiHeadAttention.k.weight": "model.layers.34.self_attn.k_proj.weight",
335
- "decoderLayer.34.multiHeadAttention.v.weight": "model.layers.34.self_attn.v_proj.weight",
336
- "decoderLayer.34.multiHeadAttention.o.weight": "model.layers.34.self_attn.o_proj.weight",
337
- "decoderLayer.34.attnLayerNorm.weight": "model.layers.34.ln1.weight",
338
- "decoderLayer.34.feedForward.intermediateDense.weight": "model.layers.34.mlp.gate_proj.weight",
339
- "decoderLayer.34.feedForward.outputDense.weight": "model.layers.34.mlp.down_proj.weight",
340
- "decoderLayer.34.ffnLayerNorm.weight": "model.layers.34.ln2.weight",
341
- "decoderLayer.34.feedForward.intermediateDense2.weight": "model.layers.34.mlp.up_proj.weight",
342
- "decoderLayer.35.multiHeadAttention.q.weight": "model.layers.35.self_attn.q_proj.weight",
343
- "decoderLayer.35.multiHeadAttention.k.weight": "model.layers.35.self_attn.k_proj.weight",
344
- "decoderLayer.35.multiHeadAttention.v.weight": "model.layers.35.self_attn.v_proj.weight",
345
- "decoderLayer.35.multiHeadAttention.o.weight": "model.layers.35.self_attn.o_proj.weight",
346
- "decoderLayer.35.attnLayerNorm.weight": "model.layers.35.ln1.weight",
347
- "decoderLayer.35.feedForward.intermediateDense.weight": "model.layers.35.mlp.gate_proj.weight",
348
- "decoderLayer.35.feedForward.outputDense.weight": "model.layers.35.mlp.down_proj.weight",
349
- "decoderLayer.35.ffnLayerNorm.weight": "model.layers.35.ln2.weight",
350
- "decoderLayer.35.feedForward.intermediateDense2.weight": "model.layers.35.mlp.up_proj.weight",
351
- "decoderLayer.36.multiHeadAttention.q.weight": "model.layers.36.self_attn.q_proj.weight",
352
- "decoderLayer.36.multiHeadAttention.k.weight": "model.layers.36.self_attn.k_proj.weight",
353
- "decoderLayer.36.multiHeadAttention.v.weight": "model.layers.36.self_attn.v_proj.weight",
354
- "decoderLayer.36.multiHeadAttention.o.weight": "model.layers.36.self_attn.o_proj.weight",
355
- "decoderLayer.36.attnLayerNorm.weight": "model.layers.36.ln1.weight",
356
- "decoderLayer.36.feedForward.intermediateDense.weight": "model.layers.36.mlp.gate_proj.weight",
357
- "decoderLayer.36.feedForward.outputDense.weight": "model.layers.36.mlp.down_proj.weight",
358
- "decoderLayer.36.ffnLayerNorm.weight": "model.layers.36.ln2.weight",
359
- "decoderLayer.36.feedForward.intermediateDense2.weight": "model.layers.36.mlp.up_proj.weight",
360
- "decoderLayer.37.multiHeadAttention.q.weight": "model.layers.37.self_attn.q_proj.weight",
361
- "decoderLayer.37.multiHeadAttention.k.weight": "model.layers.37.self_attn.k_proj.weight",
362
- "decoderLayer.37.multiHeadAttention.v.weight": "model.layers.37.self_attn.v_proj.weight",
363
- "decoderLayer.37.multiHeadAttention.o.weight": "model.layers.37.self_attn.o_proj.weight",
364
- "decoderLayer.37.attnLayerNorm.weight": "model.layers.37.ln1.weight",
365
- "decoderLayer.37.feedForward.intermediateDense.weight": "model.layers.37.mlp.gate_proj.weight",
366
- "decoderLayer.37.feedForward.outputDense.weight": "model.layers.37.mlp.down_proj.weight",
367
- "decoderLayer.37.ffnLayerNorm.weight": "model.layers.37.ln2.weight",
368
- "decoderLayer.37.feedForward.intermediateDense2.weight": "model.layers.37.mlp.up_proj.weight",
369
- "decoderLayer.38.multiHeadAttention.q.weight": "model.layers.38.self_attn.q_proj.weight",
370
- "decoderLayer.38.multiHeadAttention.k.weight": "model.layers.38.self_attn.k_proj.weight",
371
- "decoderLayer.38.multiHeadAttention.v.weight": "model.layers.38.self_attn.v_proj.weight",
372
- "decoderLayer.38.multiHeadAttention.o.weight": "model.layers.38.self_attn.o_proj.weight",
373
- "decoderLayer.38.attnLayerNorm.weight": "model.layers.38.ln1.weight",
374
- "decoderLayer.38.feedForward.intermediateDense.weight": "model.layers.38.mlp.gate_proj.weight",
375
- "decoderLayer.38.feedForward.outputDense.weight": "model.layers.38.mlp.down_proj.weight",
376
- "decoderLayer.38.ffnLayerNorm.weight": "model.layers.38.ln2.weight",
377
- "decoderLayer.38.feedForward.intermediateDense2.weight": "model.layers.38.mlp.up_proj.weight",
378
- "decoderLayer.39.multiHeadAttention.q.weight": "model.layers.39.self_attn.q_proj.weight",
379
- "decoderLayer.39.multiHeadAttention.k.weight": "model.layers.39.self_attn.k_proj.weight",
380
- "decoderLayer.39.multiHeadAttention.v.weight": "model.layers.39.self_attn.v_proj.weight",
381
- "decoderLayer.39.multiHeadAttention.o.weight": "model.layers.39.self_attn.o_proj.weight",
382
- "decoderLayer.39.attnLayerNorm.weight": "model.layers.39.ln1.weight",
383
- "decoderLayer.39.feedForward.intermediateDense.weight": "model.layers.39.mlp.gate_proj.weight",
384
- "decoderLayer.39.feedForward.outputDense.weight": "model.layers.39.mlp.down_proj.weight",
385
- "decoderLayer.39.ffnLayerNorm.weight": "model.layers.39.ln2.weight",
386
- "decoderLayer.39.feedForward.intermediateDense2.weight": "model.layers.39.mlp.up_proj.weight",
387
- "decoderLayer.40.multiHeadAttention.q.weight": "model.layers.40.self_attn.q_proj.weight",
388
- "decoderLayer.40.multiHeadAttention.k.weight": "model.layers.40.self_attn.k_proj.weight",
389
- "decoderLayer.40.multiHeadAttention.v.weight": "model.layers.40.self_attn.v_proj.weight",
390
- "decoderLayer.40.multiHeadAttention.o.weight": "model.layers.40.self_attn.o_proj.weight",
391
- "decoderLayer.40.attnLayerNorm.weight": "model.layers.40.ln1.weight",
392
- "decoderLayer.40.feedForward.intermediateDense.weight": "model.layers.40.mlp.gate_proj.weight",
393
- "decoderLayer.40.feedForward.outputDense.weight": "model.layers.40.mlp.down_proj.weight",
394
- "decoderLayer.40.ffnLayerNorm.weight": "model.layers.40.ln2.weight",
395
- "decoderLayer.40.feedForward.intermediateDense2.weight": "model.layers.40.mlp.up_proj.weight",
396
- "decoderLayer.41.multiHeadAttention.q.weight": "model.layers.41.self_attn.q_proj.weight",
397
- "decoderLayer.41.multiHeadAttention.k.weight": "model.layers.41.self_attn.k_proj.weight",
398
- "decoderLayer.41.multiHeadAttention.v.weight": "model.layers.41.self_attn.v_proj.weight",
399
- "decoderLayer.41.multiHeadAttention.o.weight": "model.layers.41.self_attn.o_proj.weight",
400
- "decoderLayer.41.attnLayerNorm.weight": "model.layers.41.ln1.weight",
401
- "decoderLayer.41.feedForward.intermediateDense.weight": "model.layers.41.mlp.gate_proj.weight",
402
- "decoderLayer.41.feedForward.outputDense.weight": "model.layers.41.mlp.down_proj.weight",
403
- "decoderLayer.41.ffnLayerNorm.weight": "model.layers.41.ln2.weight",
404
- "decoderLayer.41.feedForward.intermediateDense2.weight": "model.layers.41.mlp.up_proj.weight",
405
- "decoderLayer.42.multiHeadAttention.q.weight": "model.layers.42.self_attn.q_proj.weight",
406
- "decoderLayer.42.multiHeadAttention.k.weight": "model.layers.42.self_attn.k_proj.weight",
407
- "decoderLayer.42.multiHeadAttention.v.weight": "model.layers.42.self_attn.v_proj.weight",
408
- "decoderLayer.42.multiHeadAttention.o.weight": "model.layers.42.self_attn.o_proj.weight",
409
- "decoderLayer.42.attnLayerNorm.weight": "model.layers.42.ln1.weight",
410
- "decoderLayer.42.feedForward.intermediateDense.weight": "model.layers.42.mlp.gate_proj.weight",
411
- "decoderLayer.42.feedForward.outputDense.weight": "model.layers.42.mlp.down_proj.weight",
412
- "decoderLayer.42.ffnLayerNorm.weight": "model.layers.42.ln2.weight",
413
- "decoderLayer.42.feedForward.intermediateDense2.weight": "model.layers.42.mlp.up_proj.weight",
414
- "decoderLayer.43.multiHeadAttention.q.weight": "model.layers.43.self_attn.q_proj.weight",
415
- "decoderLayer.43.multiHeadAttention.k.weight": "model.layers.43.self_attn.k_proj.weight",
416
- "decoderLayer.43.multiHeadAttention.v.weight": "model.layers.43.self_attn.v_proj.weight",
417
- "decoderLayer.43.multiHeadAttention.o.weight": "model.layers.43.self_attn.o_proj.weight",
418
- "decoderLayer.43.attnLayerNorm.weight": "model.layers.43.ln1.weight",
419
- "decoderLayer.43.feedForward.intermediateDense.weight": "model.layers.43.mlp.gate_proj.weight",
420
- "decoderLayer.43.feedForward.outputDense.weight": "model.layers.43.mlp.down_proj.weight",
421
- "decoderLayer.43.ffnLayerNorm.weight": "model.layers.43.ln2.weight",
422
- "decoderLayer.43.feedForward.intermediateDense2.weight": "model.layers.43.mlp.up_proj.weight",
423
- "decoderLayer.44.multiHeadAttention.q.weight": "model.layers.44.self_attn.q_proj.weight",
424
- "decoderLayer.44.multiHeadAttention.k.weight": "model.layers.44.self_attn.k_proj.weight",
425
- "decoderLayer.44.multiHeadAttention.v.weight": "model.layers.44.self_attn.v_proj.weight",
426
- "decoderLayer.44.multiHeadAttention.o.weight": "model.layers.44.self_attn.o_proj.weight",
427
- "decoderLayer.44.attnLayerNorm.weight": "model.layers.44.ln1.weight",
428
- "decoderLayer.44.feedForward.intermediateDense.weight": "model.layers.44.mlp.gate_proj.weight",
429
- "decoderLayer.44.feedForward.outputDense.weight": "model.layers.44.mlp.down_proj.weight",
430
- "decoderLayer.44.ffnLayerNorm.weight": "model.layers.44.ln2.weight",
431
- "decoderLayer.44.feedForward.intermediateDense2.weight": "model.layers.44.mlp.up_proj.weight",
432
- "decoderLayer.45.multiHeadAttention.q.weight": "model.layers.45.self_attn.q_proj.weight",
433
- "decoderLayer.45.multiHeadAttention.k.weight": "model.layers.45.self_attn.k_proj.weight",
434
- "decoderLayer.45.multiHeadAttention.v.weight": "model.layers.45.self_attn.v_proj.weight",
435
- "decoderLayer.45.multiHeadAttention.o.weight": "model.layers.45.self_attn.o_proj.weight",
436
- "decoderLayer.45.attnLayerNorm.weight": "model.layers.45.ln1.weight",
437
- "decoderLayer.45.feedForward.intermediateDense.weight": "model.layers.45.mlp.gate_proj.weight",
438
- "decoderLayer.45.feedForward.outputDense.weight": "model.layers.45.mlp.down_proj.weight",
439
- "decoderLayer.45.ffnLayerNorm.weight": "model.layers.45.ln2.weight",
440
- "decoderLayer.45.feedForward.intermediateDense2.weight": "model.layers.45.mlp.up_proj.weight",
441
- "decoderLayer.46.multiHeadAttention.q.weight": "model.layers.46.self_attn.q_proj.weight",
442
- "decoderLayer.46.multiHeadAttention.k.weight": "model.layers.46.self_attn.k_proj.weight",
443
- "decoderLayer.46.multiHeadAttention.v.weight": "model.layers.46.self_attn.v_proj.weight",
444
- "decoderLayer.46.multiHeadAttention.o.weight": "model.layers.46.self_attn.o_proj.weight",
445
- "decoderLayer.46.attnLayerNorm.weight": "model.layers.46.ln1.weight",
446
- "decoderLayer.46.feedForward.intermediateDense.weight": "model.layers.46.mlp.gate_proj.weight",
447
- "decoderLayer.46.feedForward.outputDense.weight": "model.layers.46.mlp.down_proj.weight",
448
- "decoderLayer.46.ffnLayerNorm.weight": "model.layers.46.ln2.weight",
449
- "decoderLayer.46.feedForward.intermediateDense2.weight": "model.layers.46.mlp.up_proj.weight",
450
- "decoderLayer.47.multiHeadAttention.q.weight": "model.layers.47.self_attn.q_proj.weight",
451
- "decoderLayer.47.multiHeadAttention.k.weight": "model.layers.47.self_attn.k_proj.weight",
452
- "decoderLayer.47.multiHeadAttention.v.weight": "model.layers.47.self_attn.v_proj.weight",
453
- "decoderLayer.47.multiHeadAttention.o.weight": "model.layers.47.self_attn.o_proj.weight",
454
- "decoderLayer.47.attnLayerNorm.weight": "model.layers.47.ln1.weight",
455
- "decoderLayer.47.feedForward.intermediateDense.weight": "model.layers.47.mlp.gate_proj.weight",
456
- "decoderLayer.47.feedForward.outputDense.weight": "model.layers.47.mlp.down_proj.weight",
457
- "decoderLayer.47.ffnLayerNorm.weight": "model.layers.47.ln2.weight",
458
- "decoderLayer.47.feedForward.intermediateDense2.weight": "model.layers.47.mlp.up_proj.weight"
459
- }
460
- }
461
-
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": 2,
4
+ "hidden_act": "silu",
5
+ "hidden_size": 4096,
6
+ "initializer_range": 0.02,
7
+ "intermediate_size": 11008,
8
+ "max_position_embeddings": 262144,
9
+ "model": "llama",
10
+ "num_attention_heads": 32,
11
+ "num_hidden_layers": 48,
12
+ "num_key_value_heads": 4,
13
+ "pad_token_id": 0,
14
+ "layer_norm_eps": 1e-06,
15
+ "rope_theta": 10000000.0,
16
+ "tie_word_embeddings": false,
17
+ "torch_dtype": "bfloat16",
18
+ "vocab_size": 64000,
19
+ "skip_init": true,
20
+ "rope_rank": "updown",
21
+ "segment_vocab_size": 0,
22
+ "generation_config": {
23
+ "tokenizer_decode_config": {
24
+ "skip_special_tokens": true
25
+ },
26
+ "max_length": 262144,
27
+ "eos_token_id": 2
28
+ },
29
+ "mapping": {
30
+ "embeddings.word_embeddings.weight": "model.embed_tokens.weight",
31
+ "LayerNormFinal.weight": "model.norm.weight",
32
+ "lm_head.weight": "lm_head.weight",
33
+ "decoderLayer.0.multiHeadAttention.q.weight": "model.layers.0.self_attn.q_proj.weight",
34
+ "decoderLayer.0.multiHeadAttention.k.weight": "model.layers.0.self_attn.k_proj.weight",
35
+ "decoderLayer.0.multiHeadAttention.v.weight": "model.layers.0.self_attn.v_proj.weight",
36
+ "decoderLayer.0.multiHeadAttention.o.weight": "model.layers.0.self_attn.o_proj.weight",
37
+ "decoderLayer.0.attnLayerNorm.weight": "model.layers.0.ln1.weight",
38
+ "decoderLayer.0.feedForward.intermediateDense.weight": "model.layers.0.mlp.gate_proj.weight",
39
+ "decoderLayer.0.feedForward.outputDense.weight": "model.layers.0.mlp.down_proj.weight",
40
+ "decoderLayer.0.ffnLayerNorm.weight": "model.layers.0.ln2.weight",
41
+ "decoderLayer.0.feedForward.intermediateDense2.weight": "model.layers.0.mlp.up_proj.weight",
42
+ "decoderLayer.1.multiHeadAttention.q.weight": "model.layers.1.self_attn.q_proj.weight",
43
+ "decoderLayer.1.multiHeadAttention.k.weight": "model.layers.1.self_attn.k_proj.weight",
44
+ "decoderLayer.1.multiHeadAttention.v.weight": "model.layers.1.self_attn.v_proj.weight",
45
+ "decoderLayer.1.multiHeadAttention.o.weight": "model.layers.1.self_attn.o_proj.weight",
46
+ "decoderLayer.1.attnLayerNorm.weight": "model.layers.1.ln1.weight",
47
+ "decoderLayer.1.feedForward.intermediateDense.weight": "model.layers.1.mlp.gate_proj.weight",
48
+ "decoderLayer.1.feedForward.outputDense.weight": "model.layers.1.mlp.down_proj.weight",
49
+ "decoderLayer.1.ffnLayerNorm.weight": "model.layers.1.ln2.weight",
50
+ "decoderLayer.1.feedForward.intermediateDense2.weight": "model.layers.1.mlp.up_proj.weight",
51
+ "decoderLayer.2.multiHeadAttention.q.weight": "model.layers.2.self_attn.q_proj.weight",
52
+ "decoderLayer.2.multiHeadAttention.k.weight": "model.layers.2.self_attn.k_proj.weight",
53
+ "decoderLayer.2.multiHeadAttention.v.weight": "model.layers.2.self_attn.v_proj.weight",
54
+ "decoderLayer.2.multiHeadAttention.o.weight": "model.layers.2.self_attn.o_proj.weight",
55
+ "decoderLayer.2.attnLayerNorm.weight": "model.layers.2.ln1.weight",
56
+ "decoderLayer.2.feedForward.intermediateDense.weight": "model.layers.2.mlp.gate_proj.weight",
57
+ "decoderLayer.2.feedForward.outputDense.weight": "model.layers.2.mlp.down_proj.weight",
58
+ "decoderLayer.2.ffnLayerNorm.weight": "model.layers.2.ln2.weight",
59
+ "decoderLayer.2.feedForward.intermediateDense2.weight": "model.layers.2.mlp.up_proj.weight",
60
+ "decoderLayer.3.multiHeadAttention.q.weight": "model.layers.3.self_attn.q_proj.weight",
61
+ "decoderLayer.3.multiHeadAttention.k.weight": "model.layers.3.self_attn.k_proj.weight",
62
+ "decoderLayer.3.multiHeadAttention.v.weight": "model.layers.3.self_attn.v_proj.weight",
63
+ "decoderLayer.3.multiHeadAttention.o.weight": "model.layers.3.self_attn.o_proj.weight",
64
+ "decoderLayer.3.attnLayerNorm.weight": "model.layers.3.ln1.weight",
65
+ "decoderLayer.3.feedForward.intermediateDense.weight": "model.layers.3.mlp.gate_proj.weight",
66
+ "decoderLayer.3.feedForward.outputDense.weight": "model.layers.3.mlp.down_proj.weight",
67
+ "decoderLayer.3.ffnLayerNorm.weight": "model.layers.3.ln2.weight",
68
+ "decoderLayer.3.feedForward.intermediateDense2.weight": "model.layers.3.mlp.up_proj.weight",
69
+ "decoderLayer.4.multiHeadAttention.q.weight": "model.layers.4.self_attn.q_proj.weight",
70
+ "decoderLayer.4.multiHeadAttention.k.weight": "model.layers.4.self_attn.k_proj.weight",
71
+ "decoderLayer.4.multiHeadAttention.v.weight": "model.layers.4.self_attn.v_proj.weight",
72
+ "decoderLayer.4.multiHeadAttention.o.weight": "model.layers.4.self_attn.o_proj.weight",
73
+ "decoderLayer.4.attnLayerNorm.weight": "model.layers.4.ln1.weight",
74
+ "decoderLayer.4.feedForward.intermediateDense.weight": "model.layers.4.mlp.gate_proj.weight",
75
+ "decoderLayer.4.feedForward.outputDense.weight": "model.layers.4.mlp.down_proj.weight",
76
+ "decoderLayer.4.ffnLayerNorm.weight": "model.layers.4.ln2.weight",
77
+ "decoderLayer.4.feedForward.intermediateDense2.weight": "model.layers.4.mlp.up_proj.weight",
78
+ "decoderLayer.5.multiHeadAttention.q.weight": "model.layers.5.self_attn.q_proj.weight",
79
+ "decoderLayer.5.multiHeadAttention.k.weight": "model.layers.5.self_attn.k_proj.weight",
80
+ "decoderLayer.5.multiHeadAttention.v.weight": "model.layers.5.self_attn.v_proj.weight",
81
+ "decoderLayer.5.multiHeadAttention.o.weight": "model.layers.5.self_attn.o_proj.weight",
82
+ "decoderLayer.5.attnLayerNorm.weight": "model.layers.5.ln1.weight",
83
+ "decoderLayer.5.feedForward.intermediateDense.weight": "model.layers.5.mlp.gate_proj.weight",
84
+ "decoderLayer.5.feedForward.outputDense.weight": "model.layers.5.mlp.down_proj.weight",
85
+ "decoderLayer.5.ffnLayerNorm.weight": "model.layers.5.ln2.weight",
86
+ "decoderLayer.5.feedForward.intermediateDense2.weight": "model.layers.5.mlp.up_proj.weight",
87
+ "decoderLayer.6.multiHeadAttention.q.weight": "model.layers.6.self_attn.q_proj.weight",
88
+ "decoderLayer.6.multiHeadAttention.k.weight": "model.layers.6.self_attn.k_proj.weight",
89
+ "decoderLayer.6.multiHeadAttention.v.weight": "model.layers.6.self_attn.v_proj.weight",
90
+ "decoderLayer.6.multiHeadAttention.o.weight": "model.layers.6.self_attn.o_proj.weight",
91
+ "decoderLayer.6.attnLayerNorm.weight": "model.layers.6.ln1.weight",
92
+ "decoderLayer.6.feedForward.intermediateDense.weight": "model.layers.6.mlp.gate_proj.weight",
93
+ "decoderLayer.6.feedForward.outputDense.weight": "model.layers.6.mlp.down_proj.weight",
94
+ "decoderLayer.6.ffnLayerNorm.weight": "model.layers.6.ln2.weight",
95
+ "decoderLayer.6.feedForward.intermediateDense2.weight": "model.layers.6.mlp.up_proj.weight",
96
+ "decoderLayer.7.multiHeadAttention.q.weight": "model.layers.7.self_attn.q_proj.weight",
97
+ "decoderLayer.7.multiHeadAttention.k.weight": "model.layers.7.self_attn.k_proj.weight",
98
+ "decoderLayer.7.multiHeadAttention.v.weight": "model.layers.7.self_attn.v_proj.weight",
99
+ "decoderLayer.7.multiHeadAttention.o.weight": "model.layers.7.self_attn.o_proj.weight",
100
+ "decoderLayer.7.attnLayerNorm.weight": "model.layers.7.ln1.weight",
101
+ "decoderLayer.7.feedForward.intermediateDense.weight": "model.layers.7.mlp.gate_proj.weight",
102
+ "decoderLayer.7.feedForward.outputDense.weight": "model.layers.7.mlp.down_proj.weight",
103
+ "decoderLayer.7.ffnLayerNorm.weight": "model.layers.7.ln2.weight",
104
+ "decoderLayer.7.feedForward.intermediateDense2.weight": "model.layers.7.mlp.up_proj.weight",
105
+ "decoderLayer.8.multiHeadAttention.q.weight": "model.layers.8.self_attn.q_proj.weight",
106
+ "decoderLayer.8.multiHeadAttention.k.weight": "model.layers.8.self_attn.k_proj.weight",
107
+ "decoderLayer.8.multiHeadAttention.v.weight": "model.layers.8.self_attn.v_proj.weight",
108
+ "decoderLayer.8.multiHeadAttention.o.weight": "model.layers.8.self_attn.o_proj.weight",
109
+ "decoderLayer.8.attnLayerNorm.weight": "model.layers.8.ln1.weight",
110
+ "decoderLayer.8.feedForward.intermediateDense.weight": "model.layers.8.mlp.gate_proj.weight",
111
+ "decoderLayer.8.feedForward.outputDense.weight": "model.layers.8.mlp.down_proj.weight",
112
+ "decoderLayer.8.ffnLayerNorm.weight": "model.layers.8.ln2.weight",
113
+ "decoderLayer.8.feedForward.intermediateDense2.weight": "model.layers.8.mlp.up_proj.weight",
114
+ "decoderLayer.9.multiHeadAttention.q.weight": "model.layers.9.self_attn.q_proj.weight",
115
+ "decoderLayer.9.multiHeadAttention.k.weight": "model.layers.9.self_attn.k_proj.weight",
116
+ "decoderLayer.9.multiHeadAttention.v.weight": "model.layers.9.self_attn.v_proj.weight",
117
+ "decoderLayer.9.multiHeadAttention.o.weight": "model.layers.9.self_attn.o_proj.weight",
118
+ "decoderLayer.9.attnLayerNorm.weight": "model.layers.9.ln1.weight",
119
+ "decoderLayer.9.feedForward.intermediateDense.weight": "model.layers.9.mlp.gate_proj.weight",
120
+ "decoderLayer.9.feedForward.outputDense.weight": "model.layers.9.mlp.down_proj.weight",
121
+ "decoderLayer.9.ffnLayerNorm.weight": "model.layers.9.ln2.weight",
122
+ "decoderLayer.9.feedForward.intermediateDense2.weight": "model.layers.9.mlp.up_proj.weight",
123
+ "decoderLayer.10.multiHeadAttention.q.weight": "model.layers.10.self_attn.q_proj.weight",
124
+ "decoderLayer.10.multiHeadAttention.k.weight": "model.layers.10.self_attn.k_proj.weight",
125
+ "decoderLayer.10.multiHeadAttention.v.weight": "model.layers.10.self_attn.v_proj.weight",
126
+ "decoderLayer.10.multiHeadAttention.o.weight": "model.layers.10.self_attn.o_proj.weight",
127
+ "decoderLayer.10.attnLayerNorm.weight": "model.layers.10.ln1.weight",
128
+ "decoderLayer.10.feedForward.intermediateDense.weight": "model.layers.10.mlp.gate_proj.weight",
129
+ "decoderLayer.10.feedForward.outputDense.weight": "model.layers.10.mlp.down_proj.weight",
130
+ "decoderLayer.10.ffnLayerNorm.weight": "model.layers.10.ln2.weight",
131
+ "decoderLayer.10.feedForward.intermediateDense2.weight": "model.layers.10.mlp.up_proj.weight",
132
+ "decoderLayer.11.multiHeadAttention.q.weight": "model.layers.11.self_attn.q_proj.weight",
133
+ "decoderLayer.11.multiHeadAttention.k.weight": "model.layers.11.self_attn.k_proj.weight",
134
+ "decoderLayer.11.multiHeadAttention.v.weight": "model.layers.11.self_attn.v_proj.weight",
135
+ "decoderLayer.11.multiHeadAttention.o.weight": "model.layers.11.self_attn.o_proj.weight",
136
+ "decoderLayer.11.attnLayerNorm.weight": "model.layers.11.ln1.weight",
137
+ "decoderLayer.11.feedForward.intermediateDense.weight": "model.layers.11.mlp.gate_proj.weight",
138
+ "decoderLayer.11.feedForward.outputDense.weight": "model.layers.11.mlp.down_proj.weight",
139
+ "decoderLayer.11.ffnLayerNorm.weight": "model.layers.11.ln2.weight",
140
+ "decoderLayer.11.feedForward.intermediateDense2.weight": "model.layers.11.mlp.up_proj.weight",
141
+ "decoderLayer.12.multiHeadAttention.q.weight": "model.layers.12.self_attn.q_proj.weight",
142
+ "decoderLayer.12.multiHeadAttention.k.weight": "model.layers.12.self_attn.k_proj.weight",
143
+ "decoderLayer.12.multiHeadAttention.v.weight": "model.layers.12.self_attn.v_proj.weight",
144
+ "decoderLayer.12.multiHeadAttention.o.weight": "model.layers.12.self_attn.o_proj.weight",
145
+ "decoderLayer.12.attnLayerNorm.weight": "model.layers.12.ln1.weight",
146
+ "decoderLayer.12.feedForward.intermediateDense.weight": "model.layers.12.mlp.gate_proj.weight",
147
+ "decoderLayer.12.feedForward.outputDense.weight": "model.layers.12.mlp.down_proj.weight",
148
+ "decoderLayer.12.ffnLayerNorm.weight": "model.layers.12.ln2.weight",
149
+ "decoderLayer.12.feedForward.intermediateDense2.weight": "model.layers.12.mlp.up_proj.weight",
150
+ "decoderLayer.13.multiHeadAttention.q.weight": "model.layers.13.self_attn.q_proj.weight",
151
+ "decoderLayer.13.multiHeadAttention.k.weight": "model.layers.13.self_attn.k_proj.weight",
152
+ "decoderLayer.13.multiHeadAttention.v.weight": "model.layers.13.self_attn.v_proj.weight",
153
+ "decoderLayer.13.multiHeadAttention.o.weight": "model.layers.13.self_attn.o_proj.weight",
154
+ "decoderLayer.13.attnLayerNorm.weight": "model.layers.13.ln1.weight",
155
+ "decoderLayer.13.feedForward.intermediateDense.weight": "model.layers.13.mlp.gate_proj.weight",
156
+ "decoderLayer.13.feedForward.outputDense.weight": "model.layers.13.mlp.down_proj.weight",
157
+ "decoderLayer.13.ffnLayerNorm.weight": "model.layers.13.ln2.weight",
158
+ "decoderLayer.13.feedForward.intermediateDense2.weight": "model.layers.13.mlp.up_proj.weight",
159
+ "decoderLayer.14.multiHeadAttention.q.weight": "model.layers.14.self_attn.q_proj.weight",
160
+ "decoderLayer.14.multiHeadAttention.k.weight": "model.layers.14.self_attn.k_proj.weight",
161
+ "decoderLayer.14.multiHeadAttention.v.weight": "model.layers.14.self_attn.v_proj.weight",
162
+ "decoderLayer.14.multiHeadAttention.o.weight": "model.layers.14.self_attn.o_proj.weight",
163
+ "decoderLayer.14.attnLayerNorm.weight": "model.layers.14.ln1.weight",
164
+ "decoderLayer.14.feedForward.intermediateDense.weight": "model.layers.14.mlp.gate_proj.weight",
165
+ "decoderLayer.14.feedForward.outputDense.weight": "model.layers.14.mlp.down_proj.weight",
166
+ "decoderLayer.14.ffnLayerNorm.weight": "model.layers.14.ln2.weight",
167
+ "decoderLayer.14.feedForward.intermediateDense2.weight": "model.layers.14.mlp.up_proj.weight",
168
+ "decoderLayer.15.multiHeadAttention.q.weight": "model.layers.15.self_attn.q_proj.weight",
169
+ "decoderLayer.15.multiHeadAttention.k.weight": "model.layers.15.self_attn.k_proj.weight",
170
+ "decoderLayer.15.multiHeadAttention.v.weight": "model.layers.15.self_attn.v_proj.weight",
171
+ "decoderLayer.15.multiHeadAttention.o.weight": "model.layers.15.self_attn.o_proj.weight",
172
+ "decoderLayer.15.attnLayerNorm.weight": "model.layers.15.ln1.weight",
173
+ "decoderLayer.15.feedForward.intermediateDense.weight": "model.layers.15.mlp.gate_proj.weight",
174
+ "decoderLayer.15.feedForward.outputDense.weight": "model.layers.15.mlp.down_proj.weight",
175
+ "decoderLayer.15.ffnLayerNorm.weight": "model.layers.15.ln2.weight",
176
+ "decoderLayer.15.feedForward.intermediateDense2.weight": "model.layers.15.mlp.up_proj.weight",
177
+ "decoderLayer.16.multiHeadAttention.q.weight": "model.layers.16.self_attn.q_proj.weight",
178
+ "decoderLayer.16.multiHeadAttention.k.weight": "model.layers.16.self_attn.k_proj.weight",
179
+ "decoderLayer.16.multiHeadAttention.v.weight": "model.layers.16.self_attn.v_proj.weight",
180
+ "decoderLayer.16.multiHeadAttention.o.weight": "model.layers.16.self_attn.o_proj.weight",
181
+ "decoderLayer.16.attnLayerNorm.weight": "model.layers.16.ln1.weight",
182
+ "decoderLayer.16.feedForward.intermediateDense.weight": "model.layers.16.mlp.gate_proj.weight",
183
+ "decoderLayer.16.feedForward.outputDense.weight": "model.layers.16.mlp.down_proj.weight",
184
+ "decoderLayer.16.ffnLayerNorm.weight": "model.layers.16.ln2.weight",
185
+ "decoderLayer.16.feedForward.intermediateDense2.weight": "model.layers.16.mlp.up_proj.weight",
186
+ "decoderLayer.17.multiHeadAttention.q.weight": "model.layers.17.self_attn.q_proj.weight",
187
+ "decoderLayer.17.multiHeadAttention.k.weight": "model.layers.17.self_attn.k_proj.weight",
188
+ "decoderLayer.17.multiHeadAttention.v.weight": "model.layers.17.self_attn.v_proj.weight",
189
+ "decoderLayer.17.multiHeadAttention.o.weight": "model.layers.17.self_attn.o_proj.weight",
190
+ "decoderLayer.17.attnLayerNorm.weight": "model.layers.17.ln1.weight",
191
+ "decoderLayer.17.feedForward.intermediateDense.weight": "model.layers.17.mlp.gate_proj.weight",
192
+ "decoderLayer.17.feedForward.outputDense.weight": "model.layers.17.mlp.down_proj.weight",
193
+ "decoderLayer.17.ffnLayerNorm.weight": "model.layers.17.ln2.weight",
194
+ "decoderLayer.17.feedForward.intermediateDense2.weight": "model.layers.17.mlp.up_proj.weight",
195
+ "decoderLayer.18.multiHeadAttention.q.weight": "model.layers.18.self_attn.q_proj.weight",
196
+ "decoderLayer.18.multiHeadAttention.k.weight": "model.layers.18.self_attn.k_proj.weight",
197
+ "decoderLayer.18.multiHeadAttention.v.weight": "model.layers.18.self_attn.v_proj.weight",
198
+ "decoderLayer.18.multiHeadAttention.o.weight": "model.layers.18.self_attn.o_proj.weight",
199
+ "decoderLayer.18.attnLayerNorm.weight": "model.layers.18.ln1.weight",
200
+ "decoderLayer.18.feedForward.intermediateDense.weight": "model.layers.18.mlp.gate_proj.weight",
201
+ "decoderLayer.18.feedForward.outputDense.weight": "model.layers.18.mlp.down_proj.weight",
202
+ "decoderLayer.18.ffnLayerNorm.weight": "model.layers.18.ln2.weight",
203
+ "decoderLayer.18.feedForward.intermediateDense2.weight": "model.layers.18.mlp.up_proj.weight",
204
+ "decoderLayer.19.multiHeadAttention.q.weight": "model.layers.19.self_attn.q_proj.weight",
205
+ "decoderLayer.19.multiHeadAttention.k.weight": "model.layers.19.self_attn.k_proj.weight",
206
+ "decoderLayer.19.multiHeadAttention.v.weight": "model.layers.19.self_attn.v_proj.weight",
207
+ "decoderLayer.19.multiHeadAttention.o.weight": "model.layers.19.self_attn.o_proj.weight",
208
+ "decoderLayer.19.attnLayerNorm.weight": "model.layers.19.ln1.weight",
209
+ "decoderLayer.19.feedForward.intermediateDense.weight": "model.layers.19.mlp.gate_proj.weight",
210
+ "decoderLayer.19.feedForward.outputDense.weight": "model.layers.19.mlp.down_proj.weight",
211
+ "decoderLayer.19.ffnLayerNorm.weight": "model.layers.19.ln2.weight",
212
+ "decoderLayer.19.feedForward.intermediateDense2.weight": "model.layers.19.mlp.up_proj.weight",
213
+ "decoderLayer.20.multiHeadAttention.q.weight": "model.layers.20.self_attn.q_proj.weight",
214
+ "decoderLayer.20.multiHeadAttention.k.weight": "model.layers.20.self_attn.k_proj.weight",
215
+ "decoderLayer.20.multiHeadAttention.v.weight": "model.layers.20.self_attn.v_proj.weight",
216
+ "decoderLayer.20.multiHeadAttention.o.weight": "model.layers.20.self_attn.o_proj.weight",
217
+ "decoderLayer.20.attnLayerNorm.weight": "model.layers.20.ln1.weight",
218
+ "decoderLayer.20.feedForward.intermediateDense.weight": "model.layers.20.mlp.gate_proj.weight",
219
+ "decoderLayer.20.feedForward.outputDense.weight": "model.layers.20.mlp.down_proj.weight",
220
+ "decoderLayer.20.ffnLayerNorm.weight": "model.layers.20.ln2.weight",
221
+ "decoderLayer.20.feedForward.intermediateDense2.weight": "model.layers.20.mlp.up_proj.weight",
222
+ "decoderLayer.21.multiHeadAttention.q.weight": "model.layers.21.self_attn.q_proj.weight",
223
+ "decoderLayer.21.multiHeadAttention.k.weight": "model.layers.21.self_attn.k_proj.weight",
224
+ "decoderLayer.21.multiHeadAttention.v.weight": "model.layers.21.self_attn.v_proj.weight",
225
+ "decoderLayer.21.multiHeadAttention.o.weight": "model.layers.21.self_attn.o_proj.weight",
226
+ "decoderLayer.21.attnLayerNorm.weight": "model.layers.21.ln1.weight",
227
+ "decoderLayer.21.feedForward.intermediateDense.weight": "model.layers.21.mlp.gate_proj.weight",
228
+ "decoderLayer.21.feedForward.outputDense.weight": "model.layers.21.mlp.down_proj.weight",
229
+ "decoderLayer.21.ffnLayerNorm.weight": "model.layers.21.ln2.weight",
230
+ "decoderLayer.21.feedForward.intermediateDense2.weight": "model.layers.21.mlp.up_proj.weight",
231
+ "decoderLayer.22.multiHeadAttention.q.weight": "model.layers.22.self_attn.q_proj.weight",
232
+ "decoderLayer.22.multiHeadAttention.k.weight": "model.layers.22.self_attn.k_proj.weight",
233
+ "decoderLayer.22.multiHeadAttention.v.weight": "model.layers.22.self_attn.v_proj.weight",
234
+ "decoderLayer.22.multiHeadAttention.o.weight": "model.layers.22.self_attn.o_proj.weight",
235
+ "decoderLayer.22.attnLayerNorm.weight": "model.layers.22.ln1.weight",
236
+ "decoderLayer.22.feedForward.intermediateDense.weight": "model.layers.22.mlp.gate_proj.weight",
237
+ "decoderLayer.22.feedForward.outputDense.weight": "model.layers.22.mlp.down_proj.weight",
238
+ "decoderLayer.22.ffnLayerNorm.weight": "model.layers.22.ln2.weight",
239
+ "decoderLayer.22.feedForward.intermediateDense2.weight": "model.layers.22.mlp.up_proj.weight",
240
+ "decoderLayer.23.multiHeadAttention.q.weight": "model.layers.23.self_attn.q_proj.weight",
241
+ "decoderLayer.23.multiHeadAttention.k.weight": "model.layers.23.self_attn.k_proj.weight",
242
+ "decoderLayer.23.multiHeadAttention.v.weight": "model.layers.23.self_attn.v_proj.weight",
243
+ "decoderLayer.23.multiHeadAttention.o.weight": "model.layers.23.self_attn.o_proj.weight",
244
+ "decoderLayer.23.attnLayerNorm.weight": "model.layers.23.ln1.weight",
245
+ "decoderLayer.23.feedForward.intermediateDense.weight": "model.layers.23.mlp.gate_proj.weight",
246
+ "decoderLayer.23.feedForward.outputDense.weight": "model.layers.23.mlp.down_proj.weight",
247
+ "decoderLayer.23.ffnLayerNorm.weight": "model.layers.23.ln2.weight",
248
+ "decoderLayer.23.feedForward.intermediateDense2.weight": "model.layers.23.mlp.up_proj.weight",
249
+ "decoderLayer.24.multiHeadAttention.q.weight": "model.layers.24.self_attn.q_proj.weight",
250
+ "decoderLayer.24.multiHeadAttention.k.weight": "model.layers.24.self_attn.k_proj.weight",
251
+ "decoderLayer.24.multiHeadAttention.v.weight": "model.layers.24.self_attn.v_proj.weight",
252
+ "decoderLayer.24.multiHeadAttention.o.weight": "model.layers.24.self_attn.o_proj.weight",
253
+ "decoderLayer.24.attnLayerNorm.weight": "model.layers.24.ln1.weight",
254
+ "decoderLayer.24.feedForward.intermediateDense.weight": "model.layers.24.mlp.gate_proj.weight",
255
+ "decoderLayer.24.feedForward.outputDense.weight": "model.layers.24.mlp.down_proj.weight",
256
+ "decoderLayer.24.ffnLayerNorm.weight": "model.layers.24.ln2.weight",
257
+ "decoderLayer.24.feedForward.intermediateDense2.weight": "model.layers.24.mlp.up_proj.weight",
258
+ "decoderLayer.25.multiHeadAttention.q.weight": "model.layers.25.self_attn.q_proj.weight",
259
+ "decoderLayer.25.multiHeadAttention.k.weight": "model.layers.25.self_attn.k_proj.weight",
260
+ "decoderLayer.25.multiHeadAttention.v.weight": "model.layers.25.self_attn.v_proj.weight",
261
+ "decoderLayer.25.multiHeadAttention.o.weight": "model.layers.25.self_attn.o_proj.weight",
262
+ "decoderLayer.25.attnLayerNorm.weight": "model.layers.25.ln1.weight",
263
+ "decoderLayer.25.feedForward.intermediateDense.weight": "model.layers.25.mlp.gate_proj.weight",
264
+ "decoderLayer.25.feedForward.outputDense.weight": "model.layers.25.mlp.down_proj.weight",
265
+ "decoderLayer.25.ffnLayerNorm.weight": "model.layers.25.ln2.weight",
266
+ "decoderLayer.25.feedForward.intermediateDense2.weight": "model.layers.25.mlp.up_proj.weight",
267
+ "decoderLayer.26.multiHeadAttention.q.weight": "model.layers.26.self_attn.q_proj.weight",
268
+ "decoderLayer.26.multiHeadAttention.k.weight": "model.layers.26.self_attn.k_proj.weight",
269
+ "decoderLayer.26.multiHeadAttention.v.weight": "model.layers.26.self_attn.v_proj.weight",
270
+ "decoderLayer.26.multiHeadAttention.o.weight": "model.layers.26.self_attn.o_proj.weight",
271
+ "decoderLayer.26.attnLayerNorm.weight": "model.layers.26.ln1.weight",
272
+ "decoderLayer.26.feedForward.intermediateDense.weight": "model.layers.26.mlp.gate_proj.weight",
273
+ "decoderLayer.26.feedForward.outputDense.weight": "model.layers.26.mlp.down_proj.weight",
274
+ "decoderLayer.26.ffnLayerNorm.weight": "model.layers.26.ln2.weight",
275
+ "decoderLayer.26.feedForward.intermediateDense2.weight": "model.layers.26.mlp.up_proj.weight",
276
+ "decoderLayer.27.multiHeadAttention.q.weight": "model.layers.27.self_attn.q_proj.weight",
277
+ "decoderLayer.27.multiHeadAttention.k.weight": "model.layers.27.self_attn.k_proj.weight",
278
+ "decoderLayer.27.multiHeadAttention.v.weight": "model.layers.27.self_attn.v_proj.weight",
279
+ "decoderLayer.27.multiHeadAttention.o.weight": "model.layers.27.self_attn.o_proj.weight",
280
+ "decoderLayer.27.attnLayerNorm.weight": "model.layers.27.ln1.weight",
281
+ "decoderLayer.27.feedForward.intermediateDense.weight": "model.layers.27.mlp.gate_proj.weight",
282
+ "decoderLayer.27.feedForward.outputDense.weight": "model.layers.27.mlp.down_proj.weight",
283
+ "decoderLayer.27.ffnLayerNorm.weight": "model.layers.27.ln2.weight",
284
+ "decoderLayer.27.feedForward.intermediateDense2.weight": "model.layers.27.mlp.up_proj.weight",
285
+ "decoderLayer.28.multiHeadAttention.q.weight": "model.layers.28.self_attn.q_proj.weight",
286
+ "decoderLayer.28.multiHeadAttention.k.weight": "model.layers.28.self_attn.k_proj.weight",
287
+ "decoderLayer.28.multiHeadAttention.v.weight": "model.layers.28.self_attn.v_proj.weight",
288
+ "decoderLayer.28.multiHeadAttention.o.weight": "model.layers.28.self_attn.o_proj.weight",
289
+ "decoderLayer.28.attnLayerNorm.weight": "model.layers.28.ln1.weight",
290
+ "decoderLayer.28.feedForward.intermediateDense.weight": "model.layers.28.mlp.gate_proj.weight",
291
+ "decoderLayer.28.feedForward.outputDense.weight": "model.layers.28.mlp.down_proj.weight",
292
+ "decoderLayer.28.ffnLayerNorm.weight": "model.layers.28.ln2.weight",
293
+ "decoderLayer.28.feedForward.intermediateDense2.weight": "model.layers.28.mlp.up_proj.weight",
294
+ "decoderLayer.29.multiHeadAttention.q.weight": "model.layers.29.self_attn.q_proj.weight",
295
+ "decoderLayer.29.multiHeadAttention.k.weight": "model.layers.29.self_attn.k_proj.weight",
296
+ "decoderLayer.29.multiHeadAttention.v.weight": "model.layers.29.self_attn.v_proj.weight",
297
+ "decoderLayer.29.multiHeadAttention.o.weight": "model.layers.29.self_attn.o_proj.weight",
298
+ "decoderLayer.29.attnLayerNorm.weight": "model.layers.29.ln1.weight",
299
+ "decoderLayer.29.feedForward.intermediateDense.weight": "model.layers.29.mlp.gate_proj.weight",
300
+ "decoderLayer.29.feedForward.outputDense.weight": "model.layers.29.mlp.down_proj.weight",
301
+ "decoderLayer.29.ffnLayerNorm.weight": "model.layers.29.ln2.weight",
302
+ "decoderLayer.29.feedForward.intermediateDense2.weight": "model.layers.29.mlp.up_proj.weight",
303
+ "decoderLayer.30.multiHeadAttention.q.weight": "model.layers.30.self_attn.q_proj.weight",
304
+ "decoderLayer.30.multiHeadAttention.k.weight": "model.layers.30.self_attn.k_proj.weight",
305
+ "decoderLayer.30.multiHeadAttention.v.weight": "model.layers.30.self_attn.v_proj.weight",
306
+ "decoderLayer.30.multiHeadAttention.o.weight": "model.layers.30.self_attn.o_proj.weight",
307
+ "decoderLayer.30.attnLayerNorm.weight": "model.layers.30.ln1.weight",
308
+ "decoderLayer.30.feedForward.intermediateDense.weight": "model.layers.30.mlp.gate_proj.weight",
309
+ "decoderLayer.30.feedForward.outputDense.weight": "model.layers.30.mlp.down_proj.weight",
310
+ "decoderLayer.30.ffnLayerNorm.weight": "model.layers.30.ln2.weight",
311
+ "decoderLayer.30.feedForward.intermediateDense2.weight": "model.layers.30.mlp.up_proj.weight",
312
+ "decoderLayer.31.multiHeadAttention.q.weight": "model.layers.31.self_attn.q_proj.weight",
313
+ "decoderLayer.31.multiHeadAttention.k.weight": "model.layers.31.self_attn.k_proj.weight",
314
+ "decoderLayer.31.multiHeadAttention.v.weight": "model.layers.31.self_attn.v_proj.weight",
315
+ "decoderLayer.31.multiHeadAttention.o.weight": "model.layers.31.self_attn.o_proj.weight",
316
+ "decoderLayer.31.attnLayerNorm.weight": "model.layers.31.ln1.weight",
317
+ "decoderLayer.31.feedForward.intermediateDense.weight": "model.layers.31.mlp.gate_proj.weight",
318
+ "decoderLayer.31.feedForward.outputDense.weight": "model.layers.31.mlp.down_proj.weight",
319
+ "decoderLayer.31.ffnLayerNorm.weight": "model.layers.31.ln2.weight",
320
+ "decoderLayer.31.feedForward.intermediateDense2.weight": "model.layers.31.mlp.up_proj.weight",
321
+ "decoderLayer.32.multiHeadAttention.q.weight": "model.layers.32.self_attn.q_proj.weight",
322
+ "decoderLayer.32.multiHeadAttention.k.weight": "model.layers.32.self_attn.k_proj.weight",
323
+ "decoderLayer.32.multiHeadAttention.v.weight": "model.layers.32.self_attn.v_proj.weight",
324
+ "decoderLayer.32.multiHeadAttention.o.weight": "model.layers.32.self_attn.o_proj.weight",
325
+ "decoderLayer.32.attnLayerNorm.weight": "model.layers.32.ln1.weight",
326
+ "decoderLayer.32.feedForward.intermediateDense.weight": "model.layers.32.mlp.gate_proj.weight",
327
+ "decoderLayer.32.feedForward.outputDense.weight": "model.layers.32.mlp.down_proj.weight",
328
+ "decoderLayer.32.ffnLayerNorm.weight": "model.layers.32.ln2.weight",
329
+ "decoderLayer.32.feedForward.intermediateDense2.weight": "model.layers.32.mlp.up_proj.weight",
330
+ "decoderLayer.33.multiHeadAttention.q.weight": "model.layers.33.self_attn.q_proj.weight",
331
+ "decoderLayer.33.multiHeadAttention.k.weight": "model.layers.33.self_attn.k_proj.weight",
332
+ "decoderLayer.33.multiHeadAttention.v.weight": "model.layers.33.self_attn.v_proj.weight",
333
+ "decoderLayer.33.multiHeadAttention.o.weight": "model.layers.33.self_attn.o_proj.weight",
334
+ "decoderLayer.33.attnLayerNorm.weight": "model.layers.33.ln1.weight",
335
+ "decoderLayer.33.feedForward.intermediateDense.weight": "model.layers.33.mlp.gate_proj.weight",
336
+ "decoderLayer.33.feedForward.outputDense.weight": "model.layers.33.mlp.down_proj.weight",
337
+ "decoderLayer.33.ffnLayerNorm.weight": "model.layers.33.ln2.weight",
338
+ "decoderLayer.33.feedForward.intermediateDense2.weight": "model.layers.33.mlp.up_proj.weight",
339
+ "decoderLayer.34.multiHeadAttention.q.weight": "model.layers.34.self_attn.q_proj.weight",
340
+ "decoderLayer.34.multiHeadAttention.k.weight": "model.layers.34.self_attn.k_proj.weight",
341
+ "decoderLayer.34.multiHeadAttention.v.weight": "model.layers.34.self_attn.v_proj.weight",
342
+ "decoderLayer.34.multiHeadAttention.o.weight": "model.layers.34.self_attn.o_proj.weight",
343
+ "decoderLayer.34.attnLayerNorm.weight": "model.layers.34.ln1.weight",
344
+ "decoderLayer.34.feedForward.intermediateDense.weight": "model.layers.34.mlp.gate_proj.weight",
345
+ "decoderLayer.34.feedForward.outputDense.weight": "model.layers.34.mlp.down_proj.weight",
346
+ "decoderLayer.34.ffnLayerNorm.weight": "model.layers.34.ln2.weight",
347
+ "decoderLayer.34.feedForward.intermediateDense2.weight": "model.layers.34.mlp.up_proj.weight",
348
+ "decoderLayer.35.multiHeadAttention.q.weight": "model.layers.35.self_attn.q_proj.weight",
349
+ "decoderLayer.35.multiHeadAttention.k.weight": "model.layers.35.self_attn.k_proj.weight",
350
+ "decoderLayer.35.multiHeadAttention.v.weight": "model.layers.35.self_attn.v_proj.weight",
351
+ "decoderLayer.35.multiHeadAttention.o.weight": "model.layers.35.self_attn.o_proj.weight",
352
+ "decoderLayer.35.attnLayerNorm.weight": "model.layers.35.ln1.weight",
353
+ "decoderLayer.35.feedForward.intermediateDense.weight": "model.layers.35.mlp.gate_proj.weight",
354
+ "decoderLayer.35.feedForward.outputDense.weight": "model.layers.35.mlp.down_proj.weight",
355
+ "decoderLayer.35.ffnLayerNorm.weight": "model.layers.35.ln2.weight",
356
+ "decoderLayer.35.feedForward.intermediateDense2.weight": "model.layers.35.mlp.up_proj.weight",
357
+ "decoderLayer.36.multiHeadAttention.q.weight": "model.layers.36.self_attn.q_proj.weight",
358
+ "decoderLayer.36.multiHeadAttention.k.weight": "model.layers.36.self_attn.k_proj.weight",
359
+ "decoderLayer.36.multiHeadAttention.v.weight": "model.layers.36.self_attn.v_proj.weight",
360
+ "decoderLayer.36.multiHeadAttention.o.weight": "model.layers.36.self_attn.o_proj.weight",
361
+ "decoderLayer.36.attnLayerNorm.weight": "model.layers.36.ln1.weight",
362
+ "decoderLayer.36.feedForward.intermediateDense.weight": "model.layers.36.mlp.gate_proj.weight",
363
+ "decoderLayer.36.feedForward.outputDense.weight": "model.layers.36.mlp.down_proj.weight",
364
+ "decoderLayer.36.ffnLayerNorm.weight": "model.layers.36.ln2.weight",
365
+ "decoderLayer.36.feedForward.intermediateDense2.weight": "model.layers.36.mlp.up_proj.weight",
366
+ "decoderLayer.37.multiHeadAttention.q.weight": "model.layers.37.self_attn.q_proj.weight",
367
+ "decoderLayer.37.multiHeadAttention.k.weight": "model.layers.37.self_attn.k_proj.weight",
368
+ "decoderLayer.37.multiHeadAttention.v.weight": "model.layers.37.self_attn.v_proj.weight",
369
+ "decoderLayer.37.multiHeadAttention.o.weight": "model.layers.37.self_attn.o_proj.weight",
370
+ "decoderLayer.37.attnLayerNorm.weight": "model.layers.37.ln1.weight",
371
+ "decoderLayer.37.feedForward.intermediateDense.weight": "model.layers.37.mlp.gate_proj.weight",
372
+ "decoderLayer.37.feedForward.outputDense.weight": "model.layers.37.mlp.down_proj.weight",
373
+ "decoderLayer.37.ffnLayerNorm.weight": "model.layers.37.ln2.weight",
374
+ "decoderLayer.37.feedForward.intermediateDense2.weight": "model.layers.37.mlp.up_proj.weight",
375
+ "decoderLayer.38.multiHeadAttention.q.weight": "model.layers.38.self_attn.q_proj.weight",
376
+ "decoderLayer.38.multiHeadAttention.k.weight": "model.layers.38.self_attn.k_proj.weight",
377
+ "decoderLayer.38.multiHeadAttention.v.weight": "model.layers.38.self_attn.v_proj.weight",
378
+ "decoderLayer.38.multiHeadAttention.o.weight": "model.layers.38.self_attn.o_proj.weight",
379
+ "decoderLayer.38.attnLayerNorm.weight": "model.layers.38.ln1.weight",
380
+ "decoderLayer.38.feedForward.intermediateDense.weight": "model.layers.38.mlp.gate_proj.weight",
381
+ "decoderLayer.38.feedForward.outputDense.weight": "model.layers.38.mlp.down_proj.weight",
382
+ "decoderLayer.38.ffnLayerNorm.weight": "model.layers.38.ln2.weight",
383
+ "decoderLayer.38.feedForward.intermediateDense2.weight": "model.layers.38.mlp.up_proj.weight",
384
+ "decoderLayer.39.multiHeadAttention.q.weight": "model.layers.39.self_attn.q_proj.weight",
385
+ "decoderLayer.39.multiHeadAttention.k.weight": "model.layers.39.self_attn.k_proj.weight",
386
+ "decoderLayer.39.multiHeadAttention.v.weight": "model.layers.39.self_attn.v_proj.weight",
387
+ "decoderLayer.39.multiHeadAttention.o.weight": "model.layers.39.self_attn.o_proj.weight",
388
+ "decoderLayer.39.attnLayerNorm.weight": "model.layers.39.ln1.weight",
389
+ "decoderLayer.39.feedForward.intermediateDense.weight": "model.layers.39.mlp.gate_proj.weight",
390
+ "decoderLayer.39.feedForward.outputDense.weight": "model.layers.39.mlp.down_proj.weight",
391
+ "decoderLayer.39.ffnLayerNorm.weight": "model.layers.39.ln2.weight",
392
+ "decoderLayer.39.feedForward.intermediateDense2.weight": "model.layers.39.mlp.up_proj.weight",
393
+ "decoderLayer.40.multiHeadAttention.q.weight": "model.layers.40.self_attn.q_proj.weight",
394
+ "decoderLayer.40.multiHeadAttention.k.weight": "model.layers.40.self_attn.k_proj.weight",
395
+ "decoderLayer.40.multiHeadAttention.v.weight": "model.layers.40.self_attn.v_proj.weight",
396
+ "decoderLayer.40.multiHeadAttention.o.weight": "model.layers.40.self_attn.o_proj.weight",
397
+ "decoderLayer.40.attnLayerNorm.weight": "model.layers.40.ln1.weight",
398
+ "decoderLayer.40.feedForward.intermediateDense.weight": "model.layers.40.mlp.gate_proj.weight",
399
+ "decoderLayer.40.feedForward.outputDense.weight": "model.layers.40.mlp.down_proj.weight",
400
+ "decoderLayer.40.ffnLayerNorm.weight": "model.layers.40.ln2.weight",
401
+ "decoderLayer.40.feedForward.intermediateDense2.weight": "model.layers.40.mlp.up_proj.weight",
402
+ "decoderLayer.41.multiHeadAttention.q.weight": "model.layers.41.self_attn.q_proj.weight",
403
+ "decoderLayer.41.multiHeadAttention.k.weight": "model.layers.41.self_attn.k_proj.weight",
404
+ "decoderLayer.41.multiHeadAttention.v.weight": "model.layers.41.self_attn.v_proj.weight",
405
+ "decoderLayer.41.multiHeadAttention.o.weight": "model.layers.41.self_attn.o_proj.weight",
406
+ "decoderLayer.41.attnLayerNorm.weight": "model.layers.41.ln1.weight",
407
+ "decoderLayer.41.feedForward.intermediateDense.weight": "model.layers.41.mlp.gate_proj.weight",
408
+ "decoderLayer.41.feedForward.outputDense.weight": "model.layers.41.mlp.down_proj.weight",
409
+ "decoderLayer.41.ffnLayerNorm.weight": "model.layers.41.ln2.weight",
410
+ "decoderLayer.41.feedForward.intermediateDense2.weight": "model.layers.41.mlp.up_proj.weight",
411
+ "decoderLayer.42.multiHeadAttention.q.weight": "model.layers.42.self_attn.q_proj.weight",
412
+ "decoderLayer.42.multiHeadAttention.k.weight": "model.layers.42.self_attn.k_proj.weight",
413
+ "decoderLayer.42.multiHeadAttention.v.weight": "model.layers.42.self_attn.v_proj.weight",
414
+ "decoderLayer.42.multiHeadAttention.o.weight": "model.layers.42.self_attn.o_proj.weight",
415
+ "decoderLayer.42.attnLayerNorm.weight": "model.layers.42.ln1.weight",
416
+ "decoderLayer.42.feedForward.intermediateDense.weight": "model.layers.42.mlp.gate_proj.weight",
417
+ "decoderLayer.42.feedForward.outputDense.weight": "model.layers.42.mlp.down_proj.weight",
418
+ "decoderLayer.42.ffnLayerNorm.weight": "model.layers.42.ln2.weight",
419
+ "decoderLayer.42.feedForward.intermediateDense2.weight": "model.layers.42.mlp.up_proj.weight",
420
+ "decoderLayer.43.multiHeadAttention.q.weight": "model.layers.43.self_attn.q_proj.weight",
421
+ "decoderLayer.43.multiHeadAttention.k.weight": "model.layers.43.self_attn.k_proj.weight",
422
+ "decoderLayer.43.multiHeadAttention.v.weight": "model.layers.43.self_attn.v_proj.weight",
423
+ "decoderLayer.43.multiHeadAttention.o.weight": "model.layers.43.self_attn.o_proj.weight",
424
+ "decoderLayer.43.attnLayerNorm.weight": "model.layers.43.ln1.weight",
425
+ "decoderLayer.43.feedForward.intermediateDense.weight": "model.layers.43.mlp.gate_proj.weight",
426
+ "decoderLayer.43.feedForward.outputDense.weight": "model.layers.43.mlp.down_proj.weight",
427
+ "decoderLayer.43.ffnLayerNorm.weight": "model.layers.43.ln2.weight",
428
+ "decoderLayer.43.feedForward.intermediateDense2.weight": "model.layers.43.mlp.up_proj.weight",
429
+ "decoderLayer.44.multiHeadAttention.q.weight": "model.layers.44.self_attn.q_proj.weight",
430
+ "decoderLayer.44.multiHeadAttention.k.weight": "model.layers.44.self_attn.k_proj.weight",
431
+ "decoderLayer.44.multiHeadAttention.v.weight": "model.layers.44.self_attn.v_proj.weight",
432
+ "decoderLayer.44.multiHeadAttention.o.weight": "model.layers.44.self_attn.o_proj.weight",
433
+ "decoderLayer.44.attnLayerNorm.weight": "model.layers.44.ln1.weight",
434
+ "decoderLayer.44.feedForward.intermediateDense.weight": "model.layers.44.mlp.gate_proj.weight",
435
+ "decoderLayer.44.feedForward.outputDense.weight": "model.layers.44.mlp.down_proj.weight",
436
+ "decoderLayer.44.ffnLayerNorm.weight": "model.layers.44.ln2.weight",
437
+ "decoderLayer.44.feedForward.intermediateDense2.weight": "model.layers.44.mlp.up_proj.weight",
438
+ "decoderLayer.45.multiHeadAttention.q.weight": "model.layers.45.self_attn.q_proj.weight",
439
+ "decoderLayer.45.multiHeadAttention.k.weight": "model.layers.45.self_attn.k_proj.weight",
440
+ "decoderLayer.45.multiHeadAttention.v.weight": "model.layers.45.self_attn.v_proj.weight",
441
+ "decoderLayer.45.multiHeadAttention.o.weight": "model.layers.45.self_attn.o_proj.weight",
442
+ "decoderLayer.45.attnLayerNorm.weight": "model.layers.45.ln1.weight",
443
+ "decoderLayer.45.feedForward.intermediateDense.weight": "model.layers.45.mlp.gate_proj.weight",
444
+ "decoderLayer.45.feedForward.outputDense.weight": "model.layers.45.mlp.down_proj.weight",
445
+ "decoderLayer.45.ffnLayerNorm.weight": "model.layers.45.ln2.weight",
446
+ "decoderLayer.45.feedForward.intermediateDense2.weight": "model.layers.45.mlp.up_proj.weight",
447
+ "decoderLayer.46.multiHeadAttention.q.weight": "model.layers.46.self_attn.q_proj.weight",
448
+ "decoderLayer.46.multiHeadAttention.k.weight": "model.layers.46.self_attn.k_proj.weight",
449
+ "decoderLayer.46.multiHeadAttention.v.weight": "model.layers.46.self_attn.v_proj.weight",
450
+ "decoderLayer.46.multiHeadAttention.o.weight": "model.layers.46.self_attn.o_proj.weight",
451
+ "decoderLayer.46.attnLayerNorm.weight": "model.layers.46.ln1.weight",
452
+ "decoderLayer.46.feedForward.intermediateDense.weight": "model.layers.46.mlp.gate_proj.weight",
453
+ "decoderLayer.46.feedForward.outputDense.weight": "model.layers.46.mlp.down_proj.weight",
454
+ "decoderLayer.46.ffnLayerNorm.weight": "model.layers.46.ln2.weight",
455
+ "decoderLayer.46.feedForward.intermediateDense2.weight": "model.layers.46.mlp.up_proj.weight",
456
+ "decoderLayer.47.multiHeadAttention.q.weight": "model.layers.47.self_attn.q_proj.weight",
457
+ "decoderLayer.47.multiHeadAttention.k.weight": "model.layers.47.self_attn.k_proj.weight",
458
+ "decoderLayer.47.multiHeadAttention.v.weight": "model.layers.47.self_attn.v_proj.weight",
459
+ "decoderLayer.47.multiHeadAttention.o.weight": "model.layers.47.self_attn.o_proj.weight",
460
+ "decoderLayer.47.attnLayerNorm.weight": "model.layers.47.ln1.weight",
461
+ "decoderLayer.47.feedForward.intermediateDense.weight": "model.layers.47.mlp.gate_proj.weight",
462
+ "decoderLayer.47.feedForward.outputDense.weight": "model.layers.47.mlp.down_proj.weight",
463
+ "decoderLayer.47.ffnLayerNorm.weight": "model.layers.47.ln2.weight",
464
+ "decoderLayer.47.feedForward.intermediateDense2.weight": "model.layers.47.mlp.up_proj.weight"
465
+ }
466
+ }
01-ai/Yi-9B/bert4torch_config.json CHANGED
@@ -1,460 +1,466 @@
1
- {
2
- "bos_token_id": 1,
3
- "eos_token_id": 2,
4
- "hidden_act": "silu",
5
- "hidden_size": 4096,
6
- "initializer_range": 0.02,
7
- "intermediate_size": 11008,
8
- "max_position_embeddings": 4096,
9
- "model": "llama",
10
- "num_attention_heads": 32,
11
- "num_hidden_layers": 48,
12
- "num_key_value_heads": 4,
13
- "pad_token_id": 0,
14
- "layer_norm_eps": 1e-06,
15
- "rope_theta": 10000.0,
16
- "tie_word_embeddings": false,
17
- "torch_dtype": "bfloat16",
18
- "vocab_size": 64000,
19
- "skip_init": true,
20
- "rope_rank": "updown",
21
- "segment_vocab_size": 0,
22
- "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 4096, "eos_token_id": 2},
23
- "mapping": {
24
- "embeddings.word_embeddings.weight": "model.embed_tokens.weight",
25
- "LayerNormFinal.weight": "model.norm.weight",
26
- "lm_head.weight": "lm_head.weight",
27
- "decoderLayer.0.multiHeadAttention.q.weight": "model.layers.0.self_attn.q_proj.weight",
28
- "decoderLayer.0.multiHeadAttention.k.weight": "model.layers.0.self_attn.k_proj.weight",
29
- "decoderLayer.0.multiHeadAttention.v.weight": "model.layers.0.self_attn.v_proj.weight",
30
- "decoderLayer.0.multiHeadAttention.o.weight": "model.layers.0.self_attn.o_proj.weight",
31
- "decoderLayer.0.attnLayerNorm.weight": "model.layers.0.ln1.weight",
32
- "decoderLayer.0.feedForward.intermediateDense.weight": "model.layers.0.mlp.gate_proj.weight",
33
- "decoderLayer.0.feedForward.outputDense.weight": "model.layers.0.mlp.down_proj.weight",
34
- "decoderLayer.0.ffnLayerNorm.weight": "model.layers.0.ln2.weight",
35
- "decoderLayer.0.feedForward.intermediateDense2.weight": "model.layers.0.mlp.up_proj.weight",
36
- "decoderLayer.1.multiHeadAttention.q.weight": "model.layers.1.self_attn.q_proj.weight",
37
- "decoderLayer.1.multiHeadAttention.k.weight": "model.layers.1.self_attn.k_proj.weight",
38
- "decoderLayer.1.multiHeadAttention.v.weight": "model.layers.1.self_attn.v_proj.weight",
39
- "decoderLayer.1.multiHeadAttention.o.weight": "model.layers.1.self_attn.o_proj.weight",
40
- "decoderLayer.1.attnLayerNorm.weight": "model.layers.1.ln1.weight",
41
- "decoderLayer.1.feedForward.intermediateDense.weight": "model.layers.1.mlp.gate_proj.weight",
42
- "decoderLayer.1.feedForward.outputDense.weight": "model.layers.1.mlp.down_proj.weight",
43
- "decoderLayer.1.ffnLayerNorm.weight": "model.layers.1.ln2.weight",
44
- "decoderLayer.1.feedForward.intermediateDense2.weight": "model.layers.1.mlp.up_proj.weight",
45
- "decoderLayer.2.multiHeadAttention.q.weight": "model.layers.2.self_attn.q_proj.weight",
46
- "decoderLayer.2.multiHeadAttention.k.weight": "model.layers.2.self_attn.k_proj.weight",
47
- "decoderLayer.2.multiHeadAttention.v.weight": "model.layers.2.self_attn.v_proj.weight",
48
- "decoderLayer.2.multiHeadAttention.o.weight": "model.layers.2.self_attn.o_proj.weight",
49
- "decoderLayer.2.attnLayerNorm.weight": "model.layers.2.ln1.weight",
50
- "decoderLayer.2.feedForward.intermediateDense.weight": "model.layers.2.mlp.gate_proj.weight",
51
- "decoderLayer.2.feedForward.outputDense.weight": "model.layers.2.mlp.down_proj.weight",
52
- "decoderLayer.2.ffnLayerNorm.weight": "model.layers.2.ln2.weight",
53
- "decoderLayer.2.feedForward.intermediateDense2.weight": "model.layers.2.mlp.up_proj.weight",
54
- "decoderLayer.3.multiHeadAttention.q.weight": "model.layers.3.self_attn.q_proj.weight",
55
- "decoderLayer.3.multiHeadAttention.k.weight": "model.layers.3.self_attn.k_proj.weight",
56
- "decoderLayer.3.multiHeadAttention.v.weight": "model.layers.3.self_attn.v_proj.weight",
57
- "decoderLayer.3.multiHeadAttention.o.weight": "model.layers.3.self_attn.o_proj.weight",
58
- "decoderLayer.3.attnLayerNorm.weight": "model.layers.3.ln1.weight",
59
- "decoderLayer.3.feedForward.intermediateDense.weight": "model.layers.3.mlp.gate_proj.weight",
60
- "decoderLayer.3.feedForward.outputDense.weight": "model.layers.3.mlp.down_proj.weight",
61
- "decoderLayer.3.ffnLayerNorm.weight": "model.layers.3.ln2.weight",
62
- "decoderLayer.3.feedForward.intermediateDense2.weight": "model.layers.3.mlp.up_proj.weight",
63
- "decoderLayer.4.multiHeadAttention.q.weight": "model.layers.4.self_attn.q_proj.weight",
64
- "decoderLayer.4.multiHeadAttention.k.weight": "model.layers.4.self_attn.k_proj.weight",
65
- "decoderLayer.4.multiHeadAttention.v.weight": "model.layers.4.self_attn.v_proj.weight",
66
- "decoderLayer.4.multiHeadAttention.o.weight": "model.layers.4.self_attn.o_proj.weight",
67
- "decoderLayer.4.attnLayerNorm.weight": "model.layers.4.ln1.weight",
68
- "decoderLayer.4.feedForward.intermediateDense.weight": "model.layers.4.mlp.gate_proj.weight",
69
- "decoderLayer.4.feedForward.outputDense.weight": "model.layers.4.mlp.down_proj.weight",
70
- "decoderLayer.4.ffnLayerNorm.weight": "model.layers.4.ln2.weight",
71
- "decoderLayer.4.feedForward.intermediateDense2.weight": "model.layers.4.mlp.up_proj.weight",
72
- "decoderLayer.5.multiHeadAttention.q.weight": "model.layers.5.self_attn.q_proj.weight",
73
- "decoderLayer.5.multiHeadAttention.k.weight": "model.layers.5.self_attn.k_proj.weight",
74
- "decoderLayer.5.multiHeadAttention.v.weight": "model.layers.5.self_attn.v_proj.weight",
75
- "decoderLayer.5.multiHeadAttention.o.weight": "model.layers.5.self_attn.o_proj.weight",
76
- "decoderLayer.5.attnLayerNorm.weight": "model.layers.5.ln1.weight",
77
- "decoderLayer.5.feedForward.intermediateDense.weight": "model.layers.5.mlp.gate_proj.weight",
78
- "decoderLayer.5.feedForward.outputDense.weight": "model.layers.5.mlp.down_proj.weight",
79
- "decoderLayer.5.ffnLayerNorm.weight": "model.layers.5.ln2.weight",
80
- "decoderLayer.5.feedForward.intermediateDense2.weight": "model.layers.5.mlp.up_proj.weight",
81
- "decoderLayer.6.multiHeadAttention.q.weight": "model.layers.6.self_attn.q_proj.weight",
82
- "decoderLayer.6.multiHeadAttention.k.weight": "model.layers.6.self_attn.k_proj.weight",
83
- "decoderLayer.6.multiHeadAttention.v.weight": "model.layers.6.self_attn.v_proj.weight",
84
- "decoderLayer.6.multiHeadAttention.o.weight": "model.layers.6.self_attn.o_proj.weight",
85
- "decoderLayer.6.attnLayerNorm.weight": "model.layers.6.ln1.weight",
86
- "decoderLayer.6.feedForward.intermediateDense.weight": "model.layers.6.mlp.gate_proj.weight",
87
- "decoderLayer.6.feedForward.outputDense.weight": "model.layers.6.mlp.down_proj.weight",
88
- "decoderLayer.6.ffnLayerNorm.weight": "model.layers.6.ln2.weight",
89
- "decoderLayer.6.feedForward.intermediateDense2.weight": "model.layers.6.mlp.up_proj.weight",
90
- "decoderLayer.7.multiHeadAttention.q.weight": "model.layers.7.self_attn.q_proj.weight",
91
- "decoderLayer.7.multiHeadAttention.k.weight": "model.layers.7.self_attn.k_proj.weight",
92
- "decoderLayer.7.multiHeadAttention.v.weight": "model.layers.7.self_attn.v_proj.weight",
93
- "decoderLayer.7.multiHeadAttention.o.weight": "model.layers.7.self_attn.o_proj.weight",
94
- "decoderLayer.7.attnLayerNorm.weight": "model.layers.7.ln1.weight",
95
- "decoderLayer.7.feedForward.intermediateDense.weight": "model.layers.7.mlp.gate_proj.weight",
96
- "decoderLayer.7.feedForward.outputDense.weight": "model.layers.7.mlp.down_proj.weight",
97
- "decoderLayer.7.ffnLayerNorm.weight": "model.layers.7.ln2.weight",
98
- "decoderLayer.7.feedForward.intermediateDense2.weight": "model.layers.7.mlp.up_proj.weight",
99
- "decoderLayer.8.multiHeadAttention.q.weight": "model.layers.8.self_attn.q_proj.weight",
100
- "decoderLayer.8.multiHeadAttention.k.weight": "model.layers.8.self_attn.k_proj.weight",
101
- "decoderLayer.8.multiHeadAttention.v.weight": "model.layers.8.self_attn.v_proj.weight",
102
- "decoderLayer.8.multiHeadAttention.o.weight": "model.layers.8.self_attn.o_proj.weight",
103
- "decoderLayer.8.attnLayerNorm.weight": "model.layers.8.ln1.weight",
104
- "decoderLayer.8.feedForward.intermediateDense.weight": "model.layers.8.mlp.gate_proj.weight",
105
- "decoderLayer.8.feedForward.outputDense.weight": "model.layers.8.mlp.down_proj.weight",
106
- "decoderLayer.8.ffnLayerNorm.weight": "model.layers.8.ln2.weight",
107
- "decoderLayer.8.feedForward.intermediateDense2.weight": "model.layers.8.mlp.up_proj.weight",
108
- "decoderLayer.9.multiHeadAttention.q.weight": "model.layers.9.self_attn.q_proj.weight",
109
- "decoderLayer.9.multiHeadAttention.k.weight": "model.layers.9.self_attn.k_proj.weight",
110
- "decoderLayer.9.multiHeadAttention.v.weight": "model.layers.9.self_attn.v_proj.weight",
111
- "decoderLayer.9.multiHeadAttention.o.weight": "model.layers.9.self_attn.o_proj.weight",
112
- "decoderLayer.9.attnLayerNorm.weight": "model.layers.9.ln1.weight",
113
- "decoderLayer.9.feedForward.intermediateDense.weight": "model.layers.9.mlp.gate_proj.weight",
114
- "decoderLayer.9.feedForward.outputDense.weight": "model.layers.9.mlp.down_proj.weight",
115
- "decoderLayer.9.ffnLayerNorm.weight": "model.layers.9.ln2.weight",
116
- "decoderLayer.9.feedForward.intermediateDense2.weight": "model.layers.9.mlp.up_proj.weight",
117
- "decoderLayer.10.multiHeadAttention.q.weight": "model.layers.10.self_attn.q_proj.weight",
118
- "decoderLayer.10.multiHeadAttention.k.weight": "model.layers.10.self_attn.k_proj.weight",
119
- "decoderLayer.10.multiHeadAttention.v.weight": "model.layers.10.self_attn.v_proj.weight",
120
- "decoderLayer.10.multiHeadAttention.o.weight": "model.layers.10.self_attn.o_proj.weight",
121
- "decoderLayer.10.attnLayerNorm.weight": "model.layers.10.ln1.weight",
122
- "decoderLayer.10.feedForward.intermediateDense.weight": "model.layers.10.mlp.gate_proj.weight",
123
- "decoderLayer.10.feedForward.outputDense.weight": "model.layers.10.mlp.down_proj.weight",
124
- "decoderLayer.10.ffnLayerNorm.weight": "model.layers.10.ln2.weight",
125
- "decoderLayer.10.feedForward.intermediateDense2.weight": "model.layers.10.mlp.up_proj.weight",
126
- "decoderLayer.11.multiHeadAttention.q.weight": "model.layers.11.self_attn.q_proj.weight",
127
- "decoderLayer.11.multiHeadAttention.k.weight": "model.layers.11.self_attn.k_proj.weight",
128
- "decoderLayer.11.multiHeadAttention.v.weight": "model.layers.11.self_attn.v_proj.weight",
129
- "decoderLayer.11.multiHeadAttention.o.weight": "model.layers.11.self_attn.o_proj.weight",
130
- "decoderLayer.11.attnLayerNorm.weight": "model.layers.11.ln1.weight",
131
- "decoderLayer.11.feedForward.intermediateDense.weight": "model.layers.11.mlp.gate_proj.weight",
132
- "decoderLayer.11.feedForward.outputDense.weight": "model.layers.11.mlp.down_proj.weight",
133
- "decoderLayer.11.ffnLayerNorm.weight": "model.layers.11.ln2.weight",
134
- "decoderLayer.11.feedForward.intermediateDense2.weight": "model.layers.11.mlp.up_proj.weight",
135
- "decoderLayer.12.multiHeadAttention.q.weight": "model.layers.12.self_attn.q_proj.weight",
136
- "decoderLayer.12.multiHeadAttention.k.weight": "model.layers.12.self_attn.k_proj.weight",
137
- "decoderLayer.12.multiHeadAttention.v.weight": "model.layers.12.self_attn.v_proj.weight",
138
- "decoderLayer.12.multiHeadAttention.o.weight": "model.layers.12.self_attn.o_proj.weight",
139
- "decoderLayer.12.attnLayerNorm.weight": "model.layers.12.ln1.weight",
140
- "decoderLayer.12.feedForward.intermediateDense.weight": "model.layers.12.mlp.gate_proj.weight",
141
- "decoderLayer.12.feedForward.outputDense.weight": "model.layers.12.mlp.down_proj.weight",
142
- "decoderLayer.12.ffnLayerNorm.weight": "model.layers.12.ln2.weight",
143
- "decoderLayer.12.feedForward.intermediateDense2.weight": "model.layers.12.mlp.up_proj.weight",
144
- "decoderLayer.13.multiHeadAttention.q.weight": "model.layers.13.self_attn.q_proj.weight",
145
- "decoderLayer.13.multiHeadAttention.k.weight": "model.layers.13.self_attn.k_proj.weight",
146
- "decoderLayer.13.multiHeadAttention.v.weight": "model.layers.13.self_attn.v_proj.weight",
147
- "decoderLayer.13.multiHeadAttention.o.weight": "model.layers.13.self_attn.o_proj.weight",
148
- "decoderLayer.13.attnLayerNorm.weight": "model.layers.13.ln1.weight",
149
- "decoderLayer.13.feedForward.intermediateDense.weight": "model.layers.13.mlp.gate_proj.weight",
150
- "decoderLayer.13.feedForward.outputDense.weight": "model.layers.13.mlp.down_proj.weight",
151
- "decoderLayer.13.ffnLayerNorm.weight": "model.layers.13.ln2.weight",
152
- "decoderLayer.13.feedForward.intermediateDense2.weight": "model.layers.13.mlp.up_proj.weight",
153
- "decoderLayer.14.multiHeadAttention.q.weight": "model.layers.14.self_attn.q_proj.weight",
154
- "decoderLayer.14.multiHeadAttention.k.weight": "model.layers.14.self_attn.k_proj.weight",
155
- "decoderLayer.14.multiHeadAttention.v.weight": "model.layers.14.self_attn.v_proj.weight",
156
- "decoderLayer.14.multiHeadAttention.o.weight": "model.layers.14.self_attn.o_proj.weight",
157
- "decoderLayer.14.attnLayerNorm.weight": "model.layers.14.ln1.weight",
158
- "decoderLayer.14.feedForward.intermediateDense.weight": "model.layers.14.mlp.gate_proj.weight",
159
- "decoderLayer.14.feedForward.outputDense.weight": "model.layers.14.mlp.down_proj.weight",
160
- "decoderLayer.14.ffnLayerNorm.weight": "model.layers.14.ln2.weight",
161
- "decoderLayer.14.feedForward.intermediateDense2.weight": "model.layers.14.mlp.up_proj.weight",
162
- "decoderLayer.15.multiHeadAttention.q.weight": "model.layers.15.self_attn.q_proj.weight",
163
- "decoderLayer.15.multiHeadAttention.k.weight": "model.layers.15.self_attn.k_proj.weight",
164
- "decoderLayer.15.multiHeadAttention.v.weight": "model.layers.15.self_attn.v_proj.weight",
165
- "decoderLayer.15.multiHeadAttention.o.weight": "model.layers.15.self_attn.o_proj.weight",
166
- "decoderLayer.15.attnLayerNorm.weight": "model.layers.15.ln1.weight",
167
- "decoderLayer.15.feedForward.intermediateDense.weight": "model.layers.15.mlp.gate_proj.weight",
168
- "decoderLayer.15.feedForward.outputDense.weight": "model.layers.15.mlp.down_proj.weight",
169
- "decoderLayer.15.ffnLayerNorm.weight": "model.layers.15.ln2.weight",
170
- "decoderLayer.15.feedForward.intermediateDense2.weight": "model.layers.15.mlp.up_proj.weight",
171
- "decoderLayer.16.multiHeadAttention.q.weight": "model.layers.16.self_attn.q_proj.weight",
172
- "decoderLayer.16.multiHeadAttention.k.weight": "model.layers.16.self_attn.k_proj.weight",
173
- "decoderLayer.16.multiHeadAttention.v.weight": "model.layers.16.self_attn.v_proj.weight",
174
- "decoderLayer.16.multiHeadAttention.o.weight": "model.layers.16.self_attn.o_proj.weight",
175
- "decoderLayer.16.attnLayerNorm.weight": "model.layers.16.ln1.weight",
176
- "decoderLayer.16.feedForward.intermediateDense.weight": "model.layers.16.mlp.gate_proj.weight",
177
- "decoderLayer.16.feedForward.outputDense.weight": "model.layers.16.mlp.down_proj.weight",
178
- "decoderLayer.16.ffnLayerNorm.weight": "model.layers.16.ln2.weight",
179
- "decoderLayer.16.feedForward.intermediateDense2.weight": "model.layers.16.mlp.up_proj.weight",
180
- "decoderLayer.17.multiHeadAttention.q.weight": "model.layers.17.self_attn.q_proj.weight",
181
- "decoderLayer.17.multiHeadAttention.k.weight": "model.layers.17.self_attn.k_proj.weight",
182
- "decoderLayer.17.multiHeadAttention.v.weight": "model.layers.17.self_attn.v_proj.weight",
183
- "decoderLayer.17.multiHeadAttention.o.weight": "model.layers.17.self_attn.o_proj.weight",
184
- "decoderLayer.17.attnLayerNorm.weight": "model.layers.17.ln1.weight",
185
- "decoderLayer.17.feedForward.intermediateDense.weight": "model.layers.17.mlp.gate_proj.weight",
186
- "decoderLayer.17.feedForward.outputDense.weight": "model.layers.17.mlp.down_proj.weight",
187
- "decoderLayer.17.ffnLayerNorm.weight": "model.layers.17.ln2.weight",
188
- "decoderLayer.17.feedForward.intermediateDense2.weight": "model.layers.17.mlp.up_proj.weight",
189
- "decoderLayer.18.multiHeadAttention.q.weight": "model.layers.18.self_attn.q_proj.weight",
190
- "decoderLayer.18.multiHeadAttention.k.weight": "model.layers.18.self_attn.k_proj.weight",
191
- "decoderLayer.18.multiHeadAttention.v.weight": "model.layers.18.self_attn.v_proj.weight",
192
- "decoderLayer.18.multiHeadAttention.o.weight": "model.layers.18.self_attn.o_proj.weight",
193
- "decoderLayer.18.attnLayerNorm.weight": "model.layers.18.ln1.weight",
194
- "decoderLayer.18.feedForward.intermediateDense.weight": "model.layers.18.mlp.gate_proj.weight",
195
- "decoderLayer.18.feedForward.outputDense.weight": "model.layers.18.mlp.down_proj.weight",
196
- "decoderLayer.18.ffnLayerNorm.weight": "model.layers.18.ln2.weight",
197
- "decoderLayer.18.feedForward.intermediateDense2.weight": "model.layers.18.mlp.up_proj.weight",
198
- "decoderLayer.19.multiHeadAttention.q.weight": "model.layers.19.self_attn.q_proj.weight",
199
- "decoderLayer.19.multiHeadAttention.k.weight": "model.layers.19.self_attn.k_proj.weight",
200
- "decoderLayer.19.multiHeadAttention.v.weight": "model.layers.19.self_attn.v_proj.weight",
201
- "decoderLayer.19.multiHeadAttention.o.weight": "model.layers.19.self_attn.o_proj.weight",
202
- "decoderLayer.19.attnLayerNorm.weight": "model.layers.19.ln1.weight",
203
- "decoderLayer.19.feedForward.intermediateDense.weight": "model.layers.19.mlp.gate_proj.weight",
204
- "decoderLayer.19.feedForward.outputDense.weight": "model.layers.19.mlp.down_proj.weight",
205
- "decoderLayer.19.ffnLayerNorm.weight": "model.layers.19.ln2.weight",
206
- "decoderLayer.19.feedForward.intermediateDense2.weight": "model.layers.19.mlp.up_proj.weight",
207
- "decoderLayer.20.multiHeadAttention.q.weight": "model.layers.20.self_attn.q_proj.weight",
208
- "decoderLayer.20.multiHeadAttention.k.weight": "model.layers.20.self_attn.k_proj.weight",
209
- "decoderLayer.20.multiHeadAttention.v.weight": "model.layers.20.self_attn.v_proj.weight",
210
- "decoderLayer.20.multiHeadAttention.o.weight": "model.layers.20.self_attn.o_proj.weight",
211
- "decoderLayer.20.attnLayerNorm.weight": "model.layers.20.ln1.weight",
212
- "decoderLayer.20.feedForward.intermediateDense.weight": "model.layers.20.mlp.gate_proj.weight",
213
- "decoderLayer.20.feedForward.outputDense.weight": "model.layers.20.mlp.down_proj.weight",
214
- "decoderLayer.20.ffnLayerNorm.weight": "model.layers.20.ln2.weight",
215
- "decoderLayer.20.feedForward.intermediateDense2.weight": "model.layers.20.mlp.up_proj.weight",
216
- "decoderLayer.21.multiHeadAttention.q.weight": "model.layers.21.self_attn.q_proj.weight",
217
- "decoderLayer.21.multiHeadAttention.k.weight": "model.layers.21.self_attn.k_proj.weight",
218
- "decoderLayer.21.multiHeadAttention.v.weight": "model.layers.21.self_attn.v_proj.weight",
219
- "decoderLayer.21.multiHeadAttention.o.weight": "model.layers.21.self_attn.o_proj.weight",
220
- "decoderLayer.21.attnLayerNorm.weight": "model.layers.21.ln1.weight",
221
- "decoderLayer.21.feedForward.intermediateDense.weight": "model.layers.21.mlp.gate_proj.weight",
222
- "decoderLayer.21.feedForward.outputDense.weight": "model.layers.21.mlp.down_proj.weight",
223
- "decoderLayer.21.ffnLayerNorm.weight": "model.layers.21.ln2.weight",
224
- "decoderLayer.21.feedForward.intermediateDense2.weight": "model.layers.21.mlp.up_proj.weight",
225
- "decoderLayer.22.multiHeadAttention.q.weight": "model.layers.22.self_attn.q_proj.weight",
226
- "decoderLayer.22.multiHeadAttention.k.weight": "model.layers.22.self_attn.k_proj.weight",
227
- "decoderLayer.22.multiHeadAttention.v.weight": "model.layers.22.self_attn.v_proj.weight",
228
- "decoderLayer.22.multiHeadAttention.o.weight": "model.layers.22.self_attn.o_proj.weight",
229
- "decoderLayer.22.attnLayerNorm.weight": "model.layers.22.ln1.weight",
230
- "decoderLayer.22.feedForward.intermediateDense.weight": "model.layers.22.mlp.gate_proj.weight",
231
- "decoderLayer.22.feedForward.outputDense.weight": "model.layers.22.mlp.down_proj.weight",
232
- "decoderLayer.22.ffnLayerNorm.weight": "model.layers.22.ln2.weight",
233
- "decoderLayer.22.feedForward.intermediateDense2.weight": "model.layers.22.mlp.up_proj.weight",
234
- "decoderLayer.23.multiHeadAttention.q.weight": "model.layers.23.self_attn.q_proj.weight",
235
- "decoderLayer.23.multiHeadAttention.k.weight": "model.layers.23.self_attn.k_proj.weight",
236
- "decoderLayer.23.multiHeadAttention.v.weight": "model.layers.23.self_attn.v_proj.weight",
237
- "decoderLayer.23.multiHeadAttention.o.weight": "model.layers.23.self_attn.o_proj.weight",
238
- "decoderLayer.23.attnLayerNorm.weight": "model.layers.23.ln1.weight",
239
- "decoderLayer.23.feedForward.intermediateDense.weight": "model.layers.23.mlp.gate_proj.weight",
240
- "decoderLayer.23.feedForward.outputDense.weight": "model.layers.23.mlp.down_proj.weight",
241
- "decoderLayer.23.ffnLayerNorm.weight": "model.layers.23.ln2.weight",
242
- "decoderLayer.23.feedForward.intermediateDense2.weight": "model.layers.23.mlp.up_proj.weight",
243
- "decoderLayer.24.multiHeadAttention.q.weight": "model.layers.24.self_attn.q_proj.weight",
244
- "decoderLayer.24.multiHeadAttention.k.weight": "model.layers.24.self_attn.k_proj.weight",
245
- "decoderLayer.24.multiHeadAttention.v.weight": "model.layers.24.self_attn.v_proj.weight",
246
- "decoderLayer.24.multiHeadAttention.o.weight": "model.layers.24.self_attn.o_proj.weight",
247
- "decoderLayer.24.attnLayerNorm.weight": "model.layers.24.ln1.weight",
248
- "decoderLayer.24.feedForward.intermediateDense.weight": "model.layers.24.mlp.gate_proj.weight",
249
- "decoderLayer.24.feedForward.outputDense.weight": "model.layers.24.mlp.down_proj.weight",
250
- "decoderLayer.24.ffnLayerNorm.weight": "model.layers.24.ln2.weight",
251
- "decoderLayer.24.feedForward.intermediateDense2.weight": "model.layers.24.mlp.up_proj.weight",
252
- "decoderLayer.25.multiHeadAttention.q.weight": "model.layers.25.self_attn.q_proj.weight",
253
- "decoderLayer.25.multiHeadAttention.k.weight": "model.layers.25.self_attn.k_proj.weight",
254
- "decoderLayer.25.multiHeadAttention.v.weight": "model.layers.25.self_attn.v_proj.weight",
255
- "decoderLayer.25.multiHeadAttention.o.weight": "model.layers.25.self_attn.o_proj.weight",
256
- "decoderLayer.25.attnLayerNorm.weight": "model.layers.25.ln1.weight",
257
- "decoderLayer.25.feedForward.intermediateDense.weight": "model.layers.25.mlp.gate_proj.weight",
258
- "decoderLayer.25.feedForward.outputDense.weight": "model.layers.25.mlp.down_proj.weight",
259
- "decoderLayer.25.ffnLayerNorm.weight": "model.layers.25.ln2.weight",
260
- "decoderLayer.25.feedForward.intermediateDense2.weight": "model.layers.25.mlp.up_proj.weight",
261
- "decoderLayer.26.multiHeadAttention.q.weight": "model.layers.26.self_attn.q_proj.weight",
262
- "decoderLayer.26.multiHeadAttention.k.weight": "model.layers.26.self_attn.k_proj.weight",
263
- "decoderLayer.26.multiHeadAttention.v.weight": "model.layers.26.self_attn.v_proj.weight",
264
- "decoderLayer.26.multiHeadAttention.o.weight": "model.layers.26.self_attn.o_proj.weight",
265
- "decoderLayer.26.attnLayerNorm.weight": "model.layers.26.ln1.weight",
266
- "decoderLayer.26.feedForward.intermediateDense.weight": "model.layers.26.mlp.gate_proj.weight",
267
- "decoderLayer.26.feedForward.outputDense.weight": "model.layers.26.mlp.down_proj.weight",
268
- "decoderLayer.26.ffnLayerNorm.weight": "model.layers.26.ln2.weight",
269
- "decoderLayer.26.feedForward.intermediateDense2.weight": "model.layers.26.mlp.up_proj.weight",
270
- "decoderLayer.27.multiHeadAttention.q.weight": "model.layers.27.self_attn.q_proj.weight",
271
- "decoderLayer.27.multiHeadAttention.k.weight": "model.layers.27.self_attn.k_proj.weight",
272
- "decoderLayer.27.multiHeadAttention.v.weight": "model.layers.27.self_attn.v_proj.weight",
273
- "decoderLayer.27.multiHeadAttention.o.weight": "model.layers.27.self_attn.o_proj.weight",
274
- "decoderLayer.27.attnLayerNorm.weight": "model.layers.27.ln1.weight",
275
- "decoderLayer.27.feedForward.intermediateDense.weight": "model.layers.27.mlp.gate_proj.weight",
276
- "decoderLayer.27.feedForward.outputDense.weight": "model.layers.27.mlp.down_proj.weight",
277
- "decoderLayer.27.ffnLayerNorm.weight": "model.layers.27.ln2.weight",
278
- "decoderLayer.27.feedForward.intermediateDense2.weight": "model.layers.27.mlp.up_proj.weight",
279
- "decoderLayer.28.multiHeadAttention.q.weight": "model.layers.28.self_attn.q_proj.weight",
280
- "decoderLayer.28.multiHeadAttention.k.weight": "model.layers.28.self_attn.k_proj.weight",
281
- "decoderLayer.28.multiHeadAttention.v.weight": "model.layers.28.self_attn.v_proj.weight",
282
- "decoderLayer.28.multiHeadAttention.o.weight": "model.layers.28.self_attn.o_proj.weight",
283
- "decoderLayer.28.attnLayerNorm.weight": "model.layers.28.ln1.weight",
284
- "decoderLayer.28.feedForward.intermediateDense.weight": "model.layers.28.mlp.gate_proj.weight",
285
- "decoderLayer.28.feedForward.outputDense.weight": "model.layers.28.mlp.down_proj.weight",
286
- "decoderLayer.28.ffnLayerNorm.weight": "model.layers.28.ln2.weight",
287
- "decoderLayer.28.feedForward.intermediateDense2.weight": "model.layers.28.mlp.up_proj.weight",
288
- "decoderLayer.29.multiHeadAttention.q.weight": "model.layers.29.self_attn.q_proj.weight",
289
- "decoderLayer.29.multiHeadAttention.k.weight": "model.layers.29.self_attn.k_proj.weight",
290
- "decoderLayer.29.multiHeadAttention.v.weight": "model.layers.29.self_attn.v_proj.weight",
291
- "decoderLayer.29.multiHeadAttention.o.weight": "model.layers.29.self_attn.o_proj.weight",
292
- "decoderLayer.29.attnLayerNorm.weight": "model.layers.29.ln1.weight",
293
- "decoderLayer.29.feedForward.intermediateDense.weight": "model.layers.29.mlp.gate_proj.weight",
294
- "decoderLayer.29.feedForward.outputDense.weight": "model.layers.29.mlp.down_proj.weight",
295
- "decoderLayer.29.ffnLayerNorm.weight": "model.layers.29.ln2.weight",
296
- "decoderLayer.29.feedForward.intermediateDense2.weight": "model.layers.29.mlp.up_proj.weight",
297
- "decoderLayer.30.multiHeadAttention.q.weight": "model.layers.30.self_attn.q_proj.weight",
298
- "decoderLayer.30.multiHeadAttention.k.weight": "model.layers.30.self_attn.k_proj.weight",
299
- "decoderLayer.30.multiHeadAttention.v.weight": "model.layers.30.self_attn.v_proj.weight",
300
- "decoderLayer.30.multiHeadAttention.o.weight": "model.layers.30.self_attn.o_proj.weight",
301
- "decoderLayer.30.attnLayerNorm.weight": "model.layers.30.ln1.weight",
302
- "decoderLayer.30.feedForward.intermediateDense.weight": "model.layers.30.mlp.gate_proj.weight",
303
- "decoderLayer.30.feedForward.outputDense.weight": "model.layers.30.mlp.down_proj.weight",
304
- "decoderLayer.30.ffnLayerNorm.weight": "model.layers.30.ln2.weight",
305
- "decoderLayer.30.feedForward.intermediateDense2.weight": "model.layers.30.mlp.up_proj.weight",
306
- "decoderLayer.31.multiHeadAttention.q.weight": "model.layers.31.self_attn.q_proj.weight",
307
- "decoderLayer.31.multiHeadAttention.k.weight": "model.layers.31.self_attn.k_proj.weight",
308
- "decoderLayer.31.multiHeadAttention.v.weight": "model.layers.31.self_attn.v_proj.weight",
309
- "decoderLayer.31.multiHeadAttention.o.weight": "model.layers.31.self_attn.o_proj.weight",
310
- "decoderLayer.31.attnLayerNorm.weight": "model.layers.31.ln1.weight",
311
- "decoderLayer.31.feedForward.intermediateDense.weight": "model.layers.31.mlp.gate_proj.weight",
312
- "decoderLayer.31.feedForward.outputDense.weight": "model.layers.31.mlp.down_proj.weight",
313
- "decoderLayer.31.ffnLayerNorm.weight": "model.layers.31.ln2.weight",
314
- "decoderLayer.31.feedForward.intermediateDense2.weight": "model.layers.31.mlp.up_proj.weight",
315
- "decoderLayer.32.multiHeadAttention.q.weight": "model.layers.32.self_attn.q_proj.weight",
316
- "decoderLayer.32.multiHeadAttention.k.weight": "model.layers.32.self_attn.k_proj.weight",
317
- "decoderLayer.32.multiHeadAttention.v.weight": "model.layers.32.self_attn.v_proj.weight",
318
- "decoderLayer.32.multiHeadAttention.o.weight": "model.layers.32.self_attn.o_proj.weight",
319
- "decoderLayer.32.attnLayerNorm.weight": "model.layers.32.ln1.weight",
320
- "decoderLayer.32.feedForward.intermediateDense.weight": "model.layers.32.mlp.gate_proj.weight",
321
- "decoderLayer.32.feedForward.outputDense.weight": "model.layers.32.mlp.down_proj.weight",
322
- "decoderLayer.32.ffnLayerNorm.weight": "model.layers.32.ln2.weight",
323
- "decoderLayer.32.feedForward.intermediateDense2.weight": "model.layers.32.mlp.up_proj.weight",
324
- "decoderLayer.33.multiHeadAttention.q.weight": "model.layers.33.self_attn.q_proj.weight",
325
- "decoderLayer.33.multiHeadAttention.k.weight": "model.layers.33.self_attn.k_proj.weight",
326
- "decoderLayer.33.multiHeadAttention.v.weight": "model.layers.33.self_attn.v_proj.weight",
327
- "decoderLayer.33.multiHeadAttention.o.weight": "model.layers.33.self_attn.o_proj.weight",
328
- "decoderLayer.33.attnLayerNorm.weight": "model.layers.33.ln1.weight",
329
- "decoderLayer.33.feedForward.intermediateDense.weight": "model.layers.33.mlp.gate_proj.weight",
330
- "decoderLayer.33.feedForward.outputDense.weight": "model.layers.33.mlp.down_proj.weight",
331
- "decoderLayer.33.ffnLayerNorm.weight": "model.layers.33.ln2.weight",
332
- "decoderLayer.33.feedForward.intermediateDense2.weight": "model.layers.33.mlp.up_proj.weight",
333
- "decoderLayer.34.multiHeadAttention.q.weight": "model.layers.34.self_attn.q_proj.weight",
334
- "decoderLayer.34.multiHeadAttention.k.weight": "model.layers.34.self_attn.k_proj.weight",
335
- "decoderLayer.34.multiHeadAttention.v.weight": "model.layers.34.self_attn.v_proj.weight",
336
- "decoderLayer.34.multiHeadAttention.o.weight": "model.layers.34.self_attn.o_proj.weight",
337
- "decoderLayer.34.attnLayerNorm.weight": "model.layers.34.ln1.weight",
338
- "decoderLayer.34.feedForward.intermediateDense.weight": "model.layers.34.mlp.gate_proj.weight",
339
- "decoderLayer.34.feedForward.outputDense.weight": "model.layers.34.mlp.down_proj.weight",
340
- "decoderLayer.34.ffnLayerNorm.weight": "model.layers.34.ln2.weight",
341
- "decoderLayer.34.feedForward.intermediateDense2.weight": "model.layers.34.mlp.up_proj.weight",
342
- "decoderLayer.35.multiHeadAttention.q.weight": "model.layers.35.self_attn.q_proj.weight",
343
- "decoderLayer.35.multiHeadAttention.k.weight": "model.layers.35.self_attn.k_proj.weight",
344
- "decoderLayer.35.multiHeadAttention.v.weight": "model.layers.35.self_attn.v_proj.weight",
345
- "decoderLayer.35.multiHeadAttention.o.weight": "model.layers.35.self_attn.o_proj.weight",
346
- "decoderLayer.35.attnLayerNorm.weight": "model.layers.35.ln1.weight",
347
- "decoderLayer.35.feedForward.intermediateDense.weight": "model.layers.35.mlp.gate_proj.weight",
348
- "decoderLayer.35.feedForward.outputDense.weight": "model.layers.35.mlp.down_proj.weight",
349
- "decoderLayer.35.ffnLayerNorm.weight": "model.layers.35.ln2.weight",
350
- "decoderLayer.35.feedForward.intermediateDense2.weight": "model.layers.35.mlp.up_proj.weight",
351
- "decoderLayer.36.multiHeadAttention.q.weight": "model.layers.36.self_attn.q_proj.weight",
352
- "decoderLayer.36.multiHeadAttention.k.weight": "model.layers.36.self_attn.k_proj.weight",
353
- "decoderLayer.36.multiHeadAttention.v.weight": "model.layers.36.self_attn.v_proj.weight",
354
- "decoderLayer.36.multiHeadAttention.o.weight": "model.layers.36.self_attn.o_proj.weight",
355
- "decoderLayer.36.attnLayerNorm.weight": "model.layers.36.ln1.weight",
356
- "decoderLayer.36.feedForward.intermediateDense.weight": "model.layers.36.mlp.gate_proj.weight",
357
- "decoderLayer.36.feedForward.outputDense.weight": "model.layers.36.mlp.down_proj.weight",
358
- "decoderLayer.36.ffnLayerNorm.weight": "model.layers.36.ln2.weight",
359
- "decoderLayer.36.feedForward.intermediateDense2.weight": "model.layers.36.mlp.up_proj.weight",
360
- "decoderLayer.37.multiHeadAttention.q.weight": "model.layers.37.self_attn.q_proj.weight",
361
- "decoderLayer.37.multiHeadAttention.k.weight": "model.layers.37.self_attn.k_proj.weight",
362
- "decoderLayer.37.multiHeadAttention.v.weight": "model.layers.37.self_attn.v_proj.weight",
363
- "decoderLayer.37.multiHeadAttention.o.weight": "model.layers.37.self_attn.o_proj.weight",
364
- "decoderLayer.37.attnLayerNorm.weight": "model.layers.37.ln1.weight",
365
- "decoderLayer.37.feedForward.intermediateDense.weight": "model.layers.37.mlp.gate_proj.weight",
366
- "decoderLayer.37.feedForward.outputDense.weight": "model.layers.37.mlp.down_proj.weight",
367
- "decoderLayer.37.ffnLayerNorm.weight": "model.layers.37.ln2.weight",
368
- "decoderLayer.37.feedForward.intermediateDense2.weight": "model.layers.37.mlp.up_proj.weight",
369
- "decoderLayer.38.multiHeadAttention.q.weight": "model.layers.38.self_attn.q_proj.weight",
370
- "decoderLayer.38.multiHeadAttention.k.weight": "model.layers.38.self_attn.k_proj.weight",
371
- "decoderLayer.38.multiHeadAttention.v.weight": "model.layers.38.self_attn.v_proj.weight",
372
- "decoderLayer.38.multiHeadAttention.o.weight": "model.layers.38.self_attn.o_proj.weight",
373
- "decoderLayer.38.attnLayerNorm.weight": "model.layers.38.ln1.weight",
374
- "decoderLayer.38.feedForward.intermediateDense.weight": "model.layers.38.mlp.gate_proj.weight",
375
- "decoderLayer.38.feedForward.outputDense.weight": "model.layers.38.mlp.down_proj.weight",
376
- "decoderLayer.38.ffnLayerNorm.weight": "model.layers.38.ln2.weight",
377
- "decoderLayer.38.feedForward.intermediateDense2.weight": "model.layers.38.mlp.up_proj.weight",
378
- "decoderLayer.39.multiHeadAttention.q.weight": "model.layers.39.self_attn.q_proj.weight",
379
- "decoderLayer.39.multiHeadAttention.k.weight": "model.layers.39.self_attn.k_proj.weight",
380
- "decoderLayer.39.multiHeadAttention.v.weight": "model.layers.39.self_attn.v_proj.weight",
381
- "decoderLayer.39.multiHeadAttention.o.weight": "model.layers.39.self_attn.o_proj.weight",
382
- "decoderLayer.39.attnLayerNorm.weight": "model.layers.39.ln1.weight",
383
- "decoderLayer.39.feedForward.intermediateDense.weight": "model.layers.39.mlp.gate_proj.weight",
384
- "decoderLayer.39.feedForward.outputDense.weight": "model.layers.39.mlp.down_proj.weight",
385
- "decoderLayer.39.ffnLayerNorm.weight": "model.layers.39.ln2.weight",
386
- "decoderLayer.39.feedForward.intermediateDense2.weight": "model.layers.39.mlp.up_proj.weight",
387
- "decoderLayer.40.multiHeadAttention.q.weight": "model.layers.40.self_attn.q_proj.weight",
388
- "decoderLayer.40.multiHeadAttention.k.weight": "model.layers.40.self_attn.k_proj.weight",
389
- "decoderLayer.40.multiHeadAttention.v.weight": "model.layers.40.self_attn.v_proj.weight",
390
- "decoderLayer.40.multiHeadAttention.o.weight": "model.layers.40.self_attn.o_proj.weight",
391
- "decoderLayer.40.attnLayerNorm.weight": "model.layers.40.ln1.weight",
392
- "decoderLayer.40.feedForward.intermediateDense.weight": "model.layers.40.mlp.gate_proj.weight",
393
- "decoderLayer.40.feedForward.outputDense.weight": "model.layers.40.mlp.down_proj.weight",
394
- "decoderLayer.40.ffnLayerNorm.weight": "model.layers.40.ln2.weight",
395
- "decoderLayer.40.feedForward.intermediateDense2.weight": "model.layers.40.mlp.up_proj.weight",
396
- "decoderLayer.41.multiHeadAttention.q.weight": "model.layers.41.self_attn.q_proj.weight",
397
- "decoderLayer.41.multiHeadAttention.k.weight": "model.layers.41.self_attn.k_proj.weight",
398
- "decoderLayer.41.multiHeadAttention.v.weight": "model.layers.41.self_attn.v_proj.weight",
399
- "decoderLayer.41.multiHeadAttention.o.weight": "model.layers.41.self_attn.o_proj.weight",
400
- "decoderLayer.41.attnLayerNorm.weight": "model.layers.41.ln1.weight",
401
- "decoderLayer.41.feedForward.intermediateDense.weight": "model.layers.41.mlp.gate_proj.weight",
402
- "decoderLayer.41.feedForward.outputDense.weight": "model.layers.41.mlp.down_proj.weight",
403
- "decoderLayer.41.ffnLayerNorm.weight": "model.layers.41.ln2.weight",
404
- "decoderLayer.41.feedForward.intermediateDense2.weight": "model.layers.41.mlp.up_proj.weight",
405
- "decoderLayer.42.multiHeadAttention.q.weight": "model.layers.42.self_attn.q_proj.weight",
406
- "decoderLayer.42.multiHeadAttention.k.weight": "model.layers.42.self_attn.k_proj.weight",
407
- "decoderLayer.42.multiHeadAttention.v.weight": "model.layers.42.self_attn.v_proj.weight",
408
- "decoderLayer.42.multiHeadAttention.o.weight": "model.layers.42.self_attn.o_proj.weight",
409
- "decoderLayer.42.attnLayerNorm.weight": "model.layers.42.ln1.weight",
410
- "decoderLayer.42.feedForward.intermediateDense.weight": "model.layers.42.mlp.gate_proj.weight",
411
- "decoderLayer.42.feedForward.outputDense.weight": "model.layers.42.mlp.down_proj.weight",
412
- "decoderLayer.42.ffnLayerNorm.weight": "model.layers.42.ln2.weight",
413
- "decoderLayer.42.feedForward.intermediateDense2.weight": "model.layers.42.mlp.up_proj.weight",
414
- "decoderLayer.43.multiHeadAttention.q.weight": "model.layers.43.self_attn.q_proj.weight",
415
- "decoderLayer.43.multiHeadAttention.k.weight": "model.layers.43.self_attn.k_proj.weight",
416
- "decoderLayer.43.multiHeadAttention.v.weight": "model.layers.43.self_attn.v_proj.weight",
417
- "decoderLayer.43.multiHeadAttention.o.weight": "model.layers.43.self_attn.o_proj.weight",
418
- "decoderLayer.43.attnLayerNorm.weight": "model.layers.43.ln1.weight",
419
- "decoderLayer.43.feedForward.intermediateDense.weight": "model.layers.43.mlp.gate_proj.weight",
420
- "decoderLayer.43.feedForward.outputDense.weight": "model.layers.43.mlp.down_proj.weight",
421
- "decoderLayer.43.ffnLayerNorm.weight": "model.layers.43.ln2.weight",
422
- "decoderLayer.43.feedForward.intermediateDense2.weight": "model.layers.43.mlp.up_proj.weight",
423
- "decoderLayer.44.multiHeadAttention.q.weight": "model.layers.44.self_attn.q_proj.weight",
424
- "decoderLayer.44.multiHeadAttention.k.weight": "model.layers.44.self_attn.k_proj.weight",
425
- "decoderLayer.44.multiHeadAttention.v.weight": "model.layers.44.self_attn.v_proj.weight",
426
- "decoderLayer.44.multiHeadAttention.o.weight": "model.layers.44.self_attn.o_proj.weight",
427
- "decoderLayer.44.attnLayerNorm.weight": "model.layers.44.ln1.weight",
428
- "decoderLayer.44.feedForward.intermediateDense.weight": "model.layers.44.mlp.gate_proj.weight",
429
- "decoderLayer.44.feedForward.outputDense.weight": "model.layers.44.mlp.down_proj.weight",
430
- "decoderLayer.44.ffnLayerNorm.weight": "model.layers.44.ln2.weight",
431
- "decoderLayer.44.feedForward.intermediateDense2.weight": "model.layers.44.mlp.up_proj.weight",
432
- "decoderLayer.45.multiHeadAttention.q.weight": "model.layers.45.self_attn.q_proj.weight",
433
- "decoderLayer.45.multiHeadAttention.k.weight": "model.layers.45.self_attn.k_proj.weight",
434
- "decoderLayer.45.multiHeadAttention.v.weight": "model.layers.45.self_attn.v_proj.weight",
435
- "decoderLayer.45.multiHeadAttention.o.weight": "model.layers.45.self_attn.o_proj.weight",
436
- "decoderLayer.45.attnLayerNorm.weight": "model.layers.45.ln1.weight",
437
- "decoderLayer.45.feedForward.intermediateDense.weight": "model.layers.45.mlp.gate_proj.weight",
438
- "decoderLayer.45.feedForward.outputDense.weight": "model.layers.45.mlp.down_proj.weight",
439
- "decoderLayer.45.ffnLayerNorm.weight": "model.layers.45.ln2.weight",
440
- "decoderLayer.45.feedForward.intermediateDense2.weight": "model.layers.45.mlp.up_proj.weight",
441
- "decoderLayer.46.multiHeadAttention.q.weight": "model.layers.46.self_attn.q_proj.weight",
442
- "decoderLayer.46.multiHeadAttention.k.weight": "model.layers.46.self_attn.k_proj.weight",
443
- "decoderLayer.46.multiHeadAttention.v.weight": "model.layers.46.self_attn.v_proj.weight",
444
- "decoderLayer.46.multiHeadAttention.o.weight": "model.layers.46.self_attn.o_proj.weight",
445
- "decoderLayer.46.attnLayerNorm.weight": "model.layers.46.ln1.weight",
446
- "decoderLayer.46.feedForward.intermediateDense.weight": "model.layers.46.mlp.gate_proj.weight",
447
- "decoderLayer.46.feedForward.outputDense.weight": "model.layers.46.mlp.down_proj.weight",
448
- "decoderLayer.46.ffnLayerNorm.weight": "model.layers.46.ln2.weight",
449
- "decoderLayer.46.feedForward.intermediateDense2.weight": "model.layers.46.mlp.up_proj.weight",
450
- "decoderLayer.47.multiHeadAttention.q.weight": "model.layers.47.self_attn.q_proj.weight",
451
- "decoderLayer.47.multiHeadAttention.k.weight": "model.layers.47.self_attn.k_proj.weight",
452
- "decoderLayer.47.multiHeadAttention.v.weight": "model.layers.47.self_attn.v_proj.weight",
453
- "decoderLayer.47.multiHeadAttention.o.weight": "model.layers.47.self_attn.o_proj.weight",
454
- "decoderLayer.47.attnLayerNorm.weight": "model.layers.47.ln1.weight",
455
- "decoderLayer.47.feedForward.intermediateDense.weight": "model.layers.47.mlp.gate_proj.weight",
456
- "decoderLayer.47.feedForward.outputDense.weight": "model.layers.47.mlp.down_proj.weight",
457
- "decoderLayer.47.ffnLayerNorm.weight": "model.layers.47.ln2.weight",
458
- "decoderLayer.47.feedForward.intermediateDense2.weight": "model.layers.47.mlp.up_proj.weight"
459
- }
460
- }
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": 2,
4
+ "hidden_act": "silu",
5
+ "hidden_size": 4096,
6
+ "initializer_range": 0.02,
7
+ "intermediate_size": 11008,
8
+ "max_position_embeddings": 4096,
9
+ "model": "llama",
10
+ "num_attention_heads": 32,
11
+ "num_hidden_layers": 48,
12
+ "num_key_value_heads": 4,
13
+ "pad_token_id": 0,
14
+ "layer_norm_eps": 1e-06,
15
+ "rope_theta": 10000.0,
16
+ "tie_word_embeddings": false,
17
+ "torch_dtype": "bfloat16",
18
+ "vocab_size": 64000,
19
+ "skip_init": true,
20
+ "rope_rank": "updown",
21
+ "segment_vocab_size": 0,
22
+ "generation_config": {
23
+ "tokenizer_decode_config": {
24
+ "skip_special_tokens": true
25
+ },
26
+ "max_length": 4096,
27
+ "eos_token_id": 2
28
+ },
29
+ "mapping": {
30
+ "embeddings.word_embeddings.weight": "model.embed_tokens.weight",
31
+ "LayerNormFinal.weight": "model.norm.weight",
32
+ "lm_head.weight": "lm_head.weight",
33
+ "decoderLayer.0.multiHeadAttention.q.weight": "model.layers.0.self_attn.q_proj.weight",
34
+ "decoderLayer.0.multiHeadAttention.k.weight": "model.layers.0.self_attn.k_proj.weight",
35
+ "decoderLayer.0.multiHeadAttention.v.weight": "model.layers.0.self_attn.v_proj.weight",
36
+ "decoderLayer.0.multiHeadAttention.o.weight": "model.layers.0.self_attn.o_proj.weight",
37
+ "decoderLayer.0.attnLayerNorm.weight": "model.layers.0.ln1.weight",
38
+ "decoderLayer.0.feedForward.intermediateDense.weight": "model.layers.0.mlp.gate_proj.weight",
39
+ "decoderLayer.0.feedForward.outputDense.weight": "model.layers.0.mlp.down_proj.weight",
40
+ "decoderLayer.0.ffnLayerNorm.weight": "model.layers.0.ln2.weight",
41
+ "decoderLayer.0.feedForward.intermediateDense2.weight": "model.layers.0.mlp.up_proj.weight",
42
+ "decoderLayer.1.multiHeadAttention.q.weight": "model.layers.1.self_attn.q_proj.weight",
43
+ "decoderLayer.1.multiHeadAttention.k.weight": "model.layers.1.self_attn.k_proj.weight",
44
+ "decoderLayer.1.multiHeadAttention.v.weight": "model.layers.1.self_attn.v_proj.weight",
45
+ "decoderLayer.1.multiHeadAttention.o.weight": "model.layers.1.self_attn.o_proj.weight",
46
+ "decoderLayer.1.attnLayerNorm.weight": "model.layers.1.ln1.weight",
47
+ "decoderLayer.1.feedForward.intermediateDense.weight": "model.layers.1.mlp.gate_proj.weight",
48
+ "decoderLayer.1.feedForward.outputDense.weight": "model.layers.1.mlp.down_proj.weight",
49
+ "decoderLayer.1.ffnLayerNorm.weight": "model.layers.1.ln2.weight",
50
+ "decoderLayer.1.feedForward.intermediateDense2.weight": "model.layers.1.mlp.up_proj.weight",
51
+ "decoderLayer.2.multiHeadAttention.q.weight": "model.layers.2.self_attn.q_proj.weight",
52
+ "decoderLayer.2.multiHeadAttention.k.weight": "model.layers.2.self_attn.k_proj.weight",
53
+ "decoderLayer.2.multiHeadAttention.v.weight": "model.layers.2.self_attn.v_proj.weight",
54
+ "decoderLayer.2.multiHeadAttention.o.weight": "model.layers.2.self_attn.o_proj.weight",
55
+ "decoderLayer.2.attnLayerNorm.weight": "model.layers.2.ln1.weight",
56
+ "decoderLayer.2.feedForward.intermediateDense.weight": "model.layers.2.mlp.gate_proj.weight",
57
+ "decoderLayer.2.feedForward.outputDense.weight": "model.layers.2.mlp.down_proj.weight",
58
+ "decoderLayer.2.ffnLayerNorm.weight": "model.layers.2.ln2.weight",
59
+ "decoderLayer.2.feedForward.intermediateDense2.weight": "model.layers.2.mlp.up_proj.weight",
60
+ "decoderLayer.3.multiHeadAttention.q.weight": "model.layers.3.self_attn.q_proj.weight",
61
+ "decoderLayer.3.multiHeadAttention.k.weight": "model.layers.3.self_attn.k_proj.weight",
62
+ "decoderLayer.3.multiHeadAttention.v.weight": "model.layers.3.self_attn.v_proj.weight",
63
+ "decoderLayer.3.multiHeadAttention.o.weight": "model.layers.3.self_attn.o_proj.weight",
64
+ "decoderLayer.3.attnLayerNorm.weight": "model.layers.3.ln1.weight",
65
+ "decoderLayer.3.feedForward.intermediateDense.weight": "model.layers.3.mlp.gate_proj.weight",
66
+ "decoderLayer.3.feedForward.outputDense.weight": "model.layers.3.mlp.down_proj.weight",
67
+ "decoderLayer.3.ffnLayerNorm.weight": "model.layers.3.ln2.weight",
68
+ "decoderLayer.3.feedForward.intermediateDense2.weight": "model.layers.3.mlp.up_proj.weight",
69
+ "decoderLayer.4.multiHeadAttention.q.weight": "model.layers.4.self_attn.q_proj.weight",
70
+ "decoderLayer.4.multiHeadAttention.k.weight": "model.layers.4.self_attn.k_proj.weight",
71
+ "decoderLayer.4.multiHeadAttention.v.weight": "model.layers.4.self_attn.v_proj.weight",
72
+ "decoderLayer.4.multiHeadAttention.o.weight": "model.layers.4.self_attn.o_proj.weight",
73
+ "decoderLayer.4.attnLayerNorm.weight": "model.layers.4.ln1.weight",
74
+ "decoderLayer.4.feedForward.intermediateDense.weight": "model.layers.4.mlp.gate_proj.weight",
75
+ "decoderLayer.4.feedForward.outputDense.weight": "model.layers.4.mlp.down_proj.weight",
76
+ "decoderLayer.4.ffnLayerNorm.weight": "model.layers.4.ln2.weight",
77
+ "decoderLayer.4.feedForward.intermediateDense2.weight": "model.layers.4.mlp.up_proj.weight",
78
+ "decoderLayer.5.multiHeadAttention.q.weight": "model.layers.5.self_attn.q_proj.weight",
79
+ "decoderLayer.5.multiHeadAttention.k.weight": "model.layers.5.self_attn.k_proj.weight",
80
+ "decoderLayer.5.multiHeadAttention.v.weight": "model.layers.5.self_attn.v_proj.weight",
81
+ "decoderLayer.5.multiHeadAttention.o.weight": "model.layers.5.self_attn.o_proj.weight",
82
+ "decoderLayer.5.attnLayerNorm.weight": "model.layers.5.ln1.weight",
83
+ "decoderLayer.5.feedForward.intermediateDense.weight": "model.layers.5.mlp.gate_proj.weight",
84
+ "decoderLayer.5.feedForward.outputDense.weight": "model.layers.5.mlp.down_proj.weight",
85
+ "decoderLayer.5.ffnLayerNorm.weight": "model.layers.5.ln2.weight",
86
+ "decoderLayer.5.feedForward.intermediateDense2.weight": "model.layers.5.mlp.up_proj.weight",
87
+ "decoderLayer.6.multiHeadAttention.q.weight": "model.layers.6.self_attn.q_proj.weight",
88
+ "decoderLayer.6.multiHeadAttention.k.weight": "model.layers.6.self_attn.k_proj.weight",
89
+ "decoderLayer.6.multiHeadAttention.v.weight": "model.layers.6.self_attn.v_proj.weight",
90
+ "decoderLayer.6.multiHeadAttention.o.weight": "model.layers.6.self_attn.o_proj.weight",
91
+ "decoderLayer.6.attnLayerNorm.weight": "model.layers.6.ln1.weight",
92
+ "decoderLayer.6.feedForward.intermediateDense.weight": "model.layers.6.mlp.gate_proj.weight",
93
+ "decoderLayer.6.feedForward.outputDense.weight": "model.layers.6.mlp.down_proj.weight",
94
+ "decoderLayer.6.ffnLayerNorm.weight": "model.layers.6.ln2.weight",
95
+ "decoderLayer.6.feedForward.intermediateDense2.weight": "model.layers.6.mlp.up_proj.weight",
96
+ "decoderLayer.7.multiHeadAttention.q.weight": "model.layers.7.self_attn.q_proj.weight",
97
+ "decoderLayer.7.multiHeadAttention.k.weight": "model.layers.7.self_attn.k_proj.weight",
98
+ "decoderLayer.7.multiHeadAttention.v.weight": "model.layers.7.self_attn.v_proj.weight",
99
+ "decoderLayer.7.multiHeadAttention.o.weight": "model.layers.7.self_attn.o_proj.weight",
100
+ "decoderLayer.7.attnLayerNorm.weight": "model.layers.7.ln1.weight",
101
+ "decoderLayer.7.feedForward.intermediateDense.weight": "model.layers.7.mlp.gate_proj.weight",
102
+ "decoderLayer.7.feedForward.outputDense.weight": "model.layers.7.mlp.down_proj.weight",
103
+ "decoderLayer.7.ffnLayerNorm.weight": "model.layers.7.ln2.weight",
104
+ "decoderLayer.7.feedForward.intermediateDense2.weight": "model.layers.7.mlp.up_proj.weight",
105
+ "decoderLayer.8.multiHeadAttention.q.weight": "model.layers.8.self_attn.q_proj.weight",
106
+ "decoderLayer.8.multiHeadAttention.k.weight": "model.layers.8.self_attn.k_proj.weight",
107
+ "decoderLayer.8.multiHeadAttention.v.weight": "model.layers.8.self_attn.v_proj.weight",
108
+ "decoderLayer.8.multiHeadAttention.o.weight": "model.layers.8.self_attn.o_proj.weight",
109
+ "decoderLayer.8.attnLayerNorm.weight": "model.layers.8.ln1.weight",
110
+ "decoderLayer.8.feedForward.intermediateDense.weight": "model.layers.8.mlp.gate_proj.weight",
111
+ "decoderLayer.8.feedForward.outputDense.weight": "model.layers.8.mlp.down_proj.weight",
112
+ "decoderLayer.8.ffnLayerNorm.weight": "model.layers.8.ln2.weight",
113
+ "decoderLayer.8.feedForward.intermediateDense2.weight": "model.layers.8.mlp.up_proj.weight",
114
+ "decoderLayer.9.multiHeadAttention.q.weight": "model.layers.9.self_attn.q_proj.weight",
115
+ "decoderLayer.9.multiHeadAttention.k.weight": "model.layers.9.self_attn.k_proj.weight",
116
+ "decoderLayer.9.multiHeadAttention.v.weight": "model.layers.9.self_attn.v_proj.weight",
117
+ "decoderLayer.9.multiHeadAttention.o.weight": "model.layers.9.self_attn.o_proj.weight",
118
+ "decoderLayer.9.attnLayerNorm.weight": "model.layers.9.ln1.weight",
119
+ "decoderLayer.9.feedForward.intermediateDense.weight": "model.layers.9.mlp.gate_proj.weight",
120
+ "decoderLayer.9.feedForward.outputDense.weight": "model.layers.9.mlp.down_proj.weight",
121
+ "decoderLayer.9.ffnLayerNorm.weight": "model.layers.9.ln2.weight",
122
+ "decoderLayer.9.feedForward.intermediateDense2.weight": "model.layers.9.mlp.up_proj.weight",
123
+ "decoderLayer.10.multiHeadAttention.q.weight": "model.layers.10.self_attn.q_proj.weight",
124
+ "decoderLayer.10.multiHeadAttention.k.weight": "model.layers.10.self_attn.k_proj.weight",
125
+ "decoderLayer.10.multiHeadAttention.v.weight": "model.layers.10.self_attn.v_proj.weight",
126
+ "decoderLayer.10.multiHeadAttention.o.weight": "model.layers.10.self_attn.o_proj.weight",
127
+ "decoderLayer.10.attnLayerNorm.weight": "model.layers.10.ln1.weight",
128
+ "decoderLayer.10.feedForward.intermediateDense.weight": "model.layers.10.mlp.gate_proj.weight",
129
+ "decoderLayer.10.feedForward.outputDense.weight": "model.layers.10.mlp.down_proj.weight",
130
+ "decoderLayer.10.ffnLayerNorm.weight": "model.layers.10.ln2.weight",
131
+ "decoderLayer.10.feedForward.intermediateDense2.weight": "model.layers.10.mlp.up_proj.weight",
132
+ "decoderLayer.11.multiHeadAttention.q.weight": "model.layers.11.self_attn.q_proj.weight",
133
+ "decoderLayer.11.multiHeadAttention.k.weight": "model.layers.11.self_attn.k_proj.weight",
134
+ "decoderLayer.11.multiHeadAttention.v.weight": "model.layers.11.self_attn.v_proj.weight",
135
+ "decoderLayer.11.multiHeadAttention.o.weight": "model.layers.11.self_attn.o_proj.weight",
136
+ "decoderLayer.11.attnLayerNorm.weight": "model.layers.11.ln1.weight",
137
+ "decoderLayer.11.feedForward.intermediateDense.weight": "model.layers.11.mlp.gate_proj.weight",
138
+ "decoderLayer.11.feedForward.outputDense.weight": "model.layers.11.mlp.down_proj.weight",
139
+ "decoderLayer.11.ffnLayerNorm.weight": "model.layers.11.ln2.weight",
140
+ "decoderLayer.11.feedForward.intermediateDense2.weight": "model.layers.11.mlp.up_proj.weight",
141
+ "decoderLayer.12.multiHeadAttention.q.weight": "model.layers.12.self_attn.q_proj.weight",
142
+ "decoderLayer.12.multiHeadAttention.k.weight": "model.layers.12.self_attn.k_proj.weight",
143
+ "decoderLayer.12.multiHeadAttention.v.weight": "model.layers.12.self_attn.v_proj.weight",
144
+ "decoderLayer.12.multiHeadAttention.o.weight": "model.layers.12.self_attn.o_proj.weight",
145
+ "decoderLayer.12.attnLayerNorm.weight": "model.layers.12.ln1.weight",
146
+ "decoderLayer.12.feedForward.intermediateDense.weight": "model.layers.12.mlp.gate_proj.weight",
147
+ "decoderLayer.12.feedForward.outputDense.weight": "model.layers.12.mlp.down_proj.weight",
148
+ "decoderLayer.12.ffnLayerNorm.weight": "model.layers.12.ln2.weight",
149
+ "decoderLayer.12.feedForward.intermediateDense2.weight": "model.layers.12.mlp.up_proj.weight",
150
+ "decoderLayer.13.multiHeadAttention.q.weight": "model.layers.13.self_attn.q_proj.weight",
151
+ "decoderLayer.13.multiHeadAttention.k.weight": "model.layers.13.self_attn.k_proj.weight",
152
+ "decoderLayer.13.multiHeadAttention.v.weight": "model.layers.13.self_attn.v_proj.weight",
153
+ "decoderLayer.13.multiHeadAttention.o.weight": "model.layers.13.self_attn.o_proj.weight",
154
+ "decoderLayer.13.attnLayerNorm.weight": "model.layers.13.ln1.weight",
155
+ "decoderLayer.13.feedForward.intermediateDense.weight": "model.layers.13.mlp.gate_proj.weight",
156
+ "decoderLayer.13.feedForward.outputDense.weight": "model.layers.13.mlp.down_proj.weight",
157
+ "decoderLayer.13.ffnLayerNorm.weight": "model.layers.13.ln2.weight",
158
+ "decoderLayer.13.feedForward.intermediateDense2.weight": "model.layers.13.mlp.up_proj.weight",
159
+ "decoderLayer.14.multiHeadAttention.q.weight": "model.layers.14.self_attn.q_proj.weight",
160
+ "decoderLayer.14.multiHeadAttention.k.weight": "model.layers.14.self_attn.k_proj.weight",
161
+ "decoderLayer.14.multiHeadAttention.v.weight": "model.layers.14.self_attn.v_proj.weight",
162
+ "decoderLayer.14.multiHeadAttention.o.weight": "model.layers.14.self_attn.o_proj.weight",
163
+ "decoderLayer.14.attnLayerNorm.weight": "model.layers.14.ln1.weight",
164
+ "decoderLayer.14.feedForward.intermediateDense.weight": "model.layers.14.mlp.gate_proj.weight",
165
+ "decoderLayer.14.feedForward.outputDense.weight": "model.layers.14.mlp.down_proj.weight",
166
+ "decoderLayer.14.ffnLayerNorm.weight": "model.layers.14.ln2.weight",
167
+ "decoderLayer.14.feedForward.intermediateDense2.weight": "model.layers.14.mlp.up_proj.weight",
168
+ "decoderLayer.15.multiHeadAttention.q.weight": "model.layers.15.self_attn.q_proj.weight",
169
+ "decoderLayer.15.multiHeadAttention.k.weight": "model.layers.15.self_attn.k_proj.weight",
170
+ "decoderLayer.15.multiHeadAttention.v.weight": "model.layers.15.self_attn.v_proj.weight",
171
+ "decoderLayer.15.multiHeadAttention.o.weight": "model.layers.15.self_attn.o_proj.weight",
172
+ "decoderLayer.15.attnLayerNorm.weight": "model.layers.15.ln1.weight",
173
+ "decoderLayer.15.feedForward.intermediateDense.weight": "model.layers.15.mlp.gate_proj.weight",
174
+ "decoderLayer.15.feedForward.outputDense.weight": "model.layers.15.mlp.down_proj.weight",
175
+ "decoderLayer.15.ffnLayerNorm.weight": "model.layers.15.ln2.weight",
176
+ "decoderLayer.15.feedForward.intermediateDense2.weight": "model.layers.15.mlp.up_proj.weight",
177
+ "decoderLayer.16.multiHeadAttention.q.weight": "model.layers.16.self_attn.q_proj.weight",
178
+ "decoderLayer.16.multiHeadAttention.k.weight": "model.layers.16.self_attn.k_proj.weight",
179
+ "decoderLayer.16.multiHeadAttention.v.weight": "model.layers.16.self_attn.v_proj.weight",
180
+ "decoderLayer.16.multiHeadAttention.o.weight": "model.layers.16.self_attn.o_proj.weight",
181
+ "decoderLayer.16.attnLayerNorm.weight": "model.layers.16.ln1.weight",
182
+ "decoderLayer.16.feedForward.intermediateDense.weight": "model.layers.16.mlp.gate_proj.weight",
183
+ "decoderLayer.16.feedForward.outputDense.weight": "model.layers.16.mlp.down_proj.weight",
184
+ "decoderLayer.16.ffnLayerNorm.weight": "model.layers.16.ln2.weight",
185
+ "decoderLayer.16.feedForward.intermediateDense2.weight": "model.layers.16.mlp.up_proj.weight",
186
+ "decoderLayer.17.multiHeadAttention.q.weight": "model.layers.17.self_attn.q_proj.weight",
187
+ "decoderLayer.17.multiHeadAttention.k.weight": "model.layers.17.self_attn.k_proj.weight",
188
+ "decoderLayer.17.multiHeadAttention.v.weight": "model.layers.17.self_attn.v_proj.weight",
189
+ "decoderLayer.17.multiHeadAttention.o.weight": "model.layers.17.self_attn.o_proj.weight",
190
+ "decoderLayer.17.attnLayerNorm.weight": "model.layers.17.ln1.weight",
191
+ "decoderLayer.17.feedForward.intermediateDense.weight": "model.layers.17.mlp.gate_proj.weight",
192
+ "decoderLayer.17.feedForward.outputDense.weight": "model.layers.17.mlp.down_proj.weight",
193
+ "decoderLayer.17.ffnLayerNorm.weight": "model.layers.17.ln2.weight",
194
+ "decoderLayer.17.feedForward.intermediateDense2.weight": "model.layers.17.mlp.up_proj.weight",
195
+ "decoderLayer.18.multiHeadAttention.q.weight": "model.layers.18.self_attn.q_proj.weight",
196
+ "decoderLayer.18.multiHeadAttention.k.weight": "model.layers.18.self_attn.k_proj.weight",
197
+ "decoderLayer.18.multiHeadAttention.v.weight": "model.layers.18.self_attn.v_proj.weight",
198
+ "decoderLayer.18.multiHeadAttention.o.weight": "model.layers.18.self_attn.o_proj.weight",
199
+ "decoderLayer.18.attnLayerNorm.weight": "model.layers.18.ln1.weight",
200
+ "decoderLayer.18.feedForward.intermediateDense.weight": "model.layers.18.mlp.gate_proj.weight",
201
+ "decoderLayer.18.feedForward.outputDense.weight": "model.layers.18.mlp.down_proj.weight",
202
+ "decoderLayer.18.ffnLayerNorm.weight": "model.layers.18.ln2.weight",
203
+ "decoderLayer.18.feedForward.intermediateDense2.weight": "model.layers.18.mlp.up_proj.weight",
204
+ "decoderLayer.19.multiHeadAttention.q.weight": "model.layers.19.self_attn.q_proj.weight",
205
+ "decoderLayer.19.multiHeadAttention.k.weight": "model.layers.19.self_attn.k_proj.weight",
206
+ "decoderLayer.19.multiHeadAttention.v.weight": "model.layers.19.self_attn.v_proj.weight",
207
+ "decoderLayer.19.multiHeadAttention.o.weight": "model.layers.19.self_attn.o_proj.weight",
208
+ "decoderLayer.19.attnLayerNorm.weight": "model.layers.19.ln1.weight",
209
+ "decoderLayer.19.feedForward.intermediateDense.weight": "model.layers.19.mlp.gate_proj.weight",
210
+ "decoderLayer.19.feedForward.outputDense.weight": "model.layers.19.mlp.down_proj.weight",
211
+ "decoderLayer.19.ffnLayerNorm.weight": "model.layers.19.ln2.weight",
212
+ "decoderLayer.19.feedForward.intermediateDense2.weight": "model.layers.19.mlp.up_proj.weight",
213
+ "decoderLayer.20.multiHeadAttention.q.weight": "model.layers.20.self_attn.q_proj.weight",
214
+ "decoderLayer.20.multiHeadAttention.k.weight": "model.layers.20.self_attn.k_proj.weight",
215
+ "decoderLayer.20.multiHeadAttention.v.weight": "model.layers.20.self_attn.v_proj.weight",
216
+ "decoderLayer.20.multiHeadAttention.o.weight": "model.layers.20.self_attn.o_proj.weight",
217
+ "decoderLayer.20.attnLayerNorm.weight": "model.layers.20.ln1.weight",
218
+ "decoderLayer.20.feedForward.intermediateDense.weight": "model.layers.20.mlp.gate_proj.weight",
219
+ "decoderLayer.20.feedForward.outputDense.weight": "model.layers.20.mlp.down_proj.weight",
220
+ "decoderLayer.20.ffnLayerNorm.weight": "model.layers.20.ln2.weight",
221
+ "decoderLayer.20.feedForward.intermediateDense2.weight": "model.layers.20.mlp.up_proj.weight",
222
+ "decoderLayer.21.multiHeadAttention.q.weight": "model.layers.21.self_attn.q_proj.weight",
223
+ "decoderLayer.21.multiHeadAttention.k.weight": "model.layers.21.self_attn.k_proj.weight",
224
+ "decoderLayer.21.multiHeadAttention.v.weight": "model.layers.21.self_attn.v_proj.weight",
225
+ "decoderLayer.21.multiHeadAttention.o.weight": "model.layers.21.self_attn.o_proj.weight",
226
+ "decoderLayer.21.attnLayerNorm.weight": "model.layers.21.ln1.weight",
227
+ "decoderLayer.21.feedForward.intermediateDense.weight": "model.layers.21.mlp.gate_proj.weight",
228
+ "decoderLayer.21.feedForward.outputDense.weight": "model.layers.21.mlp.down_proj.weight",
229
+ "decoderLayer.21.ffnLayerNorm.weight": "model.layers.21.ln2.weight",
230
+ "decoderLayer.21.feedForward.intermediateDense2.weight": "model.layers.21.mlp.up_proj.weight",
231
+ "decoderLayer.22.multiHeadAttention.q.weight": "model.layers.22.self_attn.q_proj.weight",
232
+ "decoderLayer.22.multiHeadAttention.k.weight": "model.layers.22.self_attn.k_proj.weight",
233
+ "decoderLayer.22.multiHeadAttention.v.weight": "model.layers.22.self_attn.v_proj.weight",
234
+ "decoderLayer.22.multiHeadAttention.o.weight": "model.layers.22.self_attn.o_proj.weight",
235
+ "decoderLayer.22.attnLayerNorm.weight": "model.layers.22.ln1.weight",
236
+ "decoderLayer.22.feedForward.intermediateDense.weight": "model.layers.22.mlp.gate_proj.weight",
237
+ "decoderLayer.22.feedForward.outputDense.weight": "model.layers.22.mlp.down_proj.weight",
238
+ "decoderLayer.22.ffnLayerNorm.weight": "model.layers.22.ln2.weight",
239
+ "decoderLayer.22.feedForward.intermediateDense2.weight": "model.layers.22.mlp.up_proj.weight",
240
+ "decoderLayer.23.multiHeadAttention.q.weight": "model.layers.23.self_attn.q_proj.weight",
241
+ "decoderLayer.23.multiHeadAttention.k.weight": "model.layers.23.self_attn.k_proj.weight",
242
+ "decoderLayer.23.multiHeadAttention.v.weight": "model.layers.23.self_attn.v_proj.weight",
243
+ "decoderLayer.23.multiHeadAttention.o.weight": "model.layers.23.self_attn.o_proj.weight",
244
+ "decoderLayer.23.attnLayerNorm.weight": "model.layers.23.ln1.weight",
245
+ "decoderLayer.23.feedForward.intermediateDense.weight": "model.layers.23.mlp.gate_proj.weight",
246
+ "decoderLayer.23.feedForward.outputDense.weight": "model.layers.23.mlp.down_proj.weight",
247
+ "decoderLayer.23.ffnLayerNorm.weight": "model.layers.23.ln2.weight",
248
+ "decoderLayer.23.feedForward.intermediateDense2.weight": "model.layers.23.mlp.up_proj.weight",
249
+ "decoderLayer.24.multiHeadAttention.q.weight": "model.layers.24.self_attn.q_proj.weight",
250
+ "decoderLayer.24.multiHeadAttention.k.weight": "model.layers.24.self_attn.k_proj.weight",
251
+ "decoderLayer.24.multiHeadAttention.v.weight": "model.layers.24.self_attn.v_proj.weight",
252
+ "decoderLayer.24.multiHeadAttention.o.weight": "model.layers.24.self_attn.o_proj.weight",
253
+ "decoderLayer.24.attnLayerNorm.weight": "model.layers.24.ln1.weight",
254
+ "decoderLayer.24.feedForward.intermediateDense.weight": "model.layers.24.mlp.gate_proj.weight",
255
+ "decoderLayer.24.feedForward.outputDense.weight": "model.layers.24.mlp.down_proj.weight",
256
+ "decoderLayer.24.ffnLayerNorm.weight": "model.layers.24.ln2.weight",
257
+ "decoderLayer.24.feedForward.intermediateDense2.weight": "model.layers.24.mlp.up_proj.weight",
258
+ "decoderLayer.25.multiHeadAttention.q.weight": "model.layers.25.self_attn.q_proj.weight",
259
+ "decoderLayer.25.multiHeadAttention.k.weight": "model.layers.25.self_attn.k_proj.weight",
260
+ "decoderLayer.25.multiHeadAttention.v.weight": "model.layers.25.self_attn.v_proj.weight",
261
+ "decoderLayer.25.multiHeadAttention.o.weight": "model.layers.25.self_attn.o_proj.weight",
262
+ "decoderLayer.25.attnLayerNorm.weight": "model.layers.25.ln1.weight",
263
+ "decoderLayer.25.feedForward.intermediateDense.weight": "model.layers.25.mlp.gate_proj.weight",
264
+ "decoderLayer.25.feedForward.outputDense.weight": "model.layers.25.mlp.down_proj.weight",
265
+ "decoderLayer.25.ffnLayerNorm.weight": "model.layers.25.ln2.weight",
266
+ "decoderLayer.25.feedForward.intermediateDense2.weight": "model.layers.25.mlp.up_proj.weight",
267
+ "decoderLayer.26.multiHeadAttention.q.weight": "model.layers.26.self_attn.q_proj.weight",
268
+ "decoderLayer.26.multiHeadAttention.k.weight": "model.layers.26.self_attn.k_proj.weight",
269
+ "decoderLayer.26.multiHeadAttention.v.weight": "model.layers.26.self_attn.v_proj.weight",
270
+ "decoderLayer.26.multiHeadAttention.o.weight": "model.layers.26.self_attn.o_proj.weight",
271
+ "decoderLayer.26.attnLayerNorm.weight": "model.layers.26.ln1.weight",
272
+ "decoderLayer.26.feedForward.intermediateDense.weight": "model.layers.26.mlp.gate_proj.weight",
273
+ "decoderLayer.26.feedForward.outputDense.weight": "model.layers.26.mlp.down_proj.weight",
274
+ "decoderLayer.26.ffnLayerNorm.weight": "model.layers.26.ln2.weight",
275
+ "decoderLayer.26.feedForward.intermediateDense2.weight": "model.layers.26.mlp.up_proj.weight",
276
+ "decoderLayer.27.multiHeadAttention.q.weight": "model.layers.27.self_attn.q_proj.weight",
277
+ "decoderLayer.27.multiHeadAttention.k.weight": "model.layers.27.self_attn.k_proj.weight",
278
+ "decoderLayer.27.multiHeadAttention.v.weight": "model.layers.27.self_attn.v_proj.weight",
279
+ "decoderLayer.27.multiHeadAttention.o.weight": "model.layers.27.self_attn.o_proj.weight",
280
+ "decoderLayer.27.attnLayerNorm.weight": "model.layers.27.ln1.weight",
281
+ "decoderLayer.27.feedForward.intermediateDense.weight": "model.layers.27.mlp.gate_proj.weight",
282
+ "decoderLayer.27.feedForward.outputDense.weight": "model.layers.27.mlp.down_proj.weight",
283
+ "decoderLayer.27.ffnLayerNorm.weight": "model.layers.27.ln2.weight",
284
+ "decoderLayer.27.feedForward.intermediateDense2.weight": "model.layers.27.mlp.up_proj.weight",
285
+ "decoderLayer.28.multiHeadAttention.q.weight": "model.layers.28.self_attn.q_proj.weight",
286
+ "decoderLayer.28.multiHeadAttention.k.weight": "model.layers.28.self_attn.k_proj.weight",
287
+ "decoderLayer.28.multiHeadAttention.v.weight": "model.layers.28.self_attn.v_proj.weight",
288
+ "decoderLayer.28.multiHeadAttention.o.weight": "model.layers.28.self_attn.o_proj.weight",
289
+ "decoderLayer.28.attnLayerNorm.weight": "model.layers.28.ln1.weight",
290
+ "decoderLayer.28.feedForward.intermediateDense.weight": "model.layers.28.mlp.gate_proj.weight",
291
+ "decoderLayer.28.feedForward.outputDense.weight": "model.layers.28.mlp.down_proj.weight",
292
+ "decoderLayer.28.ffnLayerNorm.weight": "model.layers.28.ln2.weight",
293
+ "decoderLayer.28.feedForward.intermediateDense2.weight": "model.layers.28.mlp.up_proj.weight",
294
+ "decoderLayer.29.multiHeadAttention.q.weight": "model.layers.29.self_attn.q_proj.weight",
295
+ "decoderLayer.29.multiHeadAttention.k.weight": "model.layers.29.self_attn.k_proj.weight",
296
+ "decoderLayer.29.multiHeadAttention.v.weight": "model.layers.29.self_attn.v_proj.weight",
297
+ "decoderLayer.29.multiHeadAttention.o.weight": "model.layers.29.self_attn.o_proj.weight",
298
+ "decoderLayer.29.attnLayerNorm.weight": "model.layers.29.ln1.weight",
299
+ "decoderLayer.29.feedForward.intermediateDense.weight": "model.layers.29.mlp.gate_proj.weight",
300
+ "decoderLayer.29.feedForward.outputDense.weight": "model.layers.29.mlp.down_proj.weight",
301
+ "decoderLayer.29.ffnLayerNorm.weight": "model.layers.29.ln2.weight",
302
+ "decoderLayer.29.feedForward.intermediateDense2.weight": "model.layers.29.mlp.up_proj.weight",
303
+ "decoderLayer.30.multiHeadAttention.q.weight": "model.layers.30.self_attn.q_proj.weight",
304
+ "decoderLayer.30.multiHeadAttention.k.weight": "model.layers.30.self_attn.k_proj.weight",
305
+ "decoderLayer.30.multiHeadAttention.v.weight": "model.layers.30.self_attn.v_proj.weight",
306
+ "decoderLayer.30.multiHeadAttention.o.weight": "model.layers.30.self_attn.o_proj.weight",
307
+ "decoderLayer.30.attnLayerNorm.weight": "model.layers.30.ln1.weight",
308
+ "decoderLayer.30.feedForward.intermediateDense.weight": "model.layers.30.mlp.gate_proj.weight",
309
+ "decoderLayer.30.feedForward.outputDense.weight": "model.layers.30.mlp.down_proj.weight",
310
+ "decoderLayer.30.ffnLayerNorm.weight": "model.layers.30.ln2.weight",
311
+ "decoderLayer.30.feedForward.intermediateDense2.weight": "model.layers.30.mlp.up_proj.weight",
312
+ "decoderLayer.31.multiHeadAttention.q.weight": "model.layers.31.self_attn.q_proj.weight",
313
+ "decoderLayer.31.multiHeadAttention.k.weight": "model.layers.31.self_attn.k_proj.weight",
314
+ "decoderLayer.31.multiHeadAttention.v.weight": "model.layers.31.self_attn.v_proj.weight",
315
+ "decoderLayer.31.multiHeadAttention.o.weight": "model.layers.31.self_attn.o_proj.weight",
316
+ "decoderLayer.31.attnLayerNorm.weight": "model.layers.31.ln1.weight",
317
+ "decoderLayer.31.feedForward.intermediateDense.weight": "model.layers.31.mlp.gate_proj.weight",
318
+ "decoderLayer.31.feedForward.outputDense.weight": "model.layers.31.mlp.down_proj.weight",
319
+ "decoderLayer.31.ffnLayerNorm.weight": "model.layers.31.ln2.weight",
320
+ "decoderLayer.31.feedForward.intermediateDense2.weight": "model.layers.31.mlp.up_proj.weight",
321
+ "decoderLayer.32.multiHeadAttention.q.weight": "model.layers.32.self_attn.q_proj.weight",
322
+ "decoderLayer.32.multiHeadAttention.k.weight": "model.layers.32.self_attn.k_proj.weight",
323
+ "decoderLayer.32.multiHeadAttention.v.weight": "model.layers.32.self_attn.v_proj.weight",
324
+ "decoderLayer.32.multiHeadAttention.o.weight": "model.layers.32.self_attn.o_proj.weight",
325
+ "decoderLayer.32.attnLayerNorm.weight": "model.layers.32.ln1.weight",
326
+ "decoderLayer.32.feedForward.intermediateDense.weight": "model.layers.32.mlp.gate_proj.weight",
327
+ "decoderLayer.32.feedForward.outputDense.weight": "model.layers.32.mlp.down_proj.weight",
328
+ "decoderLayer.32.ffnLayerNorm.weight": "model.layers.32.ln2.weight",
329
+ "decoderLayer.32.feedForward.intermediateDense2.weight": "model.layers.32.mlp.up_proj.weight",
330
+ "decoderLayer.33.multiHeadAttention.q.weight": "model.layers.33.self_attn.q_proj.weight",
331
+ "decoderLayer.33.multiHeadAttention.k.weight": "model.layers.33.self_attn.k_proj.weight",
332
+ "decoderLayer.33.multiHeadAttention.v.weight": "model.layers.33.self_attn.v_proj.weight",
333
+ "decoderLayer.33.multiHeadAttention.o.weight": "model.layers.33.self_attn.o_proj.weight",
334
+ "decoderLayer.33.attnLayerNorm.weight": "model.layers.33.ln1.weight",
335
+ "decoderLayer.33.feedForward.intermediateDense.weight": "model.layers.33.mlp.gate_proj.weight",
336
+ "decoderLayer.33.feedForward.outputDense.weight": "model.layers.33.mlp.down_proj.weight",
337
+ "decoderLayer.33.ffnLayerNorm.weight": "model.layers.33.ln2.weight",
338
+ "decoderLayer.33.feedForward.intermediateDense2.weight": "model.layers.33.mlp.up_proj.weight",
339
+ "decoderLayer.34.multiHeadAttention.q.weight": "model.layers.34.self_attn.q_proj.weight",
340
+ "decoderLayer.34.multiHeadAttention.k.weight": "model.layers.34.self_attn.k_proj.weight",
341
+ "decoderLayer.34.multiHeadAttention.v.weight": "model.layers.34.self_attn.v_proj.weight",
342
+ "decoderLayer.34.multiHeadAttention.o.weight": "model.layers.34.self_attn.o_proj.weight",
343
+ "decoderLayer.34.attnLayerNorm.weight": "model.layers.34.ln1.weight",
344
+ "decoderLayer.34.feedForward.intermediateDense.weight": "model.layers.34.mlp.gate_proj.weight",
345
+ "decoderLayer.34.feedForward.outputDense.weight": "model.layers.34.mlp.down_proj.weight",
346
+ "decoderLayer.34.ffnLayerNorm.weight": "model.layers.34.ln2.weight",
347
+ "decoderLayer.34.feedForward.intermediateDense2.weight": "model.layers.34.mlp.up_proj.weight",
348
+ "decoderLayer.35.multiHeadAttention.q.weight": "model.layers.35.self_attn.q_proj.weight",
349
+ "decoderLayer.35.multiHeadAttention.k.weight": "model.layers.35.self_attn.k_proj.weight",
350
+ "decoderLayer.35.multiHeadAttention.v.weight": "model.layers.35.self_attn.v_proj.weight",
351
+ "decoderLayer.35.multiHeadAttention.o.weight": "model.layers.35.self_attn.o_proj.weight",
352
+ "decoderLayer.35.attnLayerNorm.weight": "model.layers.35.ln1.weight",
353
+ "decoderLayer.35.feedForward.intermediateDense.weight": "model.layers.35.mlp.gate_proj.weight",
354
+ "decoderLayer.35.feedForward.outputDense.weight": "model.layers.35.mlp.down_proj.weight",
355
+ "decoderLayer.35.ffnLayerNorm.weight": "model.layers.35.ln2.weight",
356
+ "decoderLayer.35.feedForward.intermediateDense2.weight": "model.layers.35.mlp.up_proj.weight",
357
+ "decoderLayer.36.multiHeadAttention.q.weight": "model.layers.36.self_attn.q_proj.weight",
358
+ "decoderLayer.36.multiHeadAttention.k.weight": "model.layers.36.self_attn.k_proj.weight",
359
+ "decoderLayer.36.multiHeadAttention.v.weight": "model.layers.36.self_attn.v_proj.weight",
360
+ "decoderLayer.36.multiHeadAttention.o.weight": "model.layers.36.self_attn.o_proj.weight",
361
+ "decoderLayer.36.attnLayerNorm.weight": "model.layers.36.ln1.weight",
362
+ "decoderLayer.36.feedForward.intermediateDense.weight": "model.layers.36.mlp.gate_proj.weight",
363
+ "decoderLayer.36.feedForward.outputDense.weight": "model.layers.36.mlp.down_proj.weight",
364
+ "decoderLayer.36.ffnLayerNorm.weight": "model.layers.36.ln2.weight",
365
+ "decoderLayer.36.feedForward.intermediateDense2.weight": "model.layers.36.mlp.up_proj.weight",
366
+ "decoderLayer.37.multiHeadAttention.q.weight": "model.layers.37.self_attn.q_proj.weight",
367
+ "decoderLayer.37.multiHeadAttention.k.weight": "model.layers.37.self_attn.k_proj.weight",
368
+ "decoderLayer.37.multiHeadAttention.v.weight": "model.layers.37.self_attn.v_proj.weight",
369
+ "decoderLayer.37.multiHeadAttention.o.weight": "model.layers.37.self_attn.o_proj.weight",
370
+ "decoderLayer.37.attnLayerNorm.weight": "model.layers.37.ln1.weight",
371
+ "decoderLayer.37.feedForward.intermediateDense.weight": "model.layers.37.mlp.gate_proj.weight",
372
+ "decoderLayer.37.feedForward.outputDense.weight": "model.layers.37.mlp.down_proj.weight",
373
+ "decoderLayer.37.ffnLayerNorm.weight": "model.layers.37.ln2.weight",
374
+ "decoderLayer.37.feedForward.intermediateDense2.weight": "model.layers.37.mlp.up_proj.weight",
375
+ "decoderLayer.38.multiHeadAttention.q.weight": "model.layers.38.self_attn.q_proj.weight",
376
+ "decoderLayer.38.multiHeadAttention.k.weight": "model.layers.38.self_attn.k_proj.weight",
377
+ "decoderLayer.38.multiHeadAttention.v.weight": "model.layers.38.self_attn.v_proj.weight",
378
+ "decoderLayer.38.multiHeadAttention.o.weight": "model.layers.38.self_attn.o_proj.weight",
379
+ "decoderLayer.38.attnLayerNorm.weight": "model.layers.38.ln1.weight",
380
+ "decoderLayer.38.feedForward.intermediateDense.weight": "model.layers.38.mlp.gate_proj.weight",
381
+ "decoderLayer.38.feedForward.outputDense.weight": "model.layers.38.mlp.down_proj.weight",
382
+ "decoderLayer.38.ffnLayerNorm.weight": "model.layers.38.ln2.weight",
383
+ "decoderLayer.38.feedForward.intermediateDense2.weight": "model.layers.38.mlp.up_proj.weight",
384
+ "decoderLayer.39.multiHeadAttention.q.weight": "model.layers.39.self_attn.q_proj.weight",
385
+ "decoderLayer.39.multiHeadAttention.k.weight": "model.layers.39.self_attn.k_proj.weight",
386
+ "decoderLayer.39.multiHeadAttention.v.weight": "model.layers.39.self_attn.v_proj.weight",
387
+ "decoderLayer.39.multiHeadAttention.o.weight": "model.layers.39.self_attn.o_proj.weight",
388
+ "decoderLayer.39.attnLayerNorm.weight": "model.layers.39.ln1.weight",
389
+ "decoderLayer.39.feedForward.intermediateDense.weight": "model.layers.39.mlp.gate_proj.weight",
390
+ "decoderLayer.39.feedForward.outputDense.weight": "model.layers.39.mlp.down_proj.weight",
391
+ "decoderLayer.39.ffnLayerNorm.weight": "model.layers.39.ln2.weight",
392
+ "decoderLayer.39.feedForward.intermediateDense2.weight": "model.layers.39.mlp.up_proj.weight",
393
+ "decoderLayer.40.multiHeadAttention.q.weight": "model.layers.40.self_attn.q_proj.weight",
394
+ "decoderLayer.40.multiHeadAttention.k.weight": "model.layers.40.self_attn.k_proj.weight",
395
+ "decoderLayer.40.multiHeadAttention.v.weight": "model.layers.40.self_attn.v_proj.weight",
396
+ "decoderLayer.40.multiHeadAttention.o.weight": "model.layers.40.self_attn.o_proj.weight",
397
+ "decoderLayer.40.attnLayerNorm.weight": "model.layers.40.ln1.weight",
398
+ "decoderLayer.40.feedForward.intermediateDense.weight": "model.layers.40.mlp.gate_proj.weight",
399
+ "decoderLayer.40.feedForward.outputDense.weight": "model.layers.40.mlp.down_proj.weight",
400
+ "decoderLayer.40.ffnLayerNorm.weight": "model.layers.40.ln2.weight",
401
+ "decoderLayer.40.feedForward.intermediateDense2.weight": "model.layers.40.mlp.up_proj.weight",
402
+ "decoderLayer.41.multiHeadAttention.q.weight": "model.layers.41.self_attn.q_proj.weight",
403
+ "decoderLayer.41.multiHeadAttention.k.weight": "model.layers.41.self_attn.k_proj.weight",
404
+ "decoderLayer.41.multiHeadAttention.v.weight": "model.layers.41.self_attn.v_proj.weight",
405
+ "decoderLayer.41.multiHeadAttention.o.weight": "model.layers.41.self_attn.o_proj.weight",
406
+ "decoderLayer.41.attnLayerNorm.weight": "model.layers.41.ln1.weight",
407
+ "decoderLayer.41.feedForward.intermediateDense.weight": "model.layers.41.mlp.gate_proj.weight",
408
+ "decoderLayer.41.feedForward.outputDense.weight": "model.layers.41.mlp.down_proj.weight",
409
+ "decoderLayer.41.ffnLayerNorm.weight": "model.layers.41.ln2.weight",
410
+ "decoderLayer.41.feedForward.intermediateDense2.weight": "model.layers.41.mlp.up_proj.weight",
411
+ "decoderLayer.42.multiHeadAttention.q.weight": "model.layers.42.self_attn.q_proj.weight",
412
+ "decoderLayer.42.multiHeadAttention.k.weight": "model.layers.42.self_attn.k_proj.weight",
413
+ "decoderLayer.42.multiHeadAttention.v.weight": "model.layers.42.self_attn.v_proj.weight",
414
+ "decoderLayer.42.multiHeadAttention.o.weight": "model.layers.42.self_attn.o_proj.weight",
415
+ "decoderLayer.42.attnLayerNorm.weight": "model.layers.42.ln1.weight",
416
+ "decoderLayer.42.feedForward.intermediateDense.weight": "model.layers.42.mlp.gate_proj.weight",
417
+ "decoderLayer.42.feedForward.outputDense.weight": "model.layers.42.mlp.down_proj.weight",
418
+ "decoderLayer.42.ffnLayerNorm.weight": "model.layers.42.ln2.weight",
419
+ "decoderLayer.42.feedForward.intermediateDense2.weight": "model.layers.42.mlp.up_proj.weight",
420
+ "decoderLayer.43.multiHeadAttention.q.weight": "model.layers.43.self_attn.q_proj.weight",
421
+ "decoderLayer.43.multiHeadAttention.k.weight": "model.layers.43.self_attn.k_proj.weight",
422
+ "decoderLayer.43.multiHeadAttention.v.weight": "model.layers.43.self_attn.v_proj.weight",
423
+ "decoderLayer.43.multiHeadAttention.o.weight": "model.layers.43.self_attn.o_proj.weight",
424
+ "decoderLayer.43.attnLayerNorm.weight": "model.layers.43.ln1.weight",
425
+ "decoderLayer.43.feedForward.intermediateDense.weight": "model.layers.43.mlp.gate_proj.weight",
426
+ "decoderLayer.43.feedForward.outputDense.weight": "model.layers.43.mlp.down_proj.weight",
427
+ "decoderLayer.43.ffnLayerNorm.weight": "model.layers.43.ln2.weight",
428
+ "decoderLayer.43.feedForward.intermediateDense2.weight": "model.layers.43.mlp.up_proj.weight",
429
+ "decoderLayer.44.multiHeadAttention.q.weight": "model.layers.44.self_attn.q_proj.weight",
430
+ "decoderLayer.44.multiHeadAttention.k.weight": "model.layers.44.self_attn.k_proj.weight",
431
+ "decoderLayer.44.multiHeadAttention.v.weight": "model.layers.44.self_attn.v_proj.weight",
432
+ "decoderLayer.44.multiHeadAttention.o.weight": "model.layers.44.self_attn.o_proj.weight",
433
+ "decoderLayer.44.attnLayerNorm.weight": "model.layers.44.ln1.weight",
434
+ "decoderLayer.44.feedForward.intermediateDense.weight": "model.layers.44.mlp.gate_proj.weight",
435
+ "decoderLayer.44.feedForward.outputDense.weight": "model.layers.44.mlp.down_proj.weight",
436
+ "decoderLayer.44.ffnLayerNorm.weight": "model.layers.44.ln2.weight",
437
+ "decoderLayer.44.feedForward.intermediateDense2.weight": "model.layers.44.mlp.up_proj.weight",
438
+ "decoderLayer.45.multiHeadAttention.q.weight": "model.layers.45.self_attn.q_proj.weight",
439
+ "decoderLayer.45.multiHeadAttention.k.weight": "model.layers.45.self_attn.k_proj.weight",
440
+ "decoderLayer.45.multiHeadAttention.v.weight": "model.layers.45.self_attn.v_proj.weight",
441
+ "decoderLayer.45.multiHeadAttention.o.weight": "model.layers.45.self_attn.o_proj.weight",
442
+ "decoderLayer.45.attnLayerNorm.weight": "model.layers.45.ln1.weight",
443
+ "decoderLayer.45.feedForward.intermediateDense.weight": "model.layers.45.mlp.gate_proj.weight",
444
+ "decoderLayer.45.feedForward.outputDense.weight": "model.layers.45.mlp.down_proj.weight",
445
+ "decoderLayer.45.ffnLayerNorm.weight": "model.layers.45.ln2.weight",
446
+ "decoderLayer.45.feedForward.intermediateDense2.weight": "model.layers.45.mlp.up_proj.weight",
447
+ "decoderLayer.46.multiHeadAttention.q.weight": "model.layers.46.self_attn.q_proj.weight",
448
+ "decoderLayer.46.multiHeadAttention.k.weight": "model.layers.46.self_attn.k_proj.weight",
449
+ "decoderLayer.46.multiHeadAttention.v.weight": "model.layers.46.self_attn.v_proj.weight",
450
+ "decoderLayer.46.multiHeadAttention.o.weight": "model.layers.46.self_attn.o_proj.weight",
451
+ "decoderLayer.46.attnLayerNorm.weight": "model.layers.46.ln1.weight",
452
+ "decoderLayer.46.feedForward.intermediateDense.weight": "model.layers.46.mlp.gate_proj.weight",
453
+ "decoderLayer.46.feedForward.outputDense.weight": "model.layers.46.mlp.down_proj.weight",
454
+ "decoderLayer.46.ffnLayerNorm.weight": "model.layers.46.ln2.weight",
455
+ "decoderLayer.46.feedForward.intermediateDense2.weight": "model.layers.46.mlp.up_proj.weight",
456
+ "decoderLayer.47.multiHeadAttention.q.weight": "model.layers.47.self_attn.q_proj.weight",
457
+ "decoderLayer.47.multiHeadAttention.k.weight": "model.layers.47.self_attn.k_proj.weight",
458
+ "decoderLayer.47.multiHeadAttention.v.weight": "model.layers.47.self_attn.v_proj.weight",
459
+ "decoderLayer.47.multiHeadAttention.o.weight": "model.layers.47.self_attn.o_proj.weight",
460
+ "decoderLayer.47.attnLayerNorm.weight": "model.layers.47.ln1.weight",
461
+ "decoderLayer.47.feedForward.intermediateDense.weight": "model.layers.47.mlp.gate_proj.weight",
462
+ "decoderLayer.47.feedForward.outputDense.weight": "model.layers.47.mlp.down_proj.weight",
463
+ "decoderLayer.47.ffnLayerNorm.weight": "model.layers.47.ln2.weight",
464
+ "decoderLayer.47.feedForward.intermediateDense2.weight": "model.layers.47.mlp.up_proj.weight"
465
+ }
466
+ }
BAAI/bge-base-en-v1.5/bert4torch_config.json CHANGED
@@ -1,223 +1,225 @@
1
  {
2
- "attention_probs_dropout_prob": 0.1,
3
- "hidden_act": "gelu",
4
- "hidden_dropout_prob": 0.1,
5
- "hidden_size": 768,
6
- "initializer_range": 0.02,
7
- "intermediate_size": 3072,
8
- "layer_norm_eps": 1e-12,
9
- "max_position_embeddings": 512,
10
- "model": "bert",
11
- "num_attention_heads": 12,
12
- "num_hidden_layers": 12,
13
- "bos_token_id": 0,
14
- "eos_token_id": 2,
15
- "pad_token_id": 0,
16
- "torch_dtype": "float32",
17
- "type_vocab_size": 2,
18
- "vocab_size": 30522,
19
- "with_pool": true,
20
- "pooling": {"pool_strategy": "cls"},
21
- "norm_mode": "torch_buildin",
22
- "mapping": {
23
- "embeddings.word_embeddings.weight": "embeddings.word_embeddings.weight",
24
- "embeddings.position_embeddings.weight": "embeddings.position_embeddings.weight",
25
- "embeddings.segment_embeddings.weight": "embeddings.token_type_embeddings.weight",
26
- "embeddings.layerNorm.weight": "embeddings.LayerNorm.weight",
27
- "embeddings.layerNorm.bias": "embeddings.LayerNorm.bias",
28
- "pooler.weight": "pooler.dense.weight",
29
- "pooler.bias": "pooler.dense.bias",
30
- "encoderLayer.0.multiHeadAttention.q.weight": "encoder.layer.0.attention.self.query.weight",
31
- "encoderLayer.0.multiHeadAttention.q.bias": "encoder.layer.0.attention.self.query.bias",
32
- "encoderLayer.0.multiHeadAttention.k.weight": "encoder.layer.0.attention.self.key.weight",
33
- "encoderLayer.0.multiHeadAttention.k.bias": "encoder.layer.0.attention.self.key.bias",
34
- "encoderLayer.0.multiHeadAttention.v.weight": "encoder.layer.0.attention.self.value.weight",
35
- "encoderLayer.0.multiHeadAttention.v.bias": "encoder.layer.0.attention.self.value.bias",
36
- "encoderLayer.0.multiHeadAttention.o.weight": "encoder.layer.0.attention.output.dense.weight",
37
- "encoderLayer.0.multiHeadAttention.o.bias": "encoder.layer.0.attention.output.dense.bias",
38
- "encoderLayer.0.attnLayerNorm.weight": "encoder.layer.0.attention.output.LayerNorm.weight",
39
- "encoderLayer.0.attnLayerNorm.bias": "encoder.layer.0.attention.output.LayerNorm.bias",
40
- "encoderLayer.0.feedForward.intermediateDense.weight": "encoder.layer.0.intermediate.dense.weight",
41
- "encoderLayer.0.feedForward.intermediateDense.bias": "encoder.layer.0.intermediate.dense.bias",
42
- "encoderLayer.0.feedForward.outputDense.weight": "encoder.layer.0.output.dense.weight",
43
- "encoderLayer.0.feedForward.outputDense.bias": "encoder.layer.0.output.dense.bias",
44
- "encoderLayer.0.ffnLayerNorm.weight": "encoder.layer.0.output.LayerNorm.weight",
45
- "encoderLayer.0.ffnLayerNorm.bias": "encoder.layer.0.output.LayerNorm.bias",
46
- "encoderLayer.1.multiHeadAttention.q.weight": "encoder.layer.1.attention.self.query.weight",
47
- "encoderLayer.1.multiHeadAttention.q.bias": "encoder.layer.1.attention.self.query.bias",
48
- "encoderLayer.1.multiHeadAttention.k.weight": "encoder.layer.1.attention.self.key.weight",
49
- "encoderLayer.1.multiHeadAttention.k.bias": "encoder.layer.1.attention.self.key.bias",
50
- "encoderLayer.1.multiHeadAttention.v.weight": "encoder.layer.1.attention.self.value.weight",
51
- "encoderLayer.1.multiHeadAttention.v.bias": "encoder.layer.1.attention.self.value.bias",
52
- "encoderLayer.1.multiHeadAttention.o.weight": "encoder.layer.1.attention.output.dense.weight",
53
- "encoderLayer.1.multiHeadAttention.o.bias": "encoder.layer.1.attention.output.dense.bias",
54
- "encoderLayer.1.attnLayerNorm.weight": "encoder.layer.1.attention.output.LayerNorm.weight",
55
- "encoderLayer.1.attnLayerNorm.bias": "encoder.layer.1.attention.output.LayerNorm.bias",
56
- "encoderLayer.1.feedForward.intermediateDense.weight": "encoder.layer.1.intermediate.dense.weight",
57
- "encoderLayer.1.feedForward.intermediateDense.bias": "encoder.layer.1.intermediate.dense.bias",
58
- "encoderLayer.1.feedForward.outputDense.weight": "encoder.layer.1.output.dense.weight",
59
- "encoderLayer.1.feedForward.outputDense.bias": "encoder.layer.1.output.dense.bias",
60
- "encoderLayer.1.ffnLayerNorm.weight": "encoder.layer.1.output.LayerNorm.weight",
61
- "encoderLayer.1.ffnLayerNorm.bias": "encoder.layer.1.output.LayerNorm.bias",
62
- "encoderLayer.2.multiHeadAttention.q.weight": "encoder.layer.2.attention.self.query.weight",
63
- "encoderLayer.2.multiHeadAttention.q.bias": "encoder.layer.2.attention.self.query.bias",
64
- "encoderLayer.2.multiHeadAttention.k.weight": "encoder.layer.2.attention.self.key.weight",
65
- "encoderLayer.2.multiHeadAttention.k.bias": "encoder.layer.2.attention.self.key.bias",
66
- "encoderLayer.2.multiHeadAttention.v.weight": "encoder.layer.2.attention.self.value.weight",
67
- "encoderLayer.2.multiHeadAttention.v.bias": "encoder.layer.2.attention.self.value.bias",
68
- "encoderLayer.2.multiHeadAttention.o.weight": "encoder.layer.2.attention.output.dense.weight",
69
- "encoderLayer.2.multiHeadAttention.o.bias": "encoder.layer.2.attention.output.dense.bias",
70
- "encoderLayer.2.attnLayerNorm.weight": "encoder.layer.2.attention.output.LayerNorm.weight",
71
- "encoderLayer.2.attnLayerNorm.bias": "encoder.layer.2.attention.output.LayerNorm.bias",
72
- "encoderLayer.2.feedForward.intermediateDense.weight": "encoder.layer.2.intermediate.dense.weight",
73
- "encoderLayer.2.feedForward.intermediateDense.bias": "encoder.layer.2.intermediate.dense.bias",
74
- "encoderLayer.2.feedForward.outputDense.weight": "encoder.layer.2.output.dense.weight",
75
- "encoderLayer.2.feedForward.outputDense.bias": "encoder.layer.2.output.dense.bias",
76
- "encoderLayer.2.ffnLayerNorm.weight": "encoder.layer.2.output.LayerNorm.weight",
77
- "encoderLayer.2.ffnLayerNorm.bias": "encoder.layer.2.output.LayerNorm.bias",
78
- "encoderLayer.3.multiHeadAttention.q.weight": "encoder.layer.3.attention.self.query.weight",
79
- "encoderLayer.3.multiHeadAttention.q.bias": "encoder.layer.3.attention.self.query.bias",
80
- "encoderLayer.3.multiHeadAttention.k.weight": "encoder.layer.3.attention.self.key.weight",
81
- "encoderLayer.3.multiHeadAttention.k.bias": "encoder.layer.3.attention.self.key.bias",
82
- "encoderLayer.3.multiHeadAttention.v.weight": "encoder.layer.3.attention.self.value.weight",
83
- "encoderLayer.3.multiHeadAttention.v.bias": "encoder.layer.3.attention.self.value.bias",
84
- "encoderLayer.3.multiHeadAttention.o.weight": "encoder.layer.3.attention.output.dense.weight",
85
- "encoderLayer.3.multiHeadAttention.o.bias": "encoder.layer.3.attention.output.dense.bias",
86
- "encoderLayer.3.attnLayerNorm.weight": "encoder.layer.3.attention.output.LayerNorm.weight",
87
- "encoderLayer.3.attnLayerNorm.bias": "encoder.layer.3.attention.output.LayerNorm.bias",
88
- "encoderLayer.3.feedForward.intermediateDense.weight": "encoder.layer.3.intermediate.dense.weight",
89
- "encoderLayer.3.feedForward.intermediateDense.bias": "encoder.layer.3.intermediate.dense.bias",
90
- "encoderLayer.3.feedForward.outputDense.weight": "encoder.layer.3.output.dense.weight",
91
- "encoderLayer.3.feedForward.outputDense.bias": "encoder.layer.3.output.dense.bias",
92
- "encoderLayer.3.ffnLayerNorm.weight": "encoder.layer.3.output.LayerNorm.weight",
93
- "encoderLayer.3.ffnLayerNorm.bias": "encoder.layer.3.output.LayerNorm.bias",
94
- "encoderLayer.4.multiHeadAttention.q.weight": "encoder.layer.4.attention.self.query.weight",
95
- "encoderLayer.4.multiHeadAttention.q.bias": "encoder.layer.4.attention.self.query.bias",
96
- "encoderLayer.4.multiHeadAttention.k.weight": "encoder.layer.4.attention.self.key.weight",
97
- "encoderLayer.4.multiHeadAttention.k.bias": "encoder.layer.4.attention.self.key.bias",
98
- "encoderLayer.4.multiHeadAttention.v.weight": "encoder.layer.4.attention.self.value.weight",
99
- "encoderLayer.4.multiHeadAttention.v.bias": "encoder.layer.4.attention.self.value.bias",
100
- "encoderLayer.4.multiHeadAttention.o.weight": "encoder.layer.4.attention.output.dense.weight",
101
- "encoderLayer.4.multiHeadAttention.o.bias": "encoder.layer.4.attention.output.dense.bias",
102
- "encoderLayer.4.attnLayerNorm.weight": "encoder.layer.4.attention.output.LayerNorm.weight",
103
- "encoderLayer.4.attnLayerNorm.bias": "encoder.layer.4.attention.output.LayerNorm.bias",
104
- "encoderLayer.4.feedForward.intermediateDense.weight": "encoder.layer.4.intermediate.dense.weight",
105
- "encoderLayer.4.feedForward.intermediateDense.bias": "encoder.layer.4.intermediate.dense.bias",
106
- "encoderLayer.4.feedForward.outputDense.weight": "encoder.layer.4.output.dense.weight",
107
- "encoderLayer.4.feedForward.outputDense.bias": "encoder.layer.4.output.dense.bias",
108
- "encoderLayer.4.ffnLayerNorm.weight": "encoder.layer.4.output.LayerNorm.weight",
109
- "encoderLayer.4.ffnLayerNorm.bias": "encoder.layer.4.output.LayerNorm.bias",
110
- "encoderLayer.5.multiHeadAttention.q.weight": "encoder.layer.5.attention.self.query.weight",
111
- "encoderLayer.5.multiHeadAttention.q.bias": "encoder.layer.5.attention.self.query.bias",
112
- "encoderLayer.5.multiHeadAttention.k.weight": "encoder.layer.5.attention.self.key.weight",
113
- "encoderLayer.5.multiHeadAttention.k.bias": "encoder.layer.5.attention.self.key.bias",
114
- "encoderLayer.5.multiHeadAttention.v.weight": "encoder.layer.5.attention.self.value.weight",
115
- "encoderLayer.5.multiHeadAttention.v.bias": "encoder.layer.5.attention.self.value.bias",
116
- "encoderLayer.5.multiHeadAttention.o.weight": "encoder.layer.5.attention.output.dense.weight",
117
- "encoderLayer.5.multiHeadAttention.o.bias": "encoder.layer.5.attention.output.dense.bias",
118
- "encoderLayer.5.attnLayerNorm.weight": "encoder.layer.5.attention.output.LayerNorm.weight",
119
- "encoderLayer.5.attnLayerNorm.bias": "encoder.layer.5.attention.output.LayerNorm.bias",
120
- "encoderLayer.5.feedForward.intermediateDense.weight": "encoder.layer.5.intermediate.dense.weight",
121
- "encoderLayer.5.feedForward.intermediateDense.bias": "encoder.layer.5.intermediate.dense.bias",
122
- "encoderLayer.5.feedForward.outputDense.weight": "encoder.layer.5.output.dense.weight",
123
- "encoderLayer.5.feedForward.outputDense.bias": "encoder.layer.5.output.dense.bias",
124
- "encoderLayer.5.ffnLayerNorm.weight": "encoder.layer.5.output.LayerNorm.weight",
125
- "encoderLayer.5.ffnLayerNorm.bias": "encoder.layer.5.output.LayerNorm.bias",
126
- "encoderLayer.6.multiHeadAttention.q.weight": "encoder.layer.6.attention.self.query.weight",
127
- "encoderLayer.6.multiHeadAttention.q.bias": "encoder.layer.6.attention.self.query.bias",
128
- "encoderLayer.6.multiHeadAttention.k.weight": "encoder.layer.6.attention.self.key.weight",
129
- "encoderLayer.6.multiHeadAttention.k.bias": "encoder.layer.6.attention.self.key.bias",
130
- "encoderLayer.6.multiHeadAttention.v.weight": "encoder.layer.6.attention.self.value.weight",
131
- "encoderLayer.6.multiHeadAttention.v.bias": "encoder.layer.6.attention.self.value.bias",
132
- "encoderLayer.6.multiHeadAttention.o.weight": "encoder.layer.6.attention.output.dense.weight",
133
- "encoderLayer.6.multiHeadAttention.o.bias": "encoder.layer.6.attention.output.dense.bias",
134
- "encoderLayer.6.attnLayerNorm.weight": "encoder.layer.6.attention.output.LayerNorm.weight",
135
- "encoderLayer.6.attnLayerNorm.bias": "encoder.layer.6.attention.output.LayerNorm.bias",
136
- "encoderLayer.6.feedForward.intermediateDense.weight": "encoder.layer.6.intermediate.dense.weight",
137
- "encoderLayer.6.feedForward.intermediateDense.bias": "encoder.layer.6.intermediate.dense.bias",
138
- "encoderLayer.6.feedForward.outputDense.weight": "encoder.layer.6.output.dense.weight",
139
- "encoderLayer.6.feedForward.outputDense.bias": "encoder.layer.6.output.dense.bias",
140
- "encoderLayer.6.ffnLayerNorm.weight": "encoder.layer.6.output.LayerNorm.weight",
141
- "encoderLayer.6.ffnLayerNorm.bias": "encoder.layer.6.output.LayerNorm.bias",
142
- "encoderLayer.7.multiHeadAttention.q.weight": "encoder.layer.7.attention.self.query.weight",
143
- "encoderLayer.7.multiHeadAttention.q.bias": "encoder.layer.7.attention.self.query.bias",
144
- "encoderLayer.7.multiHeadAttention.k.weight": "encoder.layer.7.attention.self.key.weight",
145
- "encoderLayer.7.multiHeadAttention.k.bias": "encoder.layer.7.attention.self.key.bias",
146
- "encoderLayer.7.multiHeadAttention.v.weight": "encoder.layer.7.attention.self.value.weight",
147
- "encoderLayer.7.multiHeadAttention.v.bias": "encoder.layer.7.attention.self.value.bias",
148
- "encoderLayer.7.multiHeadAttention.o.weight": "encoder.layer.7.attention.output.dense.weight",
149
- "encoderLayer.7.multiHeadAttention.o.bias": "encoder.layer.7.attention.output.dense.bias",
150
- "encoderLayer.7.attnLayerNorm.weight": "encoder.layer.7.attention.output.LayerNorm.weight",
151
- "encoderLayer.7.attnLayerNorm.bias": "encoder.layer.7.attention.output.LayerNorm.bias",
152
- "encoderLayer.7.feedForward.intermediateDense.weight": "encoder.layer.7.intermediate.dense.weight",
153
- "encoderLayer.7.feedForward.intermediateDense.bias": "encoder.layer.7.intermediate.dense.bias",
154
- "encoderLayer.7.feedForward.outputDense.weight": "encoder.layer.7.output.dense.weight",
155
- "encoderLayer.7.feedForward.outputDense.bias": "encoder.layer.7.output.dense.bias",
156
- "encoderLayer.7.ffnLayerNorm.weight": "encoder.layer.7.output.LayerNorm.weight",
157
- "encoderLayer.7.ffnLayerNorm.bias": "encoder.layer.7.output.LayerNorm.bias",
158
- "encoderLayer.8.multiHeadAttention.q.weight": "encoder.layer.8.attention.self.query.weight",
159
- "encoderLayer.8.multiHeadAttention.q.bias": "encoder.layer.8.attention.self.query.bias",
160
- "encoderLayer.8.multiHeadAttention.k.weight": "encoder.layer.8.attention.self.key.weight",
161
- "encoderLayer.8.multiHeadAttention.k.bias": "encoder.layer.8.attention.self.key.bias",
162
- "encoderLayer.8.multiHeadAttention.v.weight": "encoder.layer.8.attention.self.value.weight",
163
- "encoderLayer.8.multiHeadAttention.v.bias": "encoder.layer.8.attention.self.value.bias",
164
- "encoderLayer.8.multiHeadAttention.o.weight": "encoder.layer.8.attention.output.dense.weight",
165
- "encoderLayer.8.multiHeadAttention.o.bias": "encoder.layer.8.attention.output.dense.bias",
166
- "encoderLayer.8.attnLayerNorm.weight": "encoder.layer.8.attention.output.LayerNorm.weight",
167
- "encoderLayer.8.attnLayerNorm.bias": "encoder.layer.8.attention.output.LayerNorm.bias",
168
- "encoderLayer.8.feedForward.intermediateDense.weight": "encoder.layer.8.intermediate.dense.weight",
169
- "encoderLayer.8.feedForward.intermediateDense.bias": "encoder.layer.8.intermediate.dense.bias",
170
- "encoderLayer.8.feedForward.outputDense.weight": "encoder.layer.8.output.dense.weight",
171
- "encoderLayer.8.feedForward.outputDense.bias": "encoder.layer.8.output.dense.bias",
172
- "encoderLayer.8.ffnLayerNorm.weight": "encoder.layer.8.output.LayerNorm.weight",
173
- "encoderLayer.8.ffnLayerNorm.bias": "encoder.layer.8.output.LayerNorm.bias",
174
- "encoderLayer.9.multiHeadAttention.q.weight": "encoder.layer.9.attention.self.query.weight",
175
- "encoderLayer.9.multiHeadAttention.q.bias": "encoder.layer.9.attention.self.query.bias",
176
- "encoderLayer.9.multiHeadAttention.k.weight": "encoder.layer.9.attention.self.key.weight",
177
- "encoderLayer.9.multiHeadAttention.k.bias": "encoder.layer.9.attention.self.key.bias",
178
- "encoderLayer.9.multiHeadAttention.v.weight": "encoder.layer.9.attention.self.value.weight",
179
- "encoderLayer.9.multiHeadAttention.v.bias": "encoder.layer.9.attention.self.value.bias",
180
- "encoderLayer.9.multiHeadAttention.o.weight": "encoder.layer.9.attention.output.dense.weight",
181
- "encoderLayer.9.multiHeadAttention.o.bias": "encoder.layer.9.attention.output.dense.bias",
182
- "encoderLayer.9.attnLayerNorm.weight": "encoder.layer.9.attention.output.LayerNorm.weight",
183
- "encoderLayer.9.attnLayerNorm.bias": "encoder.layer.9.attention.output.LayerNorm.bias",
184
- "encoderLayer.9.feedForward.intermediateDense.weight": "encoder.layer.9.intermediate.dense.weight",
185
- "encoderLayer.9.feedForward.intermediateDense.bias": "encoder.layer.9.intermediate.dense.bias",
186
- "encoderLayer.9.feedForward.outputDense.weight": "encoder.layer.9.output.dense.weight",
187
- "encoderLayer.9.feedForward.outputDense.bias": "encoder.layer.9.output.dense.bias",
188
- "encoderLayer.9.ffnLayerNorm.weight": "encoder.layer.9.output.LayerNorm.weight",
189
- "encoderLayer.9.ffnLayerNorm.bias": "encoder.layer.9.output.LayerNorm.bias",
190
- "encoderLayer.10.multiHeadAttention.q.weight": "encoder.layer.10.attention.self.query.weight",
191
- "encoderLayer.10.multiHeadAttention.q.bias": "encoder.layer.10.attention.self.query.bias",
192
- "encoderLayer.10.multiHeadAttention.k.weight": "encoder.layer.10.attention.self.key.weight",
193
- "encoderLayer.10.multiHeadAttention.k.bias": "encoder.layer.10.attention.self.key.bias",
194
- "encoderLayer.10.multiHeadAttention.v.weight": "encoder.layer.10.attention.self.value.weight",
195
- "encoderLayer.10.multiHeadAttention.v.bias": "encoder.layer.10.attention.self.value.bias",
196
- "encoderLayer.10.multiHeadAttention.o.weight": "encoder.layer.10.attention.output.dense.weight",
197
- "encoderLayer.10.multiHeadAttention.o.bias": "encoder.layer.10.attention.output.dense.bias",
198
- "encoderLayer.10.attnLayerNorm.weight": "encoder.layer.10.attention.output.LayerNorm.weight",
199
- "encoderLayer.10.attnLayerNorm.bias": "encoder.layer.10.attention.output.LayerNorm.bias",
200
- "encoderLayer.10.feedForward.intermediateDense.weight": "encoder.layer.10.intermediate.dense.weight",
201
- "encoderLayer.10.feedForward.intermediateDense.bias": "encoder.layer.10.intermediate.dense.bias",
202
- "encoderLayer.10.feedForward.outputDense.weight": "encoder.layer.10.output.dense.weight",
203
- "encoderLayer.10.feedForward.outputDense.bias": "encoder.layer.10.output.dense.bias",
204
- "encoderLayer.10.ffnLayerNorm.weight": "encoder.layer.10.output.LayerNorm.weight",
205
- "encoderLayer.10.ffnLayerNorm.bias": "encoder.layer.10.output.LayerNorm.bias",
206
- "encoderLayer.11.multiHeadAttention.q.weight": "encoder.layer.11.attention.self.query.weight",
207
- "encoderLayer.11.multiHeadAttention.q.bias": "encoder.layer.11.attention.self.query.bias",
208
- "encoderLayer.11.multiHeadAttention.k.weight": "encoder.layer.11.attention.self.key.weight",
209
- "encoderLayer.11.multiHeadAttention.k.bias": "encoder.layer.11.attention.self.key.bias",
210
- "encoderLayer.11.multiHeadAttention.v.weight": "encoder.layer.11.attention.self.value.weight",
211
- "encoderLayer.11.multiHeadAttention.v.bias": "encoder.layer.11.attention.self.value.bias",
212
- "encoderLayer.11.multiHeadAttention.o.weight": "encoder.layer.11.attention.output.dense.weight",
213
- "encoderLayer.11.multiHeadAttention.o.bias": "encoder.layer.11.attention.output.dense.bias",
214
- "encoderLayer.11.attnLayerNorm.weight": "encoder.layer.11.attention.output.LayerNorm.weight",
215
- "encoderLayer.11.attnLayerNorm.bias": "encoder.layer.11.attention.output.LayerNorm.bias",
216
- "encoderLayer.11.feedForward.intermediateDense.weight": "encoder.layer.11.intermediate.dense.weight",
217
- "encoderLayer.11.feedForward.intermediateDense.bias": "encoder.layer.11.intermediate.dense.bias",
218
- "encoderLayer.11.feedForward.outputDense.weight": "encoder.layer.11.output.dense.weight",
219
- "encoderLayer.11.feedForward.outputDense.bias": "encoder.layer.11.output.dense.bias",
220
- "encoderLayer.11.ffnLayerNorm.weight": "encoder.layer.11.output.LayerNorm.weight",
221
- "encoderLayer.11.ffnLayerNorm.bias": "encoder.layer.11.output.LayerNorm.bias"
222
- }
223
- }
 
 
 
1
  {
2
+ "attention_probs_dropout_prob": 0.1,
3
+ "hidden_act": "gelu",
4
+ "hidden_dropout_prob": 0.1,
5
+ "hidden_size": 768,
6
+ "initializer_range": 0.02,
7
+ "intermediate_size": 3072,
8
+ "layer_norm_eps": 1e-12,
9
+ "max_position_embeddings": 512,
10
+ "model": "bert",
11
+ "num_attention_heads": 12,
12
+ "num_hidden_layers": 12,
13
+ "bos_token_id": 0,
14
+ "eos_token_id": 2,
15
+ "pad_token_id": 0,
16
+ "torch_dtype": "float32",
17
+ "type_vocab_size": 2,
18
+ "vocab_size": 30522,
19
+ "with_pool": true,
20
+ "pooling": {
21
+ "pool_strategy": "cls"
22
+ },
23
+ "norm_mode": "torch_buildin",
24
+ "mapping": {
25
+ "embeddings.word_embeddings.weight": "embeddings.word_embeddings.weight",
26
+ "embeddings.position_embeddings.weight": "embeddings.position_embeddings.weight",
27
+ "embeddings.segment_embeddings.weight": "embeddings.token_type_embeddings.weight",
28
+ "embeddings.layerNorm.weight": "embeddings.LayerNorm.weight",
29
+ "embeddings.layerNorm.bias": "embeddings.LayerNorm.bias",
30
+ "pooler.weight": "pooler.dense.weight",
31
+ "pooler.bias": "pooler.dense.bias",
32
+ "encoderLayer.0.multiHeadAttention.q.weight": "encoder.layer.0.attention.self.query.weight",
33
+ "encoderLayer.0.multiHeadAttention.q.bias": "encoder.layer.0.attention.self.query.bias",
34
+ "encoderLayer.0.multiHeadAttention.k.weight": "encoder.layer.0.attention.self.key.weight",
35
+ "encoderLayer.0.multiHeadAttention.k.bias": "encoder.layer.0.attention.self.key.bias",
36
+ "encoderLayer.0.multiHeadAttention.v.weight": "encoder.layer.0.attention.self.value.weight",
37
+ "encoderLayer.0.multiHeadAttention.v.bias": "encoder.layer.0.attention.self.value.bias",
38
+ "encoderLayer.0.multiHeadAttention.o.weight": "encoder.layer.0.attention.output.dense.weight",
39
+ "encoderLayer.0.multiHeadAttention.o.bias": "encoder.layer.0.attention.output.dense.bias",
40
+ "encoderLayer.0.attnLayerNorm.weight": "encoder.layer.0.attention.output.LayerNorm.weight",
41
+ "encoderLayer.0.attnLayerNorm.bias": "encoder.layer.0.attention.output.LayerNorm.bias",
42
+ "encoderLayer.0.feedForward.intermediateDense.weight": "encoder.layer.0.intermediate.dense.weight",
43
+ "encoderLayer.0.feedForward.intermediateDense.bias": "encoder.layer.0.intermediate.dense.bias",
44
+ "encoderLayer.0.feedForward.outputDense.weight": "encoder.layer.0.output.dense.weight",
45
+ "encoderLayer.0.feedForward.outputDense.bias": "encoder.layer.0.output.dense.bias",
46
+ "encoderLayer.0.ffnLayerNorm.weight": "encoder.layer.0.output.LayerNorm.weight",
47
+ "encoderLayer.0.ffnLayerNorm.bias": "encoder.layer.0.output.LayerNorm.bias",
48
+ "encoderLayer.1.multiHeadAttention.q.weight": "encoder.layer.1.attention.self.query.weight",
49
+ "encoderLayer.1.multiHeadAttention.q.bias": "encoder.layer.1.attention.self.query.bias",
50
+ "encoderLayer.1.multiHeadAttention.k.weight": "encoder.layer.1.attention.self.key.weight",
51
+ "encoderLayer.1.multiHeadAttention.k.bias": "encoder.layer.1.attention.self.key.bias",
52
+ "encoderLayer.1.multiHeadAttention.v.weight": "encoder.layer.1.attention.self.value.weight",
53
+ "encoderLayer.1.multiHeadAttention.v.bias": "encoder.layer.1.attention.self.value.bias",
54
+ "encoderLayer.1.multiHeadAttention.o.weight": "encoder.layer.1.attention.output.dense.weight",
55
+ "encoderLayer.1.multiHeadAttention.o.bias": "encoder.layer.1.attention.output.dense.bias",
56
+ "encoderLayer.1.attnLayerNorm.weight": "encoder.layer.1.attention.output.LayerNorm.weight",
57
+ "encoderLayer.1.attnLayerNorm.bias": "encoder.layer.1.attention.output.LayerNorm.bias",
58
+ "encoderLayer.1.feedForward.intermediateDense.weight": "encoder.layer.1.intermediate.dense.weight",
59
+ "encoderLayer.1.feedForward.intermediateDense.bias": "encoder.layer.1.intermediate.dense.bias",
60
+ "encoderLayer.1.feedForward.outputDense.weight": "encoder.layer.1.output.dense.weight",
61
+ "encoderLayer.1.feedForward.outputDense.bias": "encoder.layer.1.output.dense.bias",
62
+ "encoderLayer.1.ffnLayerNorm.weight": "encoder.layer.1.output.LayerNorm.weight",
63
+ "encoderLayer.1.ffnLayerNorm.bias": "encoder.layer.1.output.LayerNorm.bias",
64
+ "encoderLayer.2.multiHeadAttention.q.weight": "encoder.layer.2.attention.self.query.weight",
65
+ "encoderLayer.2.multiHeadAttention.q.bias": "encoder.layer.2.attention.self.query.bias",
66
+ "encoderLayer.2.multiHeadAttention.k.weight": "encoder.layer.2.attention.self.key.weight",
67
+ "encoderLayer.2.multiHeadAttention.k.bias": "encoder.layer.2.attention.self.key.bias",
68
+ "encoderLayer.2.multiHeadAttention.v.weight": "encoder.layer.2.attention.self.value.weight",
69
+ "encoderLayer.2.multiHeadAttention.v.bias": "encoder.layer.2.attention.self.value.bias",
70
+ "encoderLayer.2.multiHeadAttention.o.weight": "encoder.layer.2.attention.output.dense.weight",
71
+ "encoderLayer.2.multiHeadAttention.o.bias": "encoder.layer.2.attention.output.dense.bias",
72
+ "encoderLayer.2.attnLayerNorm.weight": "encoder.layer.2.attention.output.LayerNorm.weight",
73
+ "encoderLayer.2.attnLayerNorm.bias": "encoder.layer.2.attention.output.LayerNorm.bias",
74
+ "encoderLayer.2.feedForward.intermediateDense.weight": "encoder.layer.2.intermediate.dense.weight",
75
+ "encoderLayer.2.feedForward.intermediateDense.bias": "encoder.layer.2.intermediate.dense.bias",
76
+ "encoderLayer.2.feedForward.outputDense.weight": "encoder.layer.2.output.dense.weight",
77
+ "encoderLayer.2.feedForward.outputDense.bias": "encoder.layer.2.output.dense.bias",
78
+ "encoderLayer.2.ffnLayerNorm.weight": "encoder.layer.2.output.LayerNorm.weight",
79
+ "encoderLayer.2.ffnLayerNorm.bias": "encoder.layer.2.output.LayerNorm.bias",
80
+ "encoderLayer.3.multiHeadAttention.q.weight": "encoder.layer.3.attention.self.query.weight",
81
+ "encoderLayer.3.multiHeadAttention.q.bias": "encoder.layer.3.attention.self.query.bias",
82
+ "encoderLayer.3.multiHeadAttention.k.weight": "encoder.layer.3.attention.self.key.weight",
83
+ "encoderLayer.3.multiHeadAttention.k.bias": "encoder.layer.3.attention.self.key.bias",
84
+ "encoderLayer.3.multiHeadAttention.v.weight": "encoder.layer.3.attention.self.value.weight",
85
+ "encoderLayer.3.multiHeadAttention.v.bias": "encoder.layer.3.attention.self.value.bias",
86
+ "encoderLayer.3.multiHeadAttention.o.weight": "encoder.layer.3.attention.output.dense.weight",
87
+ "encoderLayer.3.multiHeadAttention.o.bias": "encoder.layer.3.attention.output.dense.bias",
88
+ "encoderLayer.3.attnLayerNorm.weight": "encoder.layer.3.attention.output.LayerNorm.weight",
89
+ "encoderLayer.3.attnLayerNorm.bias": "encoder.layer.3.attention.output.LayerNorm.bias",
90
+ "encoderLayer.3.feedForward.intermediateDense.weight": "encoder.layer.3.intermediate.dense.weight",
91
+ "encoderLayer.3.feedForward.intermediateDense.bias": "encoder.layer.3.intermediate.dense.bias",
92
+ "encoderLayer.3.feedForward.outputDense.weight": "encoder.layer.3.output.dense.weight",
93
+ "encoderLayer.3.feedForward.outputDense.bias": "encoder.layer.3.output.dense.bias",
94
+ "encoderLayer.3.ffnLayerNorm.weight": "encoder.layer.3.output.LayerNorm.weight",
95
+ "encoderLayer.3.ffnLayerNorm.bias": "encoder.layer.3.output.LayerNorm.bias",
96
+ "encoderLayer.4.multiHeadAttention.q.weight": "encoder.layer.4.attention.self.query.weight",
97
+ "encoderLayer.4.multiHeadAttention.q.bias": "encoder.layer.4.attention.self.query.bias",
98
+ "encoderLayer.4.multiHeadAttention.k.weight": "encoder.layer.4.attention.self.key.weight",
99
+ "encoderLayer.4.multiHeadAttention.k.bias": "encoder.layer.4.attention.self.key.bias",
100
+ "encoderLayer.4.multiHeadAttention.v.weight": "encoder.layer.4.attention.self.value.weight",
101
+ "encoderLayer.4.multiHeadAttention.v.bias": "encoder.layer.4.attention.self.value.bias",
102
+ "encoderLayer.4.multiHeadAttention.o.weight": "encoder.layer.4.attention.output.dense.weight",
103
+ "encoderLayer.4.multiHeadAttention.o.bias": "encoder.layer.4.attention.output.dense.bias",
104
+ "encoderLayer.4.attnLayerNorm.weight": "encoder.layer.4.attention.output.LayerNorm.weight",
105
+ "encoderLayer.4.attnLayerNorm.bias": "encoder.layer.4.attention.output.LayerNorm.bias",
106
+ "encoderLayer.4.feedForward.intermediateDense.weight": "encoder.layer.4.intermediate.dense.weight",
107
+ "encoderLayer.4.feedForward.intermediateDense.bias": "encoder.layer.4.intermediate.dense.bias",
108
+ "encoderLayer.4.feedForward.outputDense.weight": "encoder.layer.4.output.dense.weight",
109
+ "encoderLayer.4.feedForward.outputDense.bias": "encoder.layer.4.output.dense.bias",
110
+ "encoderLayer.4.ffnLayerNorm.weight": "encoder.layer.4.output.LayerNorm.weight",
111
+ "encoderLayer.4.ffnLayerNorm.bias": "encoder.layer.4.output.LayerNorm.bias",
112
+ "encoderLayer.5.multiHeadAttention.q.weight": "encoder.layer.5.attention.self.query.weight",
113
+ "encoderLayer.5.multiHeadAttention.q.bias": "encoder.layer.5.attention.self.query.bias",
114
+ "encoderLayer.5.multiHeadAttention.k.weight": "encoder.layer.5.attention.self.key.weight",
115
+ "encoderLayer.5.multiHeadAttention.k.bias": "encoder.layer.5.attention.self.key.bias",
116
+ "encoderLayer.5.multiHeadAttention.v.weight": "encoder.layer.5.attention.self.value.weight",
117
+ "encoderLayer.5.multiHeadAttention.v.bias": "encoder.layer.5.attention.self.value.bias",
118
+ "encoderLayer.5.multiHeadAttention.o.weight": "encoder.layer.5.attention.output.dense.weight",
119
+ "encoderLayer.5.multiHeadAttention.o.bias": "encoder.layer.5.attention.output.dense.bias",
120
+ "encoderLayer.5.attnLayerNorm.weight": "encoder.layer.5.attention.output.LayerNorm.weight",
121
+ "encoderLayer.5.attnLayerNorm.bias": "encoder.layer.5.attention.output.LayerNorm.bias",
122
+ "encoderLayer.5.feedForward.intermediateDense.weight": "encoder.layer.5.intermediate.dense.weight",
123
+ "encoderLayer.5.feedForward.intermediateDense.bias": "encoder.layer.5.intermediate.dense.bias",
124
+ "encoderLayer.5.feedForward.outputDense.weight": "encoder.layer.5.output.dense.weight",
125
+ "encoderLayer.5.feedForward.outputDense.bias": "encoder.layer.5.output.dense.bias",
126
+ "encoderLayer.5.ffnLayerNorm.weight": "encoder.layer.5.output.LayerNorm.weight",
127
+ "encoderLayer.5.ffnLayerNorm.bias": "encoder.layer.5.output.LayerNorm.bias",
128
+ "encoderLayer.6.multiHeadAttention.q.weight": "encoder.layer.6.attention.self.query.weight",
129
+ "encoderLayer.6.multiHeadAttention.q.bias": "encoder.layer.6.attention.self.query.bias",
130
+ "encoderLayer.6.multiHeadAttention.k.weight": "encoder.layer.6.attention.self.key.weight",
131
+ "encoderLayer.6.multiHeadAttention.k.bias": "encoder.layer.6.attention.self.key.bias",
132
+ "encoderLayer.6.multiHeadAttention.v.weight": "encoder.layer.6.attention.self.value.weight",
133
+ "encoderLayer.6.multiHeadAttention.v.bias": "encoder.layer.6.attention.self.value.bias",
134
+ "encoderLayer.6.multiHeadAttention.o.weight": "encoder.layer.6.attention.output.dense.weight",
135
+ "encoderLayer.6.multiHeadAttention.o.bias": "encoder.layer.6.attention.output.dense.bias",
136
+ "encoderLayer.6.attnLayerNorm.weight": "encoder.layer.6.attention.output.LayerNorm.weight",
137
+ "encoderLayer.6.attnLayerNorm.bias": "encoder.layer.6.attention.output.LayerNorm.bias",
138
+ "encoderLayer.6.feedForward.intermediateDense.weight": "encoder.layer.6.intermediate.dense.weight",
139
+ "encoderLayer.6.feedForward.intermediateDense.bias": "encoder.layer.6.intermediate.dense.bias",
140
+ "encoderLayer.6.feedForward.outputDense.weight": "encoder.layer.6.output.dense.weight",
141
+ "encoderLayer.6.feedForward.outputDense.bias": "encoder.layer.6.output.dense.bias",
142
+ "encoderLayer.6.ffnLayerNorm.weight": "encoder.layer.6.output.LayerNorm.weight",
143
+ "encoderLayer.6.ffnLayerNorm.bias": "encoder.layer.6.output.LayerNorm.bias",
144
+ "encoderLayer.7.multiHeadAttention.q.weight": "encoder.layer.7.attention.self.query.weight",
145
+ "encoderLayer.7.multiHeadAttention.q.bias": "encoder.layer.7.attention.self.query.bias",
146
+ "encoderLayer.7.multiHeadAttention.k.weight": "encoder.layer.7.attention.self.key.weight",
147
+ "encoderLayer.7.multiHeadAttention.k.bias": "encoder.layer.7.attention.self.key.bias",
148
+ "encoderLayer.7.multiHeadAttention.v.weight": "encoder.layer.7.attention.self.value.weight",
149
+ "encoderLayer.7.multiHeadAttention.v.bias": "encoder.layer.7.attention.self.value.bias",
150
+ "encoderLayer.7.multiHeadAttention.o.weight": "encoder.layer.7.attention.output.dense.weight",
151
+ "encoderLayer.7.multiHeadAttention.o.bias": "encoder.layer.7.attention.output.dense.bias",
152
+ "encoderLayer.7.attnLayerNorm.weight": "encoder.layer.7.attention.output.LayerNorm.weight",
153
+ "encoderLayer.7.attnLayerNorm.bias": "encoder.layer.7.attention.output.LayerNorm.bias",
154
+ "encoderLayer.7.feedForward.intermediateDense.weight": "encoder.layer.7.intermediate.dense.weight",
155
+ "encoderLayer.7.feedForward.intermediateDense.bias": "encoder.layer.7.intermediate.dense.bias",
156
+ "encoderLayer.7.feedForward.outputDense.weight": "encoder.layer.7.output.dense.weight",
157
+ "encoderLayer.7.feedForward.outputDense.bias": "encoder.layer.7.output.dense.bias",
158
+ "encoderLayer.7.ffnLayerNorm.weight": "encoder.layer.7.output.LayerNorm.weight",
159
+ "encoderLayer.7.ffnLayerNorm.bias": "encoder.layer.7.output.LayerNorm.bias",
160
+ "encoderLayer.8.multiHeadAttention.q.weight": "encoder.layer.8.attention.self.query.weight",
161
+ "encoderLayer.8.multiHeadAttention.q.bias": "encoder.layer.8.attention.self.query.bias",
162
+ "encoderLayer.8.multiHeadAttention.k.weight": "encoder.layer.8.attention.self.key.weight",
163
+ "encoderLayer.8.multiHeadAttention.k.bias": "encoder.layer.8.attention.self.key.bias",
164
+ "encoderLayer.8.multiHeadAttention.v.weight": "encoder.layer.8.attention.self.value.weight",
165
+ "encoderLayer.8.multiHeadAttention.v.bias": "encoder.layer.8.attention.self.value.bias",
166
+ "encoderLayer.8.multiHeadAttention.o.weight": "encoder.layer.8.attention.output.dense.weight",
167
+ "encoderLayer.8.multiHeadAttention.o.bias": "encoder.layer.8.attention.output.dense.bias",
168
+ "encoderLayer.8.attnLayerNorm.weight": "encoder.layer.8.attention.output.LayerNorm.weight",
169
+ "encoderLayer.8.attnLayerNorm.bias": "encoder.layer.8.attention.output.LayerNorm.bias",
170
+ "encoderLayer.8.feedForward.intermediateDense.weight": "encoder.layer.8.intermediate.dense.weight",
171
+ "encoderLayer.8.feedForward.intermediateDense.bias": "encoder.layer.8.intermediate.dense.bias",
172
+ "encoderLayer.8.feedForward.outputDense.weight": "encoder.layer.8.output.dense.weight",
173
+ "encoderLayer.8.feedForward.outputDense.bias": "encoder.layer.8.output.dense.bias",
174
+ "encoderLayer.8.ffnLayerNorm.weight": "encoder.layer.8.output.LayerNorm.weight",
175
+ "encoderLayer.8.ffnLayerNorm.bias": "encoder.layer.8.output.LayerNorm.bias",
176
+ "encoderLayer.9.multiHeadAttention.q.weight": "encoder.layer.9.attention.self.query.weight",
177
+ "encoderLayer.9.multiHeadAttention.q.bias": "encoder.layer.9.attention.self.query.bias",
178
+ "encoderLayer.9.multiHeadAttention.k.weight": "encoder.layer.9.attention.self.key.weight",
179
+ "encoderLayer.9.multiHeadAttention.k.bias": "encoder.layer.9.attention.self.key.bias",
180
+ "encoderLayer.9.multiHeadAttention.v.weight": "encoder.layer.9.attention.self.value.weight",
181
+ "encoderLayer.9.multiHeadAttention.v.bias": "encoder.layer.9.attention.self.value.bias",
182
+ "encoderLayer.9.multiHeadAttention.o.weight": "encoder.layer.9.attention.output.dense.weight",
183
+ "encoderLayer.9.multiHeadAttention.o.bias": "encoder.layer.9.attention.output.dense.bias",
184
+ "encoderLayer.9.attnLayerNorm.weight": "encoder.layer.9.attention.output.LayerNorm.weight",
185
+ "encoderLayer.9.attnLayerNorm.bias": "encoder.layer.9.attention.output.LayerNorm.bias",
186
+ "encoderLayer.9.feedForward.intermediateDense.weight": "encoder.layer.9.intermediate.dense.weight",
187
+ "encoderLayer.9.feedForward.intermediateDense.bias": "encoder.layer.9.intermediate.dense.bias",
188
+ "encoderLayer.9.feedForward.outputDense.weight": "encoder.layer.9.output.dense.weight",
189
+ "encoderLayer.9.feedForward.outputDense.bias": "encoder.layer.9.output.dense.bias",
190
+ "encoderLayer.9.ffnLayerNorm.weight": "encoder.layer.9.output.LayerNorm.weight",
191
+ "encoderLayer.9.ffnLayerNorm.bias": "encoder.layer.9.output.LayerNorm.bias",
192
+ "encoderLayer.10.multiHeadAttention.q.weight": "encoder.layer.10.attention.self.query.weight",
193
+ "encoderLayer.10.multiHeadAttention.q.bias": "encoder.layer.10.attention.self.query.bias",
194
+ "encoderLayer.10.multiHeadAttention.k.weight": "encoder.layer.10.attention.self.key.weight",
195
+ "encoderLayer.10.multiHeadAttention.k.bias": "encoder.layer.10.attention.self.key.bias",
196
+ "encoderLayer.10.multiHeadAttention.v.weight": "encoder.layer.10.attention.self.value.weight",
197
+ "encoderLayer.10.multiHeadAttention.v.bias": "encoder.layer.10.attention.self.value.bias",
198
+ "encoderLayer.10.multiHeadAttention.o.weight": "encoder.layer.10.attention.output.dense.weight",
199
+ "encoderLayer.10.multiHeadAttention.o.bias": "encoder.layer.10.attention.output.dense.bias",
200
+ "encoderLayer.10.attnLayerNorm.weight": "encoder.layer.10.attention.output.LayerNorm.weight",
201
+ "encoderLayer.10.attnLayerNorm.bias": "encoder.layer.10.attention.output.LayerNorm.bias",
202
+ "encoderLayer.10.feedForward.intermediateDense.weight": "encoder.layer.10.intermediate.dense.weight",
203
+ "encoderLayer.10.feedForward.intermediateDense.bias": "encoder.layer.10.intermediate.dense.bias",
204
+ "encoderLayer.10.feedForward.outputDense.weight": "encoder.layer.10.output.dense.weight",
205
+ "encoderLayer.10.feedForward.outputDense.bias": "encoder.layer.10.output.dense.bias",
206
+ "encoderLayer.10.ffnLayerNorm.weight": "encoder.layer.10.output.LayerNorm.weight",
207
+ "encoderLayer.10.ffnLayerNorm.bias": "encoder.layer.10.output.LayerNorm.bias",
208
+ "encoderLayer.11.multiHeadAttention.q.weight": "encoder.layer.11.attention.self.query.weight",
209
+ "encoderLayer.11.multiHeadAttention.q.bias": "encoder.layer.11.attention.self.query.bias",
210
+ "encoderLayer.11.multiHeadAttention.k.weight": "encoder.layer.11.attention.self.key.weight",
211
+ "encoderLayer.11.multiHeadAttention.k.bias": "encoder.layer.11.attention.self.key.bias",
212
+ "encoderLayer.11.multiHeadAttention.v.weight": "encoder.layer.11.attention.self.value.weight",
213
+ "encoderLayer.11.multiHeadAttention.v.bias": "encoder.layer.11.attention.self.value.bias",
214
+ "encoderLayer.11.multiHeadAttention.o.weight": "encoder.layer.11.attention.output.dense.weight",
215
+ "encoderLayer.11.multiHeadAttention.o.bias": "encoder.layer.11.attention.output.dense.bias",
216
+ "encoderLayer.11.attnLayerNorm.weight": "encoder.layer.11.attention.output.LayerNorm.weight",
217
+ "encoderLayer.11.attnLayerNorm.bias": "encoder.layer.11.attention.output.LayerNorm.bias",
218
+ "encoderLayer.11.feedForward.intermediateDense.weight": "encoder.layer.11.intermediate.dense.weight",
219
+ "encoderLayer.11.feedForward.intermediateDense.bias": "encoder.layer.11.intermediate.dense.bias",
220
+ "encoderLayer.11.feedForward.outputDense.weight": "encoder.layer.11.output.dense.weight",
221
+ "encoderLayer.11.feedForward.outputDense.bias": "encoder.layer.11.output.dense.bias",
222
+ "encoderLayer.11.ffnLayerNorm.weight": "encoder.layer.11.output.LayerNorm.weight",
223
+ "encoderLayer.11.ffnLayerNorm.bias": "encoder.layer.11.output.LayerNorm.bias"
224
+ }
225
+ }
BAAI/bge-base-zh-v1.5/bert4torch_config.json CHANGED
@@ -1,223 +1,225 @@
1
  {
2
- "attention_probs_dropout_prob": 0.1,
3
- "hidden_act": "gelu",
4
- "hidden_dropout_prob": 0.1,
5
- "hidden_size": 768,
6
- "initializer_range": 0.02,
7
- "intermediate_size": 3072,
8
- "layer_norm_eps": 1e-12,
9
- "max_position_embeddings": 512,
10
- "model": "bert",
11
- "num_attention_heads": 12,
12
- "num_hidden_layers": 12,
13
- "bos_token_id": 0,
14
- "eos_token_id": 2,
15
- "pad_token_id": 0,
16
- "torch_dtype": "float32",
17
- "type_vocab_size": 2,
18
- "vocab_size": 21128,
19
- "with_pool": true,
20
- "pooling": {"pool_strategy": "cls"},
21
- "norm_mode": "torch_buildin",
22
- "mapping": {
23
- "embeddings.word_embeddings.weight": "embeddings.word_embeddings.weight",
24
- "embeddings.position_embeddings.weight": "embeddings.position_embeddings.weight",
25
- "embeddings.segment_embeddings.weight": "embeddings.token_type_embeddings.weight",
26
- "embeddings.layerNorm.weight": "embeddings.LayerNorm.weight",
27
- "embeddings.layerNorm.bias": "embeddings.LayerNorm.bias",
28
- "pooler.weight": "pooler.dense.weight",
29
- "pooler.bias": "pooler.dense.bias",
30
- "encoderLayer.0.multiHeadAttention.q.weight": "encoder.layer.0.attention.self.query.weight",
31
- "encoderLayer.0.multiHeadAttention.q.bias": "encoder.layer.0.attention.self.query.bias",
32
- "encoderLayer.0.multiHeadAttention.k.weight": "encoder.layer.0.attention.self.key.weight",
33
- "encoderLayer.0.multiHeadAttention.k.bias": "encoder.layer.0.attention.self.key.bias",
34
- "encoderLayer.0.multiHeadAttention.v.weight": "encoder.layer.0.attention.self.value.weight",
35
- "encoderLayer.0.multiHeadAttention.v.bias": "encoder.layer.0.attention.self.value.bias",
36
- "encoderLayer.0.multiHeadAttention.o.weight": "encoder.layer.0.attention.output.dense.weight",
37
- "encoderLayer.0.multiHeadAttention.o.bias": "encoder.layer.0.attention.output.dense.bias",
38
- "encoderLayer.0.attnLayerNorm.weight": "encoder.layer.0.attention.output.LayerNorm.weight",
39
- "encoderLayer.0.attnLayerNorm.bias": "encoder.layer.0.attention.output.LayerNorm.bias",
40
- "encoderLayer.0.feedForward.intermediateDense.weight": "encoder.layer.0.intermediate.dense.weight",
41
- "encoderLayer.0.feedForward.intermediateDense.bias": "encoder.layer.0.intermediate.dense.bias",
42
- "encoderLayer.0.feedForward.outputDense.weight": "encoder.layer.0.output.dense.weight",
43
- "encoderLayer.0.feedForward.outputDense.bias": "encoder.layer.0.output.dense.bias",
44
- "encoderLayer.0.ffnLayerNorm.weight": "encoder.layer.0.output.LayerNorm.weight",
45
- "encoderLayer.0.ffnLayerNorm.bias": "encoder.layer.0.output.LayerNorm.bias",
46
- "encoderLayer.1.multiHeadAttention.q.weight": "encoder.layer.1.attention.self.query.weight",
47
- "encoderLayer.1.multiHeadAttention.q.bias": "encoder.layer.1.attention.self.query.bias",
48
- "encoderLayer.1.multiHeadAttention.k.weight": "encoder.layer.1.attention.self.key.weight",
49
- "encoderLayer.1.multiHeadAttention.k.bias": "encoder.layer.1.attention.self.key.bias",
50
- "encoderLayer.1.multiHeadAttention.v.weight": "encoder.layer.1.attention.self.value.weight",
51
- "encoderLayer.1.multiHeadAttention.v.bias": "encoder.layer.1.attention.self.value.bias",
52
- "encoderLayer.1.multiHeadAttention.o.weight": "encoder.layer.1.attention.output.dense.weight",
53
- "encoderLayer.1.multiHeadAttention.o.bias": "encoder.layer.1.attention.output.dense.bias",
54
- "encoderLayer.1.attnLayerNorm.weight": "encoder.layer.1.attention.output.LayerNorm.weight",
55
- "encoderLayer.1.attnLayerNorm.bias": "encoder.layer.1.attention.output.LayerNorm.bias",
56
- "encoderLayer.1.feedForward.intermediateDense.weight": "encoder.layer.1.intermediate.dense.weight",
57
- "encoderLayer.1.feedForward.intermediateDense.bias": "encoder.layer.1.intermediate.dense.bias",
58
- "encoderLayer.1.feedForward.outputDense.weight": "encoder.layer.1.output.dense.weight",
59
- "encoderLayer.1.feedForward.outputDense.bias": "encoder.layer.1.output.dense.bias",
60
- "encoderLayer.1.ffnLayerNorm.weight": "encoder.layer.1.output.LayerNorm.weight",
61
- "encoderLayer.1.ffnLayerNorm.bias": "encoder.layer.1.output.LayerNorm.bias",
62
- "encoderLayer.2.multiHeadAttention.q.weight": "encoder.layer.2.attention.self.query.weight",
63
- "encoderLayer.2.multiHeadAttention.q.bias": "encoder.layer.2.attention.self.query.bias",
64
- "encoderLayer.2.multiHeadAttention.k.weight": "encoder.layer.2.attention.self.key.weight",
65
- "encoderLayer.2.multiHeadAttention.k.bias": "encoder.layer.2.attention.self.key.bias",
66
- "encoderLayer.2.multiHeadAttention.v.weight": "encoder.layer.2.attention.self.value.weight",
67
- "encoderLayer.2.multiHeadAttention.v.bias": "encoder.layer.2.attention.self.value.bias",
68
- "encoderLayer.2.multiHeadAttention.o.weight": "encoder.layer.2.attention.output.dense.weight",
69
- "encoderLayer.2.multiHeadAttention.o.bias": "encoder.layer.2.attention.output.dense.bias",
70
- "encoderLayer.2.attnLayerNorm.weight": "encoder.layer.2.attention.output.LayerNorm.weight",
71
- "encoderLayer.2.attnLayerNorm.bias": "encoder.layer.2.attention.output.LayerNorm.bias",
72
- "encoderLayer.2.feedForward.intermediateDense.weight": "encoder.layer.2.intermediate.dense.weight",
73
- "encoderLayer.2.feedForward.intermediateDense.bias": "encoder.layer.2.intermediate.dense.bias",
74
- "encoderLayer.2.feedForward.outputDense.weight": "encoder.layer.2.output.dense.weight",
75
- "encoderLayer.2.feedForward.outputDense.bias": "encoder.layer.2.output.dense.bias",
76
- "encoderLayer.2.ffnLayerNorm.weight": "encoder.layer.2.output.LayerNorm.weight",
77
- "encoderLayer.2.ffnLayerNorm.bias": "encoder.layer.2.output.LayerNorm.bias",
78
- "encoderLayer.3.multiHeadAttention.q.weight": "encoder.layer.3.attention.self.query.weight",
79
- "encoderLayer.3.multiHeadAttention.q.bias": "encoder.layer.3.attention.self.query.bias",
80
- "encoderLayer.3.multiHeadAttention.k.weight": "encoder.layer.3.attention.self.key.weight",
81
- "encoderLayer.3.multiHeadAttention.k.bias": "encoder.layer.3.attention.self.key.bias",
82
- "encoderLayer.3.multiHeadAttention.v.weight": "encoder.layer.3.attention.self.value.weight",
83
- "encoderLayer.3.multiHeadAttention.v.bias": "encoder.layer.3.attention.self.value.bias",
84
- "encoderLayer.3.multiHeadAttention.o.weight": "encoder.layer.3.attention.output.dense.weight",
85
- "encoderLayer.3.multiHeadAttention.o.bias": "encoder.layer.3.attention.output.dense.bias",
86
- "encoderLayer.3.attnLayerNorm.weight": "encoder.layer.3.attention.output.LayerNorm.weight",
87
- "encoderLayer.3.attnLayerNorm.bias": "encoder.layer.3.attention.output.LayerNorm.bias",
88
- "encoderLayer.3.feedForward.intermediateDense.weight": "encoder.layer.3.intermediate.dense.weight",
89
- "encoderLayer.3.feedForward.intermediateDense.bias": "encoder.layer.3.intermediate.dense.bias",
90
- "encoderLayer.3.feedForward.outputDense.weight": "encoder.layer.3.output.dense.weight",
91
- "encoderLayer.3.feedForward.outputDense.bias": "encoder.layer.3.output.dense.bias",
92
- "encoderLayer.3.ffnLayerNorm.weight": "encoder.layer.3.output.LayerNorm.weight",
93
- "encoderLayer.3.ffnLayerNorm.bias": "encoder.layer.3.output.LayerNorm.bias",
94
- "encoderLayer.4.multiHeadAttention.q.weight": "encoder.layer.4.attention.self.query.weight",
95
- "encoderLayer.4.multiHeadAttention.q.bias": "encoder.layer.4.attention.self.query.bias",
96
- "encoderLayer.4.multiHeadAttention.k.weight": "encoder.layer.4.attention.self.key.weight",
97
- "encoderLayer.4.multiHeadAttention.k.bias": "encoder.layer.4.attention.self.key.bias",
98
- "encoderLayer.4.multiHeadAttention.v.weight": "encoder.layer.4.attention.self.value.weight",
99
- "encoderLayer.4.multiHeadAttention.v.bias": "encoder.layer.4.attention.self.value.bias",
100
- "encoderLayer.4.multiHeadAttention.o.weight": "encoder.layer.4.attention.output.dense.weight",
101
- "encoderLayer.4.multiHeadAttention.o.bias": "encoder.layer.4.attention.output.dense.bias",
102
- "encoderLayer.4.attnLayerNorm.weight": "encoder.layer.4.attention.output.LayerNorm.weight",
103
- "encoderLayer.4.attnLayerNorm.bias": "encoder.layer.4.attention.output.LayerNorm.bias",
104
- "encoderLayer.4.feedForward.intermediateDense.weight": "encoder.layer.4.intermediate.dense.weight",
105
- "encoderLayer.4.feedForward.intermediateDense.bias": "encoder.layer.4.intermediate.dense.bias",
106
- "encoderLayer.4.feedForward.outputDense.weight": "encoder.layer.4.output.dense.weight",
107
- "encoderLayer.4.feedForward.outputDense.bias": "encoder.layer.4.output.dense.bias",
108
- "encoderLayer.4.ffnLayerNorm.weight": "encoder.layer.4.output.LayerNorm.weight",
109
- "encoderLayer.4.ffnLayerNorm.bias": "encoder.layer.4.output.LayerNorm.bias",
110
- "encoderLayer.5.multiHeadAttention.q.weight": "encoder.layer.5.attention.self.query.weight",
111
- "encoderLayer.5.multiHeadAttention.q.bias": "encoder.layer.5.attention.self.query.bias",
112
- "encoderLayer.5.multiHeadAttention.k.weight": "encoder.layer.5.attention.self.key.weight",
113
- "encoderLayer.5.multiHeadAttention.k.bias": "encoder.layer.5.attention.self.key.bias",
114
- "encoderLayer.5.multiHeadAttention.v.weight": "encoder.layer.5.attention.self.value.weight",
115
- "encoderLayer.5.multiHeadAttention.v.bias": "encoder.layer.5.attention.self.value.bias",
116
- "encoderLayer.5.multiHeadAttention.o.weight": "encoder.layer.5.attention.output.dense.weight",
117
- "encoderLayer.5.multiHeadAttention.o.bias": "encoder.layer.5.attention.output.dense.bias",
118
- "encoderLayer.5.attnLayerNorm.weight": "encoder.layer.5.attention.output.LayerNorm.weight",
119
- "encoderLayer.5.attnLayerNorm.bias": "encoder.layer.5.attention.output.LayerNorm.bias",
120
- "encoderLayer.5.feedForward.intermediateDense.weight": "encoder.layer.5.intermediate.dense.weight",
121
- "encoderLayer.5.feedForward.intermediateDense.bias": "encoder.layer.5.intermediate.dense.bias",
122
- "encoderLayer.5.feedForward.outputDense.weight": "encoder.layer.5.output.dense.weight",
123
- "encoderLayer.5.feedForward.outputDense.bias": "encoder.layer.5.output.dense.bias",
124
- "encoderLayer.5.ffnLayerNorm.weight": "encoder.layer.5.output.LayerNorm.weight",
125
- "encoderLayer.5.ffnLayerNorm.bias": "encoder.layer.5.output.LayerNorm.bias",
126
- "encoderLayer.6.multiHeadAttention.q.weight": "encoder.layer.6.attention.self.query.weight",
127
- "encoderLayer.6.multiHeadAttention.q.bias": "encoder.layer.6.attention.self.query.bias",
128
- "encoderLayer.6.multiHeadAttention.k.weight": "encoder.layer.6.attention.self.key.weight",
129
- "encoderLayer.6.multiHeadAttention.k.bias": "encoder.layer.6.attention.self.key.bias",
130
- "encoderLayer.6.multiHeadAttention.v.weight": "encoder.layer.6.attention.self.value.weight",
131
- "encoderLayer.6.multiHeadAttention.v.bias": "encoder.layer.6.attention.self.value.bias",
132
- "encoderLayer.6.multiHeadAttention.o.weight": "encoder.layer.6.attention.output.dense.weight",
133
- "encoderLayer.6.multiHeadAttention.o.bias": "encoder.layer.6.attention.output.dense.bias",
134
- "encoderLayer.6.attnLayerNorm.weight": "encoder.layer.6.attention.output.LayerNorm.weight",
135
- "encoderLayer.6.attnLayerNorm.bias": "encoder.layer.6.attention.output.LayerNorm.bias",
136
- "encoderLayer.6.feedForward.intermediateDense.weight": "encoder.layer.6.intermediate.dense.weight",
137
- "encoderLayer.6.feedForward.intermediateDense.bias": "encoder.layer.6.intermediate.dense.bias",
138
- "encoderLayer.6.feedForward.outputDense.weight": "encoder.layer.6.output.dense.weight",
139
- "encoderLayer.6.feedForward.outputDense.bias": "encoder.layer.6.output.dense.bias",
140
- "encoderLayer.6.ffnLayerNorm.weight": "encoder.layer.6.output.LayerNorm.weight",
141
- "encoderLayer.6.ffnLayerNorm.bias": "encoder.layer.6.output.LayerNorm.bias",
142
- "encoderLayer.7.multiHeadAttention.q.weight": "encoder.layer.7.attention.self.query.weight",
143
- "encoderLayer.7.multiHeadAttention.q.bias": "encoder.layer.7.attention.self.query.bias",
144
- "encoderLayer.7.multiHeadAttention.k.weight": "encoder.layer.7.attention.self.key.weight",
145
- "encoderLayer.7.multiHeadAttention.k.bias": "encoder.layer.7.attention.self.key.bias",
146
- "encoderLayer.7.multiHeadAttention.v.weight": "encoder.layer.7.attention.self.value.weight",
147
- "encoderLayer.7.multiHeadAttention.v.bias": "encoder.layer.7.attention.self.value.bias",
148
- "encoderLayer.7.multiHeadAttention.o.weight": "encoder.layer.7.attention.output.dense.weight",
149
- "encoderLayer.7.multiHeadAttention.o.bias": "encoder.layer.7.attention.output.dense.bias",
150
- "encoderLayer.7.attnLayerNorm.weight": "encoder.layer.7.attention.output.LayerNorm.weight",
151
- "encoderLayer.7.attnLayerNorm.bias": "encoder.layer.7.attention.output.LayerNorm.bias",
152
- "encoderLayer.7.feedForward.intermediateDense.weight": "encoder.layer.7.intermediate.dense.weight",
153
- "encoderLayer.7.feedForward.intermediateDense.bias": "encoder.layer.7.intermediate.dense.bias",
154
- "encoderLayer.7.feedForward.outputDense.weight": "encoder.layer.7.output.dense.weight",
155
- "encoderLayer.7.feedForward.outputDense.bias": "encoder.layer.7.output.dense.bias",
156
- "encoderLayer.7.ffnLayerNorm.weight": "encoder.layer.7.output.LayerNorm.weight",
157
- "encoderLayer.7.ffnLayerNorm.bias": "encoder.layer.7.output.LayerNorm.bias",
158
- "encoderLayer.8.multiHeadAttention.q.weight": "encoder.layer.8.attention.self.query.weight",
159
- "encoderLayer.8.multiHeadAttention.q.bias": "encoder.layer.8.attention.self.query.bias",
160
- "encoderLayer.8.multiHeadAttention.k.weight": "encoder.layer.8.attention.self.key.weight",
161
- "encoderLayer.8.multiHeadAttention.k.bias": "encoder.layer.8.attention.self.key.bias",
162
- "encoderLayer.8.multiHeadAttention.v.weight": "encoder.layer.8.attention.self.value.weight",
163
- "encoderLayer.8.multiHeadAttention.v.bias": "encoder.layer.8.attention.self.value.bias",
164
- "encoderLayer.8.multiHeadAttention.o.weight": "encoder.layer.8.attention.output.dense.weight",
165
- "encoderLayer.8.multiHeadAttention.o.bias": "encoder.layer.8.attention.output.dense.bias",
166
- "encoderLayer.8.attnLayerNorm.weight": "encoder.layer.8.attention.output.LayerNorm.weight",
167
- "encoderLayer.8.attnLayerNorm.bias": "encoder.layer.8.attention.output.LayerNorm.bias",
168
- "encoderLayer.8.feedForward.intermediateDense.weight": "encoder.layer.8.intermediate.dense.weight",
169
- "encoderLayer.8.feedForward.intermediateDense.bias": "encoder.layer.8.intermediate.dense.bias",
170
- "encoderLayer.8.feedForward.outputDense.weight": "encoder.layer.8.output.dense.weight",
171
- "encoderLayer.8.feedForward.outputDense.bias": "encoder.layer.8.output.dense.bias",
172
- "encoderLayer.8.ffnLayerNorm.weight": "encoder.layer.8.output.LayerNorm.weight",
173
- "encoderLayer.8.ffnLayerNorm.bias": "encoder.layer.8.output.LayerNorm.bias",
174
- "encoderLayer.9.multiHeadAttention.q.weight": "encoder.layer.9.attention.self.query.weight",
175
- "encoderLayer.9.multiHeadAttention.q.bias": "encoder.layer.9.attention.self.query.bias",
176
- "encoderLayer.9.multiHeadAttention.k.weight": "encoder.layer.9.attention.self.key.weight",
177
- "encoderLayer.9.multiHeadAttention.k.bias": "encoder.layer.9.attention.self.key.bias",
178
- "encoderLayer.9.multiHeadAttention.v.weight": "encoder.layer.9.attention.self.value.weight",
179
- "encoderLayer.9.multiHeadAttention.v.bias": "encoder.layer.9.attention.self.value.bias",
180
- "encoderLayer.9.multiHeadAttention.o.weight": "encoder.layer.9.attention.output.dense.weight",
181
- "encoderLayer.9.multiHeadAttention.o.bias": "encoder.layer.9.attention.output.dense.bias",
182
- "encoderLayer.9.attnLayerNorm.weight": "encoder.layer.9.attention.output.LayerNorm.weight",
183
- "encoderLayer.9.attnLayerNorm.bias": "encoder.layer.9.attention.output.LayerNorm.bias",
184
- "encoderLayer.9.feedForward.intermediateDense.weight": "encoder.layer.9.intermediate.dense.weight",
185
- "encoderLayer.9.feedForward.intermediateDense.bias": "encoder.layer.9.intermediate.dense.bias",
186
- "encoderLayer.9.feedForward.outputDense.weight": "encoder.layer.9.output.dense.weight",
187
- "encoderLayer.9.feedForward.outputDense.bias": "encoder.layer.9.output.dense.bias",
188
- "encoderLayer.9.ffnLayerNorm.weight": "encoder.layer.9.output.LayerNorm.weight",
189
- "encoderLayer.9.ffnLayerNorm.bias": "encoder.layer.9.output.LayerNorm.bias",
190
- "encoderLayer.10.multiHeadAttention.q.weight": "encoder.layer.10.attention.self.query.weight",
191
- "encoderLayer.10.multiHeadAttention.q.bias": "encoder.layer.10.attention.self.query.bias",
192
- "encoderLayer.10.multiHeadAttention.k.weight": "encoder.layer.10.attention.self.key.weight",
193
- "encoderLayer.10.multiHeadAttention.k.bias": "encoder.layer.10.attention.self.key.bias",
194
- "encoderLayer.10.multiHeadAttention.v.weight": "encoder.layer.10.attention.self.value.weight",
195
- "encoderLayer.10.multiHeadAttention.v.bias": "encoder.layer.10.attention.self.value.bias",
196
- "encoderLayer.10.multiHeadAttention.o.weight": "encoder.layer.10.attention.output.dense.weight",
197
- "encoderLayer.10.multiHeadAttention.o.bias": "encoder.layer.10.attention.output.dense.bias",
198
- "encoderLayer.10.attnLayerNorm.weight": "encoder.layer.10.attention.output.LayerNorm.weight",
199
- "encoderLayer.10.attnLayerNorm.bias": "encoder.layer.10.attention.output.LayerNorm.bias",
200
- "encoderLayer.10.feedForward.intermediateDense.weight": "encoder.layer.10.intermediate.dense.weight",
201
- "encoderLayer.10.feedForward.intermediateDense.bias": "encoder.layer.10.intermediate.dense.bias",
202
- "encoderLayer.10.feedForward.outputDense.weight": "encoder.layer.10.output.dense.weight",
203
- "encoderLayer.10.feedForward.outputDense.bias": "encoder.layer.10.output.dense.bias",
204
- "encoderLayer.10.ffnLayerNorm.weight": "encoder.layer.10.output.LayerNorm.weight",
205
- "encoderLayer.10.ffnLayerNorm.bias": "encoder.layer.10.output.LayerNorm.bias",
206
- "encoderLayer.11.multiHeadAttention.q.weight": "encoder.layer.11.attention.self.query.weight",
207
- "encoderLayer.11.multiHeadAttention.q.bias": "encoder.layer.11.attention.self.query.bias",
208
- "encoderLayer.11.multiHeadAttention.k.weight": "encoder.layer.11.attention.self.key.weight",
209
- "encoderLayer.11.multiHeadAttention.k.bias": "encoder.layer.11.attention.self.key.bias",
210
- "encoderLayer.11.multiHeadAttention.v.weight": "encoder.layer.11.attention.self.value.weight",
211
- "encoderLayer.11.multiHeadAttention.v.bias": "encoder.layer.11.attention.self.value.bias",
212
- "encoderLayer.11.multiHeadAttention.o.weight": "encoder.layer.11.attention.output.dense.weight",
213
- "encoderLayer.11.multiHeadAttention.o.bias": "encoder.layer.11.attention.output.dense.bias",
214
- "encoderLayer.11.attnLayerNorm.weight": "encoder.layer.11.attention.output.LayerNorm.weight",
215
- "encoderLayer.11.attnLayerNorm.bias": "encoder.layer.11.attention.output.LayerNorm.bias",
216
- "encoderLayer.11.feedForward.intermediateDense.weight": "encoder.layer.11.intermediate.dense.weight",
217
- "encoderLayer.11.feedForward.intermediateDense.bias": "encoder.layer.11.intermediate.dense.bias",
218
- "encoderLayer.11.feedForward.outputDense.weight": "encoder.layer.11.output.dense.weight",
219
- "encoderLayer.11.feedForward.outputDense.bias": "encoder.layer.11.output.dense.bias",
220
- "encoderLayer.11.ffnLayerNorm.weight": "encoder.layer.11.output.LayerNorm.weight",
221
- "encoderLayer.11.ffnLayerNorm.bias": "encoder.layer.11.output.LayerNorm.bias"
222
- }
223
- }
 
 
 
1
  {
2
+ "attention_probs_dropout_prob": 0.1,
3
+ "hidden_act": "gelu",
4
+ "hidden_dropout_prob": 0.1,
5
+ "hidden_size": 768,
6
+ "initializer_range": 0.02,
7
+ "intermediate_size": 3072,
8
+ "layer_norm_eps": 1e-12,
9
+ "max_position_embeddings": 512,
10
+ "model": "bert",
11
+ "num_attention_heads": 12,
12
+ "num_hidden_layers": 12,
13
+ "bos_token_id": 0,
14
+ "eos_token_id": 2,
15
+ "pad_token_id": 0,
16
+ "torch_dtype": "float32",
17
+ "type_vocab_size": 2,
18
+ "vocab_size": 21128,
19
+ "with_pool": true,
20
+ "pooling": {
21
+ "pool_strategy": "cls"
22
+ },
23
+ "norm_mode": "torch_buildin",
24
+ "mapping": {
25
+ "embeddings.word_embeddings.weight": "embeddings.word_embeddings.weight",
26
+ "embeddings.position_embeddings.weight": "embeddings.position_embeddings.weight",
27
+ "embeddings.segment_embeddings.weight": "embeddings.token_type_embeddings.weight",
28
+ "embeddings.layerNorm.weight": "embeddings.LayerNorm.weight",
29
+ "embeddings.layerNorm.bias": "embeddings.LayerNorm.bias",
30
+ "pooler.weight": "pooler.dense.weight",
31
+ "pooler.bias": "pooler.dense.bias",
32
+ "encoderLayer.0.multiHeadAttention.q.weight": "encoder.layer.0.attention.self.query.weight",
33
+ "encoderLayer.0.multiHeadAttention.q.bias": "encoder.layer.0.attention.self.query.bias",
34
+ "encoderLayer.0.multiHeadAttention.k.weight": "encoder.layer.0.attention.self.key.weight",
35
+ "encoderLayer.0.multiHeadAttention.k.bias": "encoder.layer.0.attention.self.key.bias",
36
+ "encoderLayer.0.multiHeadAttention.v.weight": "encoder.layer.0.attention.self.value.weight",
37
+ "encoderLayer.0.multiHeadAttention.v.bias": "encoder.layer.0.attention.self.value.bias",
38
+ "encoderLayer.0.multiHeadAttention.o.weight": "encoder.layer.0.attention.output.dense.weight",
39
+ "encoderLayer.0.multiHeadAttention.o.bias": "encoder.layer.0.attention.output.dense.bias",
40
+ "encoderLayer.0.attnLayerNorm.weight": "encoder.layer.0.attention.output.LayerNorm.weight",
41
+ "encoderLayer.0.attnLayerNorm.bias": "encoder.layer.0.attention.output.LayerNorm.bias",
42
+ "encoderLayer.0.feedForward.intermediateDense.weight": "encoder.layer.0.intermediate.dense.weight",
43
+ "encoderLayer.0.feedForward.intermediateDense.bias": "encoder.layer.0.intermediate.dense.bias",
44
+ "encoderLayer.0.feedForward.outputDense.weight": "encoder.layer.0.output.dense.weight",
45
+ "encoderLayer.0.feedForward.outputDense.bias": "encoder.layer.0.output.dense.bias",
46
+ "encoderLayer.0.ffnLayerNorm.weight": "encoder.layer.0.output.LayerNorm.weight",
47
+ "encoderLayer.0.ffnLayerNorm.bias": "encoder.layer.0.output.LayerNorm.bias",
48
+ "encoderLayer.1.multiHeadAttention.q.weight": "encoder.layer.1.attention.self.query.weight",
49
+ "encoderLayer.1.multiHeadAttention.q.bias": "encoder.layer.1.attention.self.query.bias",
50
+ "encoderLayer.1.multiHeadAttention.k.weight": "encoder.layer.1.attention.self.key.weight",
51
+ "encoderLayer.1.multiHeadAttention.k.bias": "encoder.layer.1.attention.self.key.bias",
52
+ "encoderLayer.1.multiHeadAttention.v.weight": "encoder.layer.1.attention.self.value.weight",
53
+ "encoderLayer.1.multiHeadAttention.v.bias": "encoder.layer.1.attention.self.value.bias",
54
+ "encoderLayer.1.multiHeadAttention.o.weight": "encoder.layer.1.attention.output.dense.weight",
55
+ "encoderLayer.1.multiHeadAttention.o.bias": "encoder.layer.1.attention.output.dense.bias",
56
+ "encoderLayer.1.attnLayerNorm.weight": "encoder.layer.1.attention.output.LayerNorm.weight",
57
+ "encoderLayer.1.attnLayerNorm.bias": "encoder.layer.1.attention.output.LayerNorm.bias",
58
+ "encoderLayer.1.feedForward.intermediateDense.weight": "encoder.layer.1.intermediate.dense.weight",
59
+ "encoderLayer.1.feedForward.intermediateDense.bias": "encoder.layer.1.intermediate.dense.bias",
60
+ "encoderLayer.1.feedForward.outputDense.weight": "encoder.layer.1.output.dense.weight",
61
+ "encoderLayer.1.feedForward.outputDense.bias": "encoder.layer.1.output.dense.bias",
62
+ "encoderLayer.1.ffnLayerNorm.weight": "encoder.layer.1.output.LayerNorm.weight",
63
+ "encoderLayer.1.ffnLayerNorm.bias": "encoder.layer.1.output.LayerNorm.bias",
64
+ "encoderLayer.2.multiHeadAttention.q.weight": "encoder.layer.2.attention.self.query.weight",
65
+ "encoderLayer.2.multiHeadAttention.q.bias": "encoder.layer.2.attention.self.query.bias",
66
+ "encoderLayer.2.multiHeadAttention.k.weight": "encoder.layer.2.attention.self.key.weight",
67
+ "encoderLayer.2.multiHeadAttention.k.bias": "encoder.layer.2.attention.self.key.bias",
68
+ "encoderLayer.2.multiHeadAttention.v.weight": "encoder.layer.2.attention.self.value.weight",
69
+ "encoderLayer.2.multiHeadAttention.v.bias": "encoder.layer.2.attention.self.value.bias",
70
+ "encoderLayer.2.multiHeadAttention.o.weight": "encoder.layer.2.attention.output.dense.weight",
71
+ "encoderLayer.2.multiHeadAttention.o.bias": "encoder.layer.2.attention.output.dense.bias",
72
+ "encoderLayer.2.attnLayerNorm.weight": "encoder.layer.2.attention.output.LayerNorm.weight",
73
+ "encoderLayer.2.attnLayerNorm.bias": "encoder.layer.2.attention.output.LayerNorm.bias",
74
+ "encoderLayer.2.feedForward.intermediateDense.weight": "encoder.layer.2.intermediate.dense.weight",
75
+ "encoderLayer.2.feedForward.intermediateDense.bias": "encoder.layer.2.intermediate.dense.bias",
76
+ "encoderLayer.2.feedForward.outputDense.weight": "encoder.layer.2.output.dense.weight",
77
+ "encoderLayer.2.feedForward.outputDense.bias": "encoder.layer.2.output.dense.bias",
78
+ "encoderLayer.2.ffnLayerNorm.weight": "encoder.layer.2.output.LayerNorm.weight",
79
+ "encoderLayer.2.ffnLayerNorm.bias": "encoder.layer.2.output.LayerNorm.bias",
80
+ "encoderLayer.3.multiHeadAttention.q.weight": "encoder.layer.3.attention.self.query.weight",
81
+ "encoderLayer.3.multiHeadAttention.q.bias": "encoder.layer.3.attention.self.query.bias",
82
+ "encoderLayer.3.multiHeadAttention.k.weight": "encoder.layer.3.attention.self.key.weight",
83
+ "encoderLayer.3.multiHeadAttention.k.bias": "encoder.layer.3.attention.self.key.bias",
84
+ "encoderLayer.3.multiHeadAttention.v.weight": "encoder.layer.3.attention.self.value.weight",
85
+ "encoderLayer.3.multiHeadAttention.v.bias": "encoder.layer.3.attention.self.value.bias",
86
+ "encoderLayer.3.multiHeadAttention.o.weight": "encoder.layer.3.attention.output.dense.weight",
87
+ "encoderLayer.3.multiHeadAttention.o.bias": "encoder.layer.3.attention.output.dense.bias",
88
+ "encoderLayer.3.attnLayerNorm.weight": "encoder.layer.3.attention.output.LayerNorm.weight",
89
+ "encoderLayer.3.attnLayerNorm.bias": "encoder.layer.3.attention.output.LayerNorm.bias",
90
+ "encoderLayer.3.feedForward.intermediateDense.weight": "encoder.layer.3.intermediate.dense.weight",
91
+ "encoderLayer.3.feedForward.intermediateDense.bias": "encoder.layer.3.intermediate.dense.bias",
92
+ "encoderLayer.3.feedForward.outputDense.weight": "encoder.layer.3.output.dense.weight",
93
+ "encoderLayer.3.feedForward.outputDense.bias": "encoder.layer.3.output.dense.bias",
94
+ "encoderLayer.3.ffnLayerNorm.weight": "encoder.layer.3.output.LayerNorm.weight",
95
+ "encoderLayer.3.ffnLayerNorm.bias": "encoder.layer.3.output.LayerNorm.bias",
96
+ "encoderLayer.4.multiHeadAttention.q.weight": "encoder.layer.4.attention.self.query.weight",
97
+ "encoderLayer.4.multiHeadAttention.q.bias": "encoder.layer.4.attention.self.query.bias",
98
+ "encoderLayer.4.multiHeadAttention.k.weight": "encoder.layer.4.attention.self.key.weight",
99
+ "encoderLayer.4.multiHeadAttention.k.bias": "encoder.layer.4.attention.self.key.bias",
100
+ "encoderLayer.4.multiHeadAttention.v.weight": "encoder.layer.4.attention.self.value.weight",
101
+ "encoderLayer.4.multiHeadAttention.v.bias": "encoder.layer.4.attention.self.value.bias",
102
+ "encoderLayer.4.multiHeadAttention.o.weight": "encoder.layer.4.attention.output.dense.weight",
103
+ "encoderLayer.4.multiHeadAttention.o.bias": "encoder.layer.4.attention.output.dense.bias",
104
+ "encoderLayer.4.attnLayerNorm.weight": "encoder.layer.4.attention.output.LayerNorm.weight",
105
+ "encoderLayer.4.attnLayerNorm.bias": "encoder.layer.4.attention.output.LayerNorm.bias",
106
+ "encoderLayer.4.feedForward.intermediateDense.weight": "encoder.layer.4.intermediate.dense.weight",
107
+ "encoderLayer.4.feedForward.intermediateDense.bias": "encoder.layer.4.intermediate.dense.bias",
108
+ "encoderLayer.4.feedForward.outputDense.weight": "encoder.layer.4.output.dense.weight",
109
+ "encoderLayer.4.feedForward.outputDense.bias": "encoder.layer.4.output.dense.bias",
110
+ "encoderLayer.4.ffnLayerNorm.weight": "encoder.layer.4.output.LayerNorm.weight",
111
+ "encoderLayer.4.ffnLayerNorm.bias": "encoder.layer.4.output.LayerNorm.bias",
112
+ "encoderLayer.5.multiHeadAttention.q.weight": "encoder.layer.5.attention.self.query.weight",
113
+ "encoderLayer.5.multiHeadAttention.q.bias": "encoder.layer.5.attention.self.query.bias",
114
+ "encoderLayer.5.multiHeadAttention.k.weight": "encoder.layer.5.attention.self.key.weight",
115
+ "encoderLayer.5.multiHeadAttention.k.bias": "encoder.layer.5.attention.self.key.bias",
116
+ "encoderLayer.5.multiHeadAttention.v.weight": "encoder.layer.5.attention.self.value.weight",
117
+ "encoderLayer.5.multiHeadAttention.v.bias": "encoder.layer.5.attention.self.value.bias",
118
+ "encoderLayer.5.multiHeadAttention.o.weight": "encoder.layer.5.attention.output.dense.weight",
119
+ "encoderLayer.5.multiHeadAttention.o.bias": "encoder.layer.5.attention.output.dense.bias",
120
+ "encoderLayer.5.attnLayerNorm.weight": "encoder.layer.5.attention.output.LayerNorm.weight",
121
+ "encoderLayer.5.attnLayerNorm.bias": "encoder.layer.5.attention.output.LayerNorm.bias",
122
+ "encoderLayer.5.feedForward.intermediateDense.weight": "encoder.layer.5.intermediate.dense.weight",
123
+ "encoderLayer.5.feedForward.intermediateDense.bias": "encoder.layer.5.intermediate.dense.bias",
124
+ "encoderLayer.5.feedForward.outputDense.weight": "encoder.layer.5.output.dense.weight",
125
+ "encoderLayer.5.feedForward.outputDense.bias": "encoder.layer.5.output.dense.bias",
126
+ "encoderLayer.5.ffnLayerNorm.weight": "encoder.layer.5.output.LayerNorm.weight",
127
+ "encoderLayer.5.ffnLayerNorm.bias": "encoder.layer.5.output.LayerNorm.bias",
128
+ "encoderLayer.6.multiHeadAttention.q.weight": "encoder.layer.6.attention.self.query.weight",
129
+ "encoderLayer.6.multiHeadAttention.q.bias": "encoder.layer.6.attention.self.query.bias",
130
+ "encoderLayer.6.multiHeadAttention.k.weight": "encoder.layer.6.attention.self.key.weight",
131
+ "encoderLayer.6.multiHeadAttention.k.bias": "encoder.layer.6.attention.self.key.bias",
132
+ "encoderLayer.6.multiHeadAttention.v.weight": "encoder.layer.6.attention.self.value.weight",
133
+ "encoderLayer.6.multiHeadAttention.v.bias": "encoder.layer.6.attention.self.value.bias",
134
+ "encoderLayer.6.multiHeadAttention.o.weight": "encoder.layer.6.attention.output.dense.weight",
135
+ "encoderLayer.6.multiHeadAttention.o.bias": "encoder.layer.6.attention.output.dense.bias",
136
+ "encoderLayer.6.attnLayerNorm.weight": "encoder.layer.6.attention.output.LayerNorm.weight",
137
+ "encoderLayer.6.attnLayerNorm.bias": "encoder.layer.6.attention.output.LayerNorm.bias",
138
+ "encoderLayer.6.feedForward.intermediateDense.weight": "encoder.layer.6.intermediate.dense.weight",
139
+ "encoderLayer.6.feedForward.intermediateDense.bias": "encoder.layer.6.intermediate.dense.bias",
140
+ "encoderLayer.6.feedForward.outputDense.weight": "encoder.layer.6.output.dense.weight",
141
+ "encoderLayer.6.feedForward.outputDense.bias": "encoder.layer.6.output.dense.bias",
142
+ "encoderLayer.6.ffnLayerNorm.weight": "encoder.layer.6.output.LayerNorm.weight",
143
+ "encoderLayer.6.ffnLayerNorm.bias": "encoder.layer.6.output.LayerNorm.bias",
144
+ "encoderLayer.7.multiHeadAttention.q.weight": "encoder.layer.7.attention.self.query.weight",
145
+ "encoderLayer.7.multiHeadAttention.q.bias": "encoder.layer.7.attention.self.query.bias",
146
+ "encoderLayer.7.multiHeadAttention.k.weight": "encoder.layer.7.attention.self.key.weight",
147
+ "encoderLayer.7.multiHeadAttention.k.bias": "encoder.layer.7.attention.self.key.bias",
148
+ "encoderLayer.7.multiHeadAttention.v.weight": "encoder.layer.7.attention.self.value.weight",
149
+ "encoderLayer.7.multiHeadAttention.v.bias": "encoder.layer.7.attention.self.value.bias",
150
+ "encoderLayer.7.multiHeadAttention.o.weight": "encoder.layer.7.attention.output.dense.weight",
151
+ "encoderLayer.7.multiHeadAttention.o.bias": "encoder.layer.7.attention.output.dense.bias",
152
+ "encoderLayer.7.attnLayerNorm.weight": "encoder.layer.7.attention.output.LayerNorm.weight",
153
+ "encoderLayer.7.attnLayerNorm.bias": "encoder.layer.7.attention.output.LayerNorm.bias",
154
+ "encoderLayer.7.feedForward.intermediateDense.weight": "encoder.layer.7.intermediate.dense.weight",
155
+ "encoderLayer.7.feedForward.intermediateDense.bias": "encoder.layer.7.intermediate.dense.bias",
156
+ "encoderLayer.7.feedForward.outputDense.weight": "encoder.layer.7.output.dense.weight",
157
+ "encoderLayer.7.feedForward.outputDense.bias": "encoder.layer.7.output.dense.bias",
158
+ "encoderLayer.7.ffnLayerNorm.weight": "encoder.layer.7.output.LayerNorm.weight",
159
+ "encoderLayer.7.ffnLayerNorm.bias": "encoder.layer.7.output.LayerNorm.bias",
160
+ "encoderLayer.8.multiHeadAttention.q.weight": "encoder.layer.8.attention.self.query.weight",
161
+ "encoderLayer.8.multiHeadAttention.q.bias": "encoder.layer.8.attention.self.query.bias",
162
+ "encoderLayer.8.multiHeadAttention.k.weight": "encoder.layer.8.attention.self.key.weight",
163
+ "encoderLayer.8.multiHeadAttention.k.bias": "encoder.layer.8.attention.self.key.bias",
164
+ "encoderLayer.8.multiHeadAttention.v.weight": "encoder.layer.8.attention.self.value.weight",
165
+ "encoderLayer.8.multiHeadAttention.v.bias": "encoder.layer.8.attention.self.value.bias",
166
+ "encoderLayer.8.multiHeadAttention.o.weight": "encoder.layer.8.attention.output.dense.weight",
167
+ "encoderLayer.8.multiHeadAttention.o.bias": "encoder.layer.8.attention.output.dense.bias",
168
+ "encoderLayer.8.attnLayerNorm.weight": "encoder.layer.8.attention.output.LayerNorm.weight",
169
+ "encoderLayer.8.attnLayerNorm.bias": "encoder.layer.8.attention.output.LayerNorm.bias",
170
+ "encoderLayer.8.feedForward.intermediateDense.weight": "encoder.layer.8.intermediate.dense.weight",
171
+ "encoderLayer.8.feedForward.intermediateDense.bias": "encoder.layer.8.intermediate.dense.bias",
172
+ "encoderLayer.8.feedForward.outputDense.weight": "encoder.layer.8.output.dense.weight",
173
+ "encoderLayer.8.feedForward.outputDense.bias": "encoder.layer.8.output.dense.bias",
174
+ "encoderLayer.8.ffnLayerNorm.weight": "encoder.layer.8.output.LayerNorm.weight",
175
+ "encoderLayer.8.ffnLayerNorm.bias": "encoder.layer.8.output.LayerNorm.bias",
176
+ "encoderLayer.9.multiHeadAttention.q.weight": "encoder.layer.9.attention.self.query.weight",
177
+ "encoderLayer.9.multiHeadAttention.q.bias": "encoder.layer.9.attention.self.query.bias",
178
+ "encoderLayer.9.multiHeadAttention.k.weight": "encoder.layer.9.attention.self.key.weight",
179
+ "encoderLayer.9.multiHeadAttention.k.bias": "encoder.layer.9.attention.self.key.bias",
180
+ "encoderLayer.9.multiHeadAttention.v.weight": "encoder.layer.9.attention.self.value.weight",
181
+ "encoderLayer.9.multiHeadAttention.v.bias": "encoder.layer.9.attention.self.value.bias",
182
+ "encoderLayer.9.multiHeadAttention.o.weight": "encoder.layer.9.attention.output.dense.weight",
183
+ "encoderLayer.9.multiHeadAttention.o.bias": "encoder.layer.9.attention.output.dense.bias",
184
+ "encoderLayer.9.attnLayerNorm.weight": "encoder.layer.9.attention.output.LayerNorm.weight",
185
+ "encoderLayer.9.attnLayerNorm.bias": "encoder.layer.9.attention.output.LayerNorm.bias",
186
+ "encoderLayer.9.feedForward.intermediateDense.weight": "encoder.layer.9.intermediate.dense.weight",
187
+ "encoderLayer.9.feedForward.intermediateDense.bias": "encoder.layer.9.intermediate.dense.bias",
188
+ "encoderLayer.9.feedForward.outputDense.weight": "encoder.layer.9.output.dense.weight",
189
+ "encoderLayer.9.feedForward.outputDense.bias": "encoder.layer.9.output.dense.bias",
190
+ "encoderLayer.9.ffnLayerNorm.weight": "encoder.layer.9.output.LayerNorm.weight",
191
+ "encoderLayer.9.ffnLayerNorm.bias": "encoder.layer.9.output.LayerNorm.bias",
192
+ "encoderLayer.10.multiHeadAttention.q.weight": "encoder.layer.10.attention.self.query.weight",
193
+ "encoderLayer.10.multiHeadAttention.q.bias": "encoder.layer.10.attention.self.query.bias",
194
+ "encoderLayer.10.multiHeadAttention.k.weight": "encoder.layer.10.attention.self.key.weight",
195
+ "encoderLayer.10.multiHeadAttention.k.bias": "encoder.layer.10.attention.self.key.bias",
196
+ "encoderLayer.10.multiHeadAttention.v.weight": "encoder.layer.10.attention.self.value.weight",
197
+ "encoderLayer.10.multiHeadAttention.v.bias": "encoder.layer.10.attention.self.value.bias",
198
+ "encoderLayer.10.multiHeadAttention.o.weight": "encoder.layer.10.attention.output.dense.weight",
199
+ "encoderLayer.10.multiHeadAttention.o.bias": "encoder.layer.10.attention.output.dense.bias",
200
+ "encoderLayer.10.attnLayerNorm.weight": "encoder.layer.10.attention.output.LayerNorm.weight",
201
+ "encoderLayer.10.attnLayerNorm.bias": "encoder.layer.10.attention.output.LayerNorm.bias",
202
+ "encoderLayer.10.feedForward.intermediateDense.weight": "encoder.layer.10.intermediate.dense.weight",
203
+ "encoderLayer.10.feedForward.intermediateDense.bias": "encoder.layer.10.intermediate.dense.bias",
204
+ "encoderLayer.10.feedForward.outputDense.weight": "encoder.layer.10.output.dense.weight",
205
+ "encoderLayer.10.feedForward.outputDense.bias": "encoder.layer.10.output.dense.bias",
206
+ "encoderLayer.10.ffnLayerNorm.weight": "encoder.layer.10.output.LayerNorm.weight",
207
+ "encoderLayer.10.ffnLayerNorm.bias": "encoder.layer.10.output.LayerNorm.bias",
208
+ "encoderLayer.11.multiHeadAttention.q.weight": "encoder.layer.11.attention.self.query.weight",
209
+ "encoderLayer.11.multiHeadAttention.q.bias": "encoder.layer.11.attention.self.query.bias",
210
+ "encoderLayer.11.multiHeadAttention.k.weight": "encoder.layer.11.attention.self.key.weight",
211
+ "encoderLayer.11.multiHeadAttention.k.bias": "encoder.layer.11.attention.self.key.bias",
212
+ "encoderLayer.11.multiHeadAttention.v.weight": "encoder.layer.11.attention.self.value.weight",
213
+ "encoderLayer.11.multiHeadAttention.v.bias": "encoder.layer.11.attention.self.value.bias",
214
+ "encoderLayer.11.multiHeadAttention.o.weight": "encoder.layer.11.attention.output.dense.weight",
215
+ "encoderLayer.11.multiHeadAttention.o.bias": "encoder.layer.11.attention.output.dense.bias",
216
+ "encoderLayer.11.attnLayerNorm.weight": "encoder.layer.11.attention.output.LayerNorm.weight",
217
+ "encoderLayer.11.attnLayerNorm.bias": "encoder.layer.11.attention.output.LayerNorm.bias",
218
+ "encoderLayer.11.feedForward.intermediateDense.weight": "encoder.layer.11.intermediate.dense.weight",
219
+ "encoderLayer.11.feedForward.intermediateDense.bias": "encoder.layer.11.intermediate.dense.bias",
220
+ "encoderLayer.11.feedForward.outputDense.weight": "encoder.layer.11.output.dense.weight",
221
+ "encoderLayer.11.feedForward.outputDense.bias": "encoder.layer.11.output.dense.bias",
222
+ "encoderLayer.11.ffnLayerNorm.weight": "encoder.layer.11.output.LayerNorm.weight",
223
+ "encoderLayer.11.ffnLayerNorm.bias": "encoder.layer.11.output.LayerNorm.bias"
224
+ }
225
+ }
BAAI/bge-large-en-v1.5/bert4torch_config.json CHANGED
@@ -1,412 +1,414 @@
1
- {
2
- "attention_probs_dropout_prob": 0.1,
3
- "hidden_act": "gelu",
4
- "hidden_dropout_prob": 0.1,
5
- "hidden_size": 1024,
6
- "initializer_range": 0.02,
7
- "intermediate_size": 4096,
8
- "layer_norm_eps": 1e-12,
9
- "max_position_embeddings": 512,
10
- "model": "bert",
11
- "num_attention_heads": 16,
12
- "num_hidden_layers": 24,
13
- "pad_token_id": 0,
14
- "torch_dtype": "float32",
15
- "type_vocab_size": 2,
16
- "vocab_size": 30522,
17
- "with_pool": true,
18
- "pooling": {"pool_strategy": "cls"},
19
- "mapping": {
20
- "embeddings.word_embeddings.weight": "embeddings.word_embeddings.weight",
21
- "embeddings.position_embeddings.weight": "embeddings.position_embeddings.weight",
22
- "embeddings.segment_embeddings.weight": "embeddings.token_type_embeddings.weight",
23
- "embeddings.layerNorm.weight": "embeddings.LayerNorm.weight",
24
- "embeddings.layerNorm.bias": "embeddings.LayerNorm.bias",
25
- "pooler.weight": "pooler.dense.weight",
26
- "pooler.bias": "pooler.dense.bias",
27
- "encoderLayer.0.multiHeadAttention.q.weight": "encoder.layer.0.attention.self.query.weight",
28
- "encoderLayer.0.multiHeadAttention.q.bias": "encoder.layer.0.attention.self.query.bias",
29
- "encoderLayer.0.multiHeadAttention.k.weight": "encoder.layer.0.attention.self.key.weight",
30
- "encoderLayer.0.multiHeadAttention.k.bias": "encoder.layer.0.attention.self.key.bias",
31
- "encoderLayer.0.multiHeadAttention.v.weight": "encoder.layer.0.attention.self.value.weight",
32
- "encoderLayer.0.multiHeadAttention.v.bias": "encoder.layer.0.attention.self.value.bias",
33
- "encoderLayer.0.multiHeadAttention.o.weight": "encoder.layer.0.attention.output.dense.weight",
34
- "encoderLayer.0.multiHeadAttention.o.bias": "encoder.layer.0.attention.output.dense.bias",
35
- "encoderLayer.0.attnLayerNorm.weight": "encoder.layer.0.attention.output.LayerNorm.weight",
36
- "encoderLayer.0.attnLayerNorm.bias": "encoder.layer.0.attention.output.LayerNorm.bias",
37
- "encoderLayer.0.feedForward.intermediateDense.weight": "encoder.layer.0.intermediate.dense.weight",
38
- "encoderLayer.0.feedForward.intermediateDense.bias": "encoder.layer.0.intermediate.dense.bias",
39
- "encoderLayer.0.feedForward.outputDense.weight": "encoder.layer.0.output.dense.weight",
40
- "encoderLayer.0.feedForward.outputDense.bias": "encoder.layer.0.output.dense.bias",
41
- "encoderLayer.0.ffnLayerNorm.weight": "encoder.layer.0.output.LayerNorm.weight",
42
- "encoderLayer.0.ffnLayerNorm.bias": "encoder.layer.0.output.LayerNorm.bias",
43
- "encoderLayer.1.multiHeadAttention.q.weight": "encoder.layer.1.attention.self.query.weight",
44
- "encoderLayer.1.multiHeadAttention.q.bias": "encoder.layer.1.attention.self.query.bias",
45
- "encoderLayer.1.multiHeadAttention.k.weight": "encoder.layer.1.attention.self.key.weight",
46
- "encoderLayer.1.multiHeadAttention.k.bias": "encoder.layer.1.attention.self.key.bias",
47
- "encoderLayer.1.multiHeadAttention.v.weight": "encoder.layer.1.attention.self.value.weight",
48
- "encoderLayer.1.multiHeadAttention.v.bias": "encoder.layer.1.attention.self.value.bias",
49
- "encoderLayer.1.multiHeadAttention.o.weight": "encoder.layer.1.attention.output.dense.weight",
50
- "encoderLayer.1.multiHeadAttention.o.bias": "encoder.layer.1.attention.output.dense.bias",
51
- "encoderLayer.1.attnLayerNorm.weight": "encoder.layer.1.attention.output.LayerNorm.weight",
52
- "encoderLayer.1.attnLayerNorm.bias": "encoder.layer.1.attention.output.LayerNorm.bias",
53
- "encoderLayer.1.feedForward.intermediateDense.weight": "encoder.layer.1.intermediate.dense.weight",
54
- "encoderLayer.1.feedForward.intermediateDense.bias": "encoder.layer.1.intermediate.dense.bias",
55
- "encoderLayer.1.feedForward.outputDense.weight": "encoder.layer.1.output.dense.weight",
56
- "encoderLayer.1.feedForward.outputDense.bias": "encoder.layer.1.output.dense.bias",
57
- "encoderLayer.1.ffnLayerNorm.weight": "encoder.layer.1.output.LayerNorm.weight",
58
- "encoderLayer.1.ffnLayerNorm.bias": "encoder.layer.1.output.LayerNorm.bias",
59
- "encoderLayer.2.multiHeadAttention.q.weight": "encoder.layer.2.attention.self.query.weight",
60
- "encoderLayer.2.multiHeadAttention.q.bias": "encoder.layer.2.attention.self.query.bias",
61
- "encoderLayer.2.multiHeadAttention.k.weight": "encoder.layer.2.attention.self.key.weight",
62
- "encoderLayer.2.multiHeadAttention.k.bias": "encoder.layer.2.attention.self.key.bias",
63
- "encoderLayer.2.multiHeadAttention.v.weight": "encoder.layer.2.attention.self.value.weight",
64
- "encoderLayer.2.multiHeadAttention.v.bias": "encoder.layer.2.attention.self.value.bias",
65
- "encoderLayer.2.multiHeadAttention.o.weight": "encoder.layer.2.attention.output.dense.weight",
66
- "encoderLayer.2.multiHeadAttention.o.bias": "encoder.layer.2.attention.output.dense.bias",
67
- "encoderLayer.2.attnLayerNorm.weight": "encoder.layer.2.attention.output.LayerNorm.weight",
68
- "encoderLayer.2.attnLayerNorm.bias": "encoder.layer.2.attention.output.LayerNorm.bias",
69
- "encoderLayer.2.feedForward.intermediateDense.weight": "encoder.layer.2.intermediate.dense.weight",
70
- "encoderLayer.2.feedForward.intermediateDense.bias": "encoder.layer.2.intermediate.dense.bias",
71
- "encoderLayer.2.feedForward.outputDense.weight": "encoder.layer.2.output.dense.weight",
72
- "encoderLayer.2.feedForward.outputDense.bias": "encoder.layer.2.output.dense.bias",
73
- "encoderLayer.2.ffnLayerNorm.weight": "encoder.layer.2.output.LayerNorm.weight",
74
- "encoderLayer.2.ffnLayerNorm.bias": "encoder.layer.2.output.LayerNorm.bias",
75
- "encoderLayer.3.multiHeadAttention.q.weight": "encoder.layer.3.attention.self.query.weight",
76
- "encoderLayer.3.multiHeadAttention.q.bias": "encoder.layer.3.attention.self.query.bias",
77
- "encoderLayer.3.multiHeadAttention.k.weight": "encoder.layer.3.attention.self.key.weight",
78
- "encoderLayer.3.multiHeadAttention.k.bias": "encoder.layer.3.attention.self.key.bias",
79
- "encoderLayer.3.multiHeadAttention.v.weight": "encoder.layer.3.attention.self.value.weight",
80
- "encoderLayer.3.multiHeadAttention.v.bias": "encoder.layer.3.attention.self.value.bias",
81
- "encoderLayer.3.multiHeadAttention.o.weight": "encoder.layer.3.attention.output.dense.weight",
82
- "encoderLayer.3.multiHeadAttention.o.bias": "encoder.layer.3.attention.output.dense.bias",
83
- "encoderLayer.3.attnLayerNorm.weight": "encoder.layer.3.attention.output.LayerNorm.weight",
84
- "encoderLayer.3.attnLayerNorm.bias": "encoder.layer.3.attention.output.LayerNorm.bias",
85
- "encoderLayer.3.feedForward.intermediateDense.weight": "encoder.layer.3.intermediate.dense.weight",
86
- "encoderLayer.3.feedForward.intermediateDense.bias": "encoder.layer.3.intermediate.dense.bias",
87
- "encoderLayer.3.feedForward.outputDense.weight": "encoder.layer.3.output.dense.weight",
88
- "encoderLayer.3.feedForward.outputDense.bias": "encoder.layer.3.output.dense.bias",
89
- "encoderLayer.3.ffnLayerNorm.weight": "encoder.layer.3.output.LayerNorm.weight",
90
- "encoderLayer.3.ffnLayerNorm.bias": "encoder.layer.3.output.LayerNorm.bias",
91
- "encoderLayer.4.multiHeadAttention.q.weight": "encoder.layer.4.attention.self.query.weight",
92
- "encoderLayer.4.multiHeadAttention.q.bias": "encoder.layer.4.attention.self.query.bias",
93
- "encoderLayer.4.multiHeadAttention.k.weight": "encoder.layer.4.attention.self.key.weight",
94
- "encoderLayer.4.multiHeadAttention.k.bias": "encoder.layer.4.attention.self.key.bias",
95
- "encoderLayer.4.multiHeadAttention.v.weight": "encoder.layer.4.attention.self.value.weight",
96
- "encoderLayer.4.multiHeadAttention.v.bias": "encoder.layer.4.attention.self.value.bias",
97
- "encoderLayer.4.multiHeadAttention.o.weight": "encoder.layer.4.attention.output.dense.weight",
98
- "encoderLayer.4.multiHeadAttention.o.bias": "encoder.layer.4.attention.output.dense.bias",
99
- "encoderLayer.4.attnLayerNorm.weight": "encoder.layer.4.attention.output.LayerNorm.weight",
100
- "encoderLayer.4.attnLayerNorm.bias": "encoder.layer.4.attention.output.LayerNorm.bias",
101
- "encoderLayer.4.feedForward.intermediateDense.weight": "encoder.layer.4.intermediate.dense.weight",
102
- "encoderLayer.4.feedForward.intermediateDense.bias": "encoder.layer.4.intermediate.dense.bias",
103
- "encoderLayer.4.feedForward.outputDense.weight": "encoder.layer.4.output.dense.weight",
104
- "encoderLayer.4.feedForward.outputDense.bias": "encoder.layer.4.output.dense.bias",
105
- "encoderLayer.4.ffnLayerNorm.weight": "encoder.layer.4.output.LayerNorm.weight",
106
- "encoderLayer.4.ffnLayerNorm.bias": "encoder.layer.4.output.LayerNorm.bias",
107
- "encoderLayer.5.multiHeadAttention.q.weight": "encoder.layer.5.attention.self.query.weight",
108
- "encoderLayer.5.multiHeadAttention.q.bias": "encoder.layer.5.attention.self.query.bias",
109
- "encoderLayer.5.multiHeadAttention.k.weight": "encoder.layer.5.attention.self.key.weight",
110
- "encoderLayer.5.multiHeadAttention.k.bias": "encoder.layer.5.attention.self.key.bias",
111
- "encoderLayer.5.multiHeadAttention.v.weight": "encoder.layer.5.attention.self.value.weight",
112
- "encoderLayer.5.multiHeadAttention.v.bias": "encoder.layer.5.attention.self.value.bias",
113
- "encoderLayer.5.multiHeadAttention.o.weight": "encoder.layer.5.attention.output.dense.weight",
114
- "encoderLayer.5.multiHeadAttention.o.bias": "encoder.layer.5.attention.output.dense.bias",
115
- "encoderLayer.5.attnLayerNorm.weight": "encoder.layer.5.attention.output.LayerNorm.weight",
116
- "encoderLayer.5.attnLayerNorm.bias": "encoder.layer.5.attention.output.LayerNorm.bias",
117
- "encoderLayer.5.feedForward.intermediateDense.weight": "encoder.layer.5.intermediate.dense.weight",
118
- "encoderLayer.5.feedForward.intermediateDense.bias": "encoder.layer.5.intermediate.dense.bias",
119
- "encoderLayer.5.feedForward.outputDense.weight": "encoder.layer.5.output.dense.weight",
120
- "encoderLayer.5.feedForward.outputDense.bias": "encoder.layer.5.output.dense.bias",
121
- "encoderLayer.5.ffnLayerNorm.weight": "encoder.layer.5.output.LayerNorm.weight",
122
- "encoderLayer.5.ffnLayerNorm.bias": "encoder.layer.5.output.LayerNorm.bias",
123
- "encoderLayer.6.multiHeadAttention.q.weight": "encoder.layer.6.attention.self.query.weight",
124
- "encoderLayer.6.multiHeadAttention.q.bias": "encoder.layer.6.attention.self.query.bias",
125
- "encoderLayer.6.multiHeadAttention.k.weight": "encoder.layer.6.attention.self.key.weight",
126
- "encoderLayer.6.multiHeadAttention.k.bias": "encoder.layer.6.attention.self.key.bias",
127
- "encoderLayer.6.multiHeadAttention.v.weight": "encoder.layer.6.attention.self.value.weight",
128
- "encoderLayer.6.multiHeadAttention.v.bias": "encoder.layer.6.attention.self.value.bias",
129
- "encoderLayer.6.multiHeadAttention.o.weight": "encoder.layer.6.attention.output.dense.weight",
130
- "encoderLayer.6.multiHeadAttention.o.bias": "encoder.layer.6.attention.output.dense.bias",
131
- "encoderLayer.6.attnLayerNorm.weight": "encoder.layer.6.attention.output.LayerNorm.weight",
132
- "encoderLayer.6.attnLayerNorm.bias": "encoder.layer.6.attention.output.LayerNorm.bias",
133
- "encoderLayer.6.feedForward.intermediateDense.weight": "encoder.layer.6.intermediate.dense.weight",
134
- "encoderLayer.6.feedForward.intermediateDense.bias": "encoder.layer.6.intermediate.dense.bias",
135
- "encoderLayer.6.feedForward.outputDense.weight": "encoder.layer.6.output.dense.weight",
136
- "encoderLayer.6.feedForward.outputDense.bias": "encoder.layer.6.output.dense.bias",
137
- "encoderLayer.6.ffnLayerNorm.weight": "encoder.layer.6.output.LayerNorm.weight",
138
- "encoderLayer.6.ffnLayerNorm.bias": "encoder.layer.6.output.LayerNorm.bias",
139
- "encoderLayer.7.multiHeadAttention.q.weight": "encoder.layer.7.attention.self.query.weight",
140
- "encoderLayer.7.multiHeadAttention.q.bias": "encoder.layer.7.attention.self.query.bias",
141
- "encoderLayer.7.multiHeadAttention.k.weight": "encoder.layer.7.attention.self.key.weight",
142
- "encoderLayer.7.multiHeadAttention.k.bias": "encoder.layer.7.attention.self.key.bias",
143
- "encoderLayer.7.multiHeadAttention.v.weight": "encoder.layer.7.attention.self.value.weight",
144
- "encoderLayer.7.multiHeadAttention.v.bias": "encoder.layer.7.attention.self.value.bias",
145
- "encoderLayer.7.multiHeadAttention.o.weight": "encoder.layer.7.attention.output.dense.weight",
146
- "encoderLayer.7.multiHeadAttention.o.bias": "encoder.layer.7.attention.output.dense.bias",
147
- "encoderLayer.7.attnLayerNorm.weight": "encoder.layer.7.attention.output.LayerNorm.weight",
148
- "encoderLayer.7.attnLayerNorm.bias": "encoder.layer.7.attention.output.LayerNorm.bias",
149
- "encoderLayer.7.feedForward.intermediateDense.weight": "encoder.layer.7.intermediate.dense.weight",
150
- "encoderLayer.7.feedForward.intermediateDense.bias": "encoder.layer.7.intermediate.dense.bias",
151
- "encoderLayer.7.feedForward.outputDense.weight": "encoder.layer.7.output.dense.weight",
152
- "encoderLayer.7.feedForward.outputDense.bias": "encoder.layer.7.output.dense.bias",
153
- "encoderLayer.7.ffnLayerNorm.weight": "encoder.layer.7.output.LayerNorm.weight",
154
- "encoderLayer.7.ffnLayerNorm.bias": "encoder.layer.7.output.LayerNorm.bias",
155
- "encoderLayer.8.multiHeadAttention.q.weight": "encoder.layer.8.attention.self.query.weight",
156
- "encoderLayer.8.multiHeadAttention.q.bias": "encoder.layer.8.attention.self.query.bias",
157
- "encoderLayer.8.multiHeadAttention.k.weight": "encoder.layer.8.attention.self.key.weight",
158
- "encoderLayer.8.multiHeadAttention.k.bias": "encoder.layer.8.attention.self.key.bias",
159
- "encoderLayer.8.multiHeadAttention.v.weight": "encoder.layer.8.attention.self.value.weight",
160
- "encoderLayer.8.multiHeadAttention.v.bias": "encoder.layer.8.attention.self.value.bias",
161
- "encoderLayer.8.multiHeadAttention.o.weight": "encoder.layer.8.attention.output.dense.weight",
162
- "encoderLayer.8.multiHeadAttention.o.bias": "encoder.layer.8.attention.output.dense.bias",
163
- "encoderLayer.8.attnLayerNorm.weight": "encoder.layer.8.attention.output.LayerNorm.weight",
164
- "encoderLayer.8.attnLayerNorm.bias": "encoder.layer.8.attention.output.LayerNorm.bias",
165
- "encoderLayer.8.feedForward.intermediateDense.weight": "encoder.layer.8.intermediate.dense.weight",
166
- "encoderLayer.8.feedForward.intermediateDense.bias": "encoder.layer.8.intermediate.dense.bias",
167
- "encoderLayer.8.feedForward.outputDense.weight": "encoder.layer.8.output.dense.weight",
168
- "encoderLayer.8.feedForward.outputDense.bias": "encoder.layer.8.output.dense.bias",
169
- "encoderLayer.8.ffnLayerNorm.weight": "encoder.layer.8.output.LayerNorm.weight",
170
- "encoderLayer.8.ffnLayerNorm.bias": "encoder.layer.8.output.LayerNorm.bias",
171
- "encoderLayer.9.multiHeadAttention.q.weight": "encoder.layer.9.attention.self.query.weight",
172
- "encoderLayer.9.multiHeadAttention.q.bias": "encoder.layer.9.attention.self.query.bias",
173
- "encoderLayer.9.multiHeadAttention.k.weight": "encoder.layer.9.attention.self.key.weight",
174
- "encoderLayer.9.multiHeadAttention.k.bias": "encoder.layer.9.attention.self.key.bias",
175
- "encoderLayer.9.multiHeadAttention.v.weight": "encoder.layer.9.attention.self.value.weight",
176
- "encoderLayer.9.multiHeadAttention.v.bias": "encoder.layer.9.attention.self.value.bias",
177
- "encoderLayer.9.multiHeadAttention.o.weight": "encoder.layer.9.attention.output.dense.weight",
178
- "encoderLayer.9.multiHeadAttention.o.bias": "encoder.layer.9.attention.output.dense.bias",
179
- "encoderLayer.9.attnLayerNorm.weight": "encoder.layer.9.attention.output.LayerNorm.weight",
180
- "encoderLayer.9.attnLayerNorm.bias": "encoder.layer.9.attention.output.LayerNorm.bias",
181
- "encoderLayer.9.feedForward.intermediateDense.weight": "encoder.layer.9.intermediate.dense.weight",
182
- "encoderLayer.9.feedForward.intermediateDense.bias": "encoder.layer.9.intermediate.dense.bias",
183
- "encoderLayer.9.feedForward.outputDense.weight": "encoder.layer.9.output.dense.weight",
184
- "encoderLayer.9.feedForward.outputDense.bias": "encoder.layer.9.output.dense.bias",
185
- "encoderLayer.9.ffnLayerNorm.weight": "encoder.layer.9.output.LayerNorm.weight",
186
- "encoderLayer.9.ffnLayerNorm.bias": "encoder.layer.9.output.LayerNorm.bias",
187
- "encoderLayer.10.multiHeadAttention.q.weight": "encoder.layer.10.attention.self.query.weight",
188
- "encoderLayer.10.multiHeadAttention.q.bias": "encoder.layer.10.attention.self.query.bias",
189
- "encoderLayer.10.multiHeadAttention.k.weight": "encoder.layer.10.attention.self.key.weight",
190
- "encoderLayer.10.multiHeadAttention.k.bias": "encoder.layer.10.attention.self.key.bias",
191
- "encoderLayer.10.multiHeadAttention.v.weight": "encoder.layer.10.attention.self.value.weight",
192
- "encoderLayer.10.multiHeadAttention.v.bias": "encoder.layer.10.attention.self.value.bias",
193
- "encoderLayer.10.multiHeadAttention.o.weight": "encoder.layer.10.attention.output.dense.weight",
194
- "encoderLayer.10.multiHeadAttention.o.bias": "encoder.layer.10.attention.output.dense.bias",
195
- "encoderLayer.10.attnLayerNorm.weight": "encoder.layer.10.attention.output.LayerNorm.weight",
196
- "encoderLayer.10.attnLayerNorm.bias": "encoder.layer.10.attention.output.LayerNorm.bias",
197
- "encoderLayer.10.feedForward.intermediateDense.weight": "encoder.layer.10.intermediate.dense.weight",
198
- "encoderLayer.10.feedForward.intermediateDense.bias": "encoder.layer.10.intermediate.dense.bias",
199
- "encoderLayer.10.feedForward.outputDense.weight": "encoder.layer.10.output.dense.weight",
200
- "encoderLayer.10.feedForward.outputDense.bias": "encoder.layer.10.output.dense.bias",
201
- "encoderLayer.10.ffnLayerNorm.weight": "encoder.layer.10.output.LayerNorm.weight",
202
- "encoderLayer.10.ffnLayerNorm.bias": "encoder.layer.10.output.LayerNorm.bias",
203
- "encoderLayer.11.multiHeadAttention.q.weight": "encoder.layer.11.attention.self.query.weight",
204
- "encoderLayer.11.multiHeadAttention.q.bias": "encoder.layer.11.attention.self.query.bias",
205
- "encoderLayer.11.multiHeadAttention.k.weight": "encoder.layer.11.attention.self.key.weight",
206
- "encoderLayer.11.multiHeadAttention.k.bias": "encoder.layer.11.attention.self.key.bias",
207
- "encoderLayer.11.multiHeadAttention.v.weight": "encoder.layer.11.attention.self.value.weight",
208
- "encoderLayer.11.multiHeadAttention.v.bias": "encoder.layer.11.attention.self.value.bias",
209
- "encoderLayer.11.multiHeadAttention.o.weight": "encoder.layer.11.attention.output.dense.weight",
210
- "encoderLayer.11.multiHeadAttention.o.bias": "encoder.layer.11.attention.output.dense.bias",
211
- "encoderLayer.11.attnLayerNorm.weight": "encoder.layer.11.attention.output.LayerNorm.weight",
212
- "encoderLayer.11.attnLayerNorm.bias": "encoder.layer.11.attention.output.LayerNorm.bias",
213
- "encoderLayer.11.feedForward.intermediateDense.weight": "encoder.layer.11.intermediate.dense.weight",
214
- "encoderLayer.11.feedForward.intermediateDense.bias": "encoder.layer.11.intermediate.dense.bias",
215
- "encoderLayer.11.feedForward.outputDense.weight": "encoder.layer.11.output.dense.weight",
216
- "encoderLayer.11.feedForward.outputDense.bias": "encoder.layer.11.output.dense.bias",
217
- "encoderLayer.11.ffnLayerNorm.weight": "encoder.layer.11.output.LayerNorm.weight",
218
- "encoderLayer.11.ffnLayerNorm.bias": "encoder.layer.11.output.LayerNorm.bias",
219
- "encoderLayer.12.multiHeadAttention.q.weight": "encoder.layer.12.attention.self.query.weight",
220
- "encoderLayer.12.multiHeadAttention.q.bias": "encoder.layer.12.attention.self.query.bias",
221
- "encoderLayer.12.multiHeadAttention.k.weight": "encoder.layer.12.attention.self.key.weight",
222
- "encoderLayer.12.multiHeadAttention.k.bias": "encoder.layer.12.attention.self.key.bias",
223
- "encoderLayer.12.multiHeadAttention.v.weight": "encoder.layer.12.attention.self.value.weight",
224
- "encoderLayer.12.multiHeadAttention.v.bias": "encoder.layer.12.attention.self.value.bias",
225
- "encoderLayer.12.multiHeadAttention.o.weight": "encoder.layer.12.attention.output.dense.weight",
226
- "encoderLayer.12.multiHeadAttention.o.bias": "encoder.layer.12.attention.output.dense.bias",
227
- "encoderLayer.12.attnLayerNorm.weight": "encoder.layer.12.attention.output.LayerNorm.weight",
228
- "encoderLayer.12.attnLayerNorm.bias": "encoder.layer.12.attention.output.LayerNorm.bias",
229
- "encoderLayer.12.feedForward.intermediateDense.weight": "encoder.layer.12.intermediate.dense.weight",
230
- "encoderLayer.12.feedForward.intermediateDense.bias": "encoder.layer.12.intermediate.dense.bias",
231
- "encoderLayer.12.feedForward.outputDense.weight": "encoder.layer.12.output.dense.weight",
232
- "encoderLayer.12.feedForward.outputDense.bias": "encoder.layer.12.output.dense.bias",
233
- "encoderLayer.12.ffnLayerNorm.weight": "encoder.layer.12.output.LayerNorm.weight",
234
- "encoderLayer.12.ffnLayerNorm.bias": "encoder.layer.12.output.LayerNorm.bias",
235
- "encoderLayer.13.multiHeadAttention.q.weight": "encoder.layer.13.attention.self.query.weight",
236
- "encoderLayer.13.multiHeadAttention.q.bias": "encoder.layer.13.attention.self.query.bias",
237
- "encoderLayer.13.multiHeadAttention.k.weight": "encoder.layer.13.attention.self.key.weight",
238
- "encoderLayer.13.multiHeadAttention.k.bias": "encoder.layer.13.attention.self.key.bias",
239
- "encoderLayer.13.multiHeadAttention.v.weight": "encoder.layer.13.attention.self.value.weight",
240
- "encoderLayer.13.multiHeadAttention.v.bias": "encoder.layer.13.attention.self.value.bias",
241
- "encoderLayer.13.multiHeadAttention.o.weight": "encoder.layer.13.attention.output.dense.weight",
242
- "encoderLayer.13.multiHeadAttention.o.bias": "encoder.layer.13.attention.output.dense.bias",
243
- "encoderLayer.13.attnLayerNorm.weight": "encoder.layer.13.attention.output.LayerNorm.weight",
244
- "encoderLayer.13.attnLayerNorm.bias": "encoder.layer.13.attention.output.LayerNorm.bias",
245
- "encoderLayer.13.feedForward.intermediateDense.weight": "encoder.layer.13.intermediate.dense.weight",
246
- "encoderLayer.13.feedForward.intermediateDense.bias": "encoder.layer.13.intermediate.dense.bias",
247
- "encoderLayer.13.feedForward.outputDense.weight": "encoder.layer.13.output.dense.weight",
248
- "encoderLayer.13.feedForward.outputDense.bias": "encoder.layer.13.output.dense.bias",
249
- "encoderLayer.13.ffnLayerNorm.weight": "encoder.layer.13.output.LayerNorm.weight",
250
- "encoderLayer.13.ffnLayerNorm.bias": "encoder.layer.13.output.LayerNorm.bias",
251
- "encoderLayer.14.multiHeadAttention.q.weight": "encoder.layer.14.attention.self.query.weight",
252
- "encoderLayer.14.multiHeadAttention.q.bias": "encoder.layer.14.attention.self.query.bias",
253
- "encoderLayer.14.multiHeadAttention.k.weight": "encoder.layer.14.attention.self.key.weight",
254
- "encoderLayer.14.multiHeadAttention.k.bias": "encoder.layer.14.attention.self.key.bias",
255
- "encoderLayer.14.multiHeadAttention.v.weight": "encoder.layer.14.attention.self.value.weight",
256
- "encoderLayer.14.multiHeadAttention.v.bias": "encoder.layer.14.attention.self.value.bias",
257
- "encoderLayer.14.multiHeadAttention.o.weight": "encoder.layer.14.attention.output.dense.weight",
258
- "encoderLayer.14.multiHeadAttention.o.bias": "encoder.layer.14.attention.output.dense.bias",
259
- "encoderLayer.14.attnLayerNorm.weight": "encoder.layer.14.attention.output.LayerNorm.weight",
260
- "encoderLayer.14.attnLayerNorm.bias": "encoder.layer.14.attention.output.LayerNorm.bias",
261
- "encoderLayer.14.feedForward.intermediateDense.weight": "encoder.layer.14.intermediate.dense.weight",
262
- "encoderLayer.14.feedForward.intermediateDense.bias": "encoder.layer.14.intermediate.dense.bias",
263
- "encoderLayer.14.feedForward.outputDense.weight": "encoder.layer.14.output.dense.weight",
264
- "encoderLayer.14.feedForward.outputDense.bias": "encoder.layer.14.output.dense.bias",
265
- "encoderLayer.14.ffnLayerNorm.weight": "encoder.layer.14.output.LayerNorm.weight",
266
- "encoderLayer.14.ffnLayerNorm.bias": "encoder.layer.14.output.LayerNorm.bias",
267
- "encoderLayer.15.multiHeadAttention.q.weight": "encoder.layer.15.attention.self.query.weight",
268
- "encoderLayer.15.multiHeadAttention.q.bias": "encoder.layer.15.attention.self.query.bias",
269
- "encoderLayer.15.multiHeadAttention.k.weight": "encoder.layer.15.attention.self.key.weight",
270
- "encoderLayer.15.multiHeadAttention.k.bias": "encoder.layer.15.attention.self.key.bias",
271
- "encoderLayer.15.multiHeadAttention.v.weight": "encoder.layer.15.attention.self.value.weight",
272
- "encoderLayer.15.multiHeadAttention.v.bias": "encoder.layer.15.attention.self.value.bias",
273
- "encoderLayer.15.multiHeadAttention.o.weight": "encoder.layer.15.attention.output.dense.weight",
274
- "encoderLayer.15.multiHeadAttention.o.bias": "encoder.layer.15.attention.output.dense.bias",
275
- "encoderLayer.15.attnLayerNorm.weight": "encoder.layer.15.attention.output.LayerNorm.weight",
276
- "encoderLayer.15.attnLayerNorm.bias": "encoder.layer.15.attention.output.LayerNorm.bias",
277
- "encoderLayer.15.feedForward.intermediateDense.weight": "encoder.layer.15.intermediate.dense.weight",
278
- "encoderLayer.15.feedForward.intermediateDense.bias": "encoder.layer.15.intermediate.dense.bias",
279
- "encoderLayer.15.feedForward.outputDense.weight": "encoder.layer.15.output.dense.weight",
280
- "encoderLayer.15.feedForward.outputDense.bias": "encoder.layer.15.output.dense.bias",
281
- "encoderLayer.15.ffnLayerNorm.weight": "encoder.layer.15.output.LayerNorm.weight",
282
- "encoderLayer.15.ffnLayerNorm.bias": "encoder.layer.15.output.LayerNorm.bias",
283
- "encoderLayer.16.multiHeadAttention.q.weight": "encoder.layer.16.attention.self.query.weight",
284
- "encoderLayer.16.multiHeadAttention.q.bias": "encoder.layer.16.attention.self.query.bias",
285
- "encoderLayer.16.multiHeadAttention.k.weight": "encoder.layer.16.attention.self.key.weight",
286
- "encoderLayer.16.multiHeadAttention.k.bias": "encoder.layer.16.attention.self.key.bias",
287
- "encoderLayer.16.multiHeadAttention.v.weight": "encoder.layer.16.attention.self.value.weight",
288
- "encoderLayer.16.multiHeadAttention.v.bias": "encoder.layer.16.attention.self.value.bias",
289
- "encoderLayer.16.multiHeadAttention.o.weight": "encoder.layer.16.attention.output.dense.weight",
290
- "encoderLayer.16.multiHeadAttention.o.bias": "encoder.layer.16.attention.output.dense.bias",
291
- "encoderLayer.16.attnLayerNorm.weight": "encoder.layer.16.attention.output.LayerNorm.weight",
292
- "encoderLayer.16.attnLayerNorm.bias": "encoder.layer.16.attention.output.LayerNorm.bias",
293
- "encoderLayer.16.feedForward.intermediateDense.weight": "encoder.layer.16.intermediate.dense.weight",
294
- "encoderLayer.16.feedForward.intermediateDense.bias": "encoder.layer.16.intermediate.dense.bias",
295
- "encoderLayer.16.feedForward.outputDense.weight": "encoder.layer.16.output.dense.weight",
296
- "encoderLayer.16.feedForward.outputDense.bias": "encoder.layer.16.output.dense.bias",
297
- "encoderLayer.16.ffnLayerNorm.weight": "encoder.layer.16.output.LayerNorm.weight",
298
- "encoderLayer.16.ffnLayerNorm.bias": "encoder.layer.16.output.LayerNorm.bias",
299
- "encoderLayer.17.multiHeadAttention.q.weight": "encoder.layer.17.attention.self.query.weight",
300
- "encoderLayer.17.multiHeadAttention.q.bias": "encoder.layer.17.attention.self.query.bias",
301
- "encoderLayer.17.multiHeadAttention.k.weight": "encoder.layer.17.attention.self.key.weight",
302
- "encoderLayer.17.multiHeadAttention.k.bias": "encoder.layer.17.attention.self.key.bias",
303
- "encoderLayer.17.multiHeadAttention.v.weight": "encoder.layer.17.attention.self.value.weight",
304
- "encoderLayer.17.multiHeadAttention.v.bias": "encoder.layer.17.attention.self.value.bias",
305
- "encoderLayer.17.multiHeadAttention.o.weight": "encoder.layer.17.attention.output.dense.weight",
306
- "encoderLayer.17.multiHeadAttention.o.bias": "encoder.layer.17.attention.output.dense.bias",
307
- "encoderLayer.17.attnLayerNorm.weight": "encoder.layer.17.attention.output.LayerNorm.weight",
308
- "encoderLayer.17.attnLayerNorm.bias": "encoder.layer.17.attention.output.LayerNorm.bias",
309
- "encoderLayer.17.feedForward.intermediateDense.weight": "encoder.layer.17.intermediate.dense.weight",
310
- "encoderLayer.17.feedForward.intermediateDense.bias": "encoder.layer.17.intermediate.dense.bias",
311
- "encoderLayer.17.feedForward.outputDense.weight": "encoder.layer.17.output.dense.weight",
312
- "encoderLayer.17.feedForward.outputDense.bias": "encoder.layer.17.output.dense.bias",
313
- "encoderLayer.17.ffnLayerNorm.weight": "encoder.layer.17.output.LayerNorm.weight",
314
- "encoderLayer.17.ffnLayerNorm.bias": "encoder.layer.17.output.LayerNorm.bias",
315
- "encoderLayer.18.multiHeadAttention.q.weight": "encoder.layer.18.attention.self.query.weight",
316
- "encoderLayer.18.multiHeadAttention.q.bias": "encoder.layer.18.attention.self.query.bias",
317
- "encoderLayer.18.multiHeadAttention.k.weight": "encoder.layer.18.attention.self.key.weight",
318
- "encoderLayer.18.multiHeadAttention.k.bias": "encoder.layer.18.attention.self.key.bias",
319
- "encoderLayer.18.multiHeadAttention.v.weight": "encoder.layer.18.attention.self.value.weight",
320
- "encoderLayer.18.multiHeadAttention.v.bias": "encoder.layer.18.attention.self.value.bias",
321
- "encoderLayer.18.multiHeadAttention.o.weight": "encoder.layer.18.attention.output.dense.weight",
322
- "encoderLayer.18.multiHeadAttention.o.bias": "encoder.layer.18.attention.output.dense.bias",
323
- "encoderLayer.18.attnLayerNorm.weight": "encoder.layer.18.attention.output.LayerNorm.weight",
324
- "encoderLayer.18.attnLayerNorm.bias": "encoder.layer.18.attention.output.LayerNorm.bias",
325
- "encoderLayer.18.feedForward.intermediateDense.weight": "encoder.layer.18.intermediate.dense.weight",
326
- "encoderLayer.18.feedForward.intermediateDense.bias": "encoder.layer.18.intermediate.dense.bias",
327
- "encoderLayer.18.feedForward.outputDense.weight": "encoder.layer.18.output.dense.weight",
328
- "encoderLayer.18.feedForward.outputDense.bias": "encoder.layer.18.output.dense.bias",
329
- "encoderLayer.18.ffnLayerNorm.weight": "encoder.layer.18.output.LayerNorm.weight",
330
- "encoderLayer.18.ffnLayerNorm.bias": "encoder.layer.18.output.LayerNorm.bias",
331
- "encoderLayer.19.multiHeadAttention.q.weight": "encoder.layer.19.attention.self.query.weight",
332
- "encoderLayer.19.multiHeadAttention.q.bias": "encoder.layer.19.attention.self.query.bias",
333
- "encoderLayer.19.multiHeadAttention.k.weight": "encoder.layer.19.attention.self.key.weight",
334
- "encoderLayer.19.multiHeadAttention.k.bias": "encoder.layer.19.attention.self.key.bias",
335
- "encoderLayer.19.multiHeadAttention.v.weight": "encoder.layer.19.attention.self.value.weight",
336
- "encoderLayer.19.multiHeadAttention.v.bias": "encoder.layer.19.attention.self.value.bias",
337
- "encoderLayer.19.multiHeadAttention.o.weight": "encoder.layer.19.attention.output.dense.weight",
338
- "encoderLayer.19.multiHeadAttention.o.bias": "encoder.layer.19.attention.output.dense.bias",
339
- "encoderLayer.19.attnLayerNorm.weight": "encoder.layer.19.attention.output.LayerNorm.weight",
340
- "encoderLayer.19.attnLayerNorm.bias": "encoder.layer.19.attention.output.LayerNorm.bias",
341
- "encoderLayer.19.feedForward.intermediateDense.weight": "encoder.layer.19.intermediate.dense.weight",
342
- "encoderLayer.19.feedForward.intermediateDense.bias": "encoder.layer.19.intermediate.dense.bias",
343
- "encoderLayer.19.feedForward.outputDense.weight": "encoder.layer.19.output.dense.weight",
344
- "encoderLayer.19.feedForward.outputDense.bias": "encoder.layer.19.output.dense.bias",
345
- "encoderLayer.19.ffnLayerNorm.weight": "encoder.layer.19.output.LayerNorm.weight",
346
- "encoderLayer.19.ffnLayerNorm.bias": "encoder.layer.19.output.LayerNorm.bias",
347
- "encoderLayer.20.multiHeadAttention.q.weight": "encoder.layer.20.attention.self.query.weight",
348
- "encoderLayer.20.multiHeadAttention.q.bias": "encoder.layer.20.attention.self.query.bias",
349
- "encoderLayer.20.multiHeadAttention.k.weight": "encoder.layer.20.attention.self.key.weight",
350
- "encoderLayer.20.multiHeadAttention.k.bias": "encoder.layer.20.attention.self.key.bias",
351
- "encoderLayer.20.multiHeadAttention.v.weight": "encoder.layer.20.attention.self.value.weight",
352
- "encoderLayer.20.multiHeadAttention.v.bias": "encoder.layer.20.attention.self.value.bias",
353
- "encoderLayer.20.multiHeadAttention.o.weight": "encoder.layer.20.attention.output.dense.weight",
354
- "encoderLayer.20.multiHeadAttention.o.bias": "encoder.layer.20.attention.output.dense.bias",
355
- "encoderLayer.20.attnLayerNorm.weight": "encoder.layer.20.attention.output.LayerNorm.weight",
356
- "encoderLayer.20.attnLayerNorm.bias": "encoder.layer.20.attention.output.LayerNorm.bias",
357
- "encoderLayer.20.feedForward.intermediateDense.weight": "encoder.layer.20.intermediate.dense.weight",
358
- "encoderLayer.20.feedForward.intermediateDense.bias": "encoder.layer.20.intermediate.dense.bias",
359
- "encoderLayer.20.feedForward.outputDense.weight": "encoder.layer.20.output.dense.weight",
360
- "encoderLayer.20.feedForward.outputDense.bias": "encoder.layer.20.output.dense.bias",
361
- "encoderLayer.20.ffnLayerNorm.weight": "encoder.layer.20.output.LayerNorm.weight",
362
- "encoderLayer.20.ffnLayerNorm.bias": "encoder.layer.20.output.LayerNorm.bias",
363
- "encoderLayer.21.multiHeadAttention.q.weight": "encoder.layer.21.attention.self.query.weight",
364
- "encoderLayer.21.multiHeadAttention.q.bias": "encoder.layer.21.attention.self.query.bias",
365
- "encoderLayer.21.multiHeadAttention.k.weight": "encoder.layer.21.attention.self.key.weight",
366
- "encoderLayer.21.multiHeadAttention.k.bias": "encoder.layer.21.attention.self.key.bias",
367
- "encoderLayer.21.multiHeadAttention.v.weight": "encoder.layer.21.attention.self.value.weight",
368
- "encoderLayer.21.multiHeadAttention.v.bias": "encoder.layer.21.attention.self.value.bias",
369
- "encoderLayer.21.multiHeadAttention.o.weight": "encoder.layer.21.attention.output.dense.weight",
370
- "encoderLayer.21.multiHeadAttention.o.bias": "encoder.layer.21.attention.output.dense.bias",
371
- "encoderLayer.21.attnLayerNorm.weight": "encoder.layer.21.attention.output.LayerNorm.weight",
372
- "encoderLayer.21.attnLayerNorm.bias": "encoder.layer.21.attention.output.LayerNorm.bias",
373
- "encoderLayer.21.feedForward.intermediateDense.weight": "encoder.layer.21.intermediate.dense.weight",
374
- "encoderLayer.21.feedForward.intermediateDense.bias": "encoder.layer.21.intermediate.dense.bias",
375
- "encoderLayer.21.feedForward.outputDense.weight": "encoder.layer.21.output.dense.weight",
376
- "encoderLayer.21.feedForward.outputDense.bias": "encoder.layer.21.output.dense.bias",
377
- "encoderLayer.21.ffnLayerNorm.weight": "encoder.layer.21.output.LayerNorm.weight",
378
- "encoderLayer.21.ffnLayerNorm.bias": "encoder.layer.21.output.LayerNorm.bias",
379
- "encoderLayer.22.multiHeadAttention.q.weight": "encoder.layer.22.attention.self.query.weight",
380
- "encoderLayer.22.multiHeadAttention.q.bias": "encoder.layer.22.attention.self.query.bias",
381
- "encoderLayer.22.multiHeadAttention.k.weight": "encoder.layer.22.attention.self.key.weight",
382
- "encoderLayer.22.multiHeadAttention.k.bias": "encoder.layer.22.attention.self.key.bias",
383
- "encoderLayer.22.multiHeadAttention.v.weight": "encoder.layer.22.attention.self.value.weight",
384
- "encoderLayer.22.multiHeadAttention.v.bias": "encoder.layer.22.attention.self.value.bias",
385
- "encoderLayer.22.multiHeadAttention.o.weight": "encoder.layer.22.attention.output.dense.weight",
386
- "encoderLayer.22.multiHeadAttention.o.bias": "encoder.layer.22.attention.output.dense.bias",
387
- "encoderLayer.22.attnLayerNorm.weight": "encoder.layer.22.attention.output.LayerNorm.weight",
388
- "encoderLayer.22.attnLayerNorm.bias": "encoder.layer.22.attention.output.LayerNorm.bias",
389
- "encoderLayer.22.feedForward.intermediateDense.weight": "encoder.layer.22.intermediate.dense.weight",
390
- "encoderLayer.22.feedForward.intermediateDense.bias": "encoder.layer.22.intermediate.dense.bias",
391
- "encoderLayer.22.feedForward.outputDense.weight": "encoder.layer.22.output.dense.weight",
392
- "encoderLayer.22.feedForward.outputDense.bias": "encoder.layer.22.output.dense.bias",
393
- "encoderLayer.22.ffnLayerNorm.weight": "encoder.layer.22.output.LayerNorm.weight",
394
- "encoderLayer.22.ffnLayerNorm.bias": "encoder.layer.22.output.LayerNorm.bias",
395
- "encoderLayer.23.multiHeadAttention.q.weight": "encoder.layer.23.attention.self.query.weight",
396
- "encoderLayer.23.multiHeadAttention.q.bias": "encoder.layer.23.attention.self.query.bias",
397
- "encoderLayer.23.multiHeadAttention.k.weight": "encoder.layer.23.attention.self.key.weight",
398
- "encoderLayer.23.multiHeadAttention.k.bias": "encoder.layer.23.attention.self.key.bias",
399
- "encoderLayer.23.multiHeadAttention.v.weight": "encoder.layer.23.attention.self.value.weight",
400
- "encoderLayer.23.multiHeadAttention.v.bias": "encoder.layer.23.attention.self.value.bias",
401
- "encoderLayer.23.multiHeadAttention.o.weight": "encoder.layer.23.attention.output.dense.weight",
402
- "encoderLayer.23.multiHeadAttention.o.bias": "encoder.layer.23.attention.output.dense.bias",
403
- "encoderLayer.23.attnLayerNorm.weight": "encoder.layer.23.attention.output.LayerNorm.weight",
404
- "encoderLayer.23.attnLayerNorm.bias": "encoder.layer.23.attention.output.LayerNorm.bias",
405
- "encoderLayer.23.feedForward.intermediateDense.weight": "encoder.layer.23.intermediate.dense.weight",
406
- "encoderLayer.23.feedForward.intermediateDense.bias": "encoder.layer.23.intermediate.dense.bias",
407
- "encoderLayer.23.feedForward.outputDense.weight": "encoder.layer.23.output.dense.weight",
408
- "encoderLayer.23.feedForward.outputDense.bias": "encoder.layer.23.output.dense.bias",
409
- "encoderLayer.23.ffnLayerNorm.weight": "encoder.layer.23.output.LayerNorm.weight",
410
- "encoderLayer.23.ffnLayerNorm.bias": "encoder.layer.23.output.LayerNorm.bias"
411
- }
412
- }
 
 
 
1
+ {
2
+ "attention_probs_dropout_prob": 0.1,
3
+ "hidden_act": "gelu",
4
+ "hidden_dropout_prob": 0.1,
5
+ "hidden_size": 1024,
6
+ "initializer_range": 0.02,
7
+ "intermediate_size": 4096,
8
+ "layer_norm_eps": 1e-12,
9
+ "max_position_embeddings": 512,
10
+ "model": "bert",
11
+ "num_attention_heads": 16,
12
+ "num_hidden_layers": 24,
13
+ "pad_token_id": 0,
14
+ "torch_dtype": "float32",
15
+ "type_vocab_size": 2,
16
+ "vocab_size": 30522,
17
+ "with_pool": true,
18
+ "pooling": {
19
+ "pool_strategy": "cls"
20
+ },
21
+ "mapping": {
22
+ "embeddings.word_embeddings.weight": "embeddings.word_embeddings.weight",
23
+ "embeddings.position_embeddings.weight": "embeddings.position_embeddings.weight",
24
+ "embeddings.segment_embeddings.weight": "embeddings.token_type_embeddings.weight",
25
+ "embeddings.layerNorm.weight": "embeddings.LayerNorm.weight",
26
+ "embeddings.layerNorm.bias": "embeddings.LayerNorm.bias",
27
+ "pooler.weight": "pooler.dense.weight",
28
+ "pooler.bias": "pooler.dense.bias",
29
+ "encoderLayer.0.multiHeadAttention.q.weight": "encoder.layer.0.attention.self.query.weight",
30
+ "encoderLayer.0.multiHeadAttention.q.bias": "encoder.layer.0.attention.self.query.bias",
31
+ "encoderLayer.0.multiHeadAttention.k.weight": "encoder.layer.0.attention.self.key.weight",
32
+ "encoderLayer.0.multiHeadAttention.k.bias": "encoder.layer.0.attention.self.key.bias",
33
+ "encoderLayer.0.multiHeadAttention.v.weight": "encoder.layer.0.attention.self.value.weight",
34
+ "encoderLayer.0.multiHeadAttention.v.bias": "encoder.layer.0.attention.self.value.bias",
35
+ "encoderLayer.0.multiHeadAttention.o.weight": "encoder.layer.0.attention.output.dense.weight",
36
+ "encoderLayer.0.multiHeadAttention.o.bias": "encoder.layer.0.attention.output.dense.bias",
37
+ "encoderLayer.0.attnLayerNorm.weight": "encoder.layer.0.attention.output.LayerNorm.weight",
38
+ "encoderLayer.0.attnLayerNorm.bias": "encoder.layer.0.attention.output.LayerNorm.bias",
39
+ "encoderLayer.0.feedForward.intermediateDense.weight": "encoder.layer.0.intermediate.dense.weight",
40
+ "encoderLayer.0.feedForward.intermediateDense.bias": "encoder.layer.0.intermediate.dense.bias",
41
+ "encoderLayer.0.feedForward.outputDense.weight": "encoder.layer.0.output.dense.weight",
42
+ "encoderLayer.0.feedForward.outputDense.bias": "encoder.layer.0.output.dense.bias",
43
+ "encoderLayer.0.ffnLayerNorm.weight": "encoder.layer.0.output.LayerNorm.weight",
44
+ "encoderLayer.0.ffnLayerNorm.bias": "encoder.layer.0.output.LayerNorm.bias",
45
+ "encoderLayer.1.multiHeadAttention.q.weight": "encoder.layer.1.attention.self.query.weight",
46
+ "encoderLayer.1.multiHeadAttention.q.bias": "encoder.layer.1.attention.self.query.bias",
47
+ "encoderLayer.1.multiHeadAttention.k.weight": "encoder.layer.1.attention.self.key.weight",
48
+ "encoderLayer.1.multiHeadAttention.k.bias": "encoder.layer.1.attention.self.key.bias",
49
+ "encoderLayer.1.multiHeadAttention.v.weight": "encoder.layer.1.attention.self.value.weight",
50
+ "encoderLayer.1.multiHeadAttention.v.bias": "encoder.layer.1.attention.self.value.bias",
51
+ "encoderLayer.1.multiHeadAttention.o.weight": "encoder.layer.1.attention.output.dense.weight",
52
+ "encoderLayer.1.multiHeadAttention.o.bias": "encoder.layer.1.attention.output.dense.bias",
53
+ "encoderLayer.1.attnLayerNorm.weight": "encoder.layer.1.attention.output.LayerNorm.weight",
54
+ "encoderLayer.1.attnLayerNorm.bias": "encoder.layer.1.attention.output.LayerNorm.bias",
55
+ "encoderLayer.1.feedForward.intermediateDense.weight": "encoder.layer.1.intermediate.dense.weight",
56
+ "encoderLayer.1.feedForward.intermediateDense.bias": "encoder.layer.1.intermediate.dense.bias",
57
+ "encoderLayer.1.feedForward.outputDense.weight": "encoder.layer.1.output.dense.weight",
58
+ "encoderLayer.1.feedForward.outputDense.bias": "encoder.layer.1.output.dense.bias",
59
+ "encoderLayer.1.ffnLayerNorm.weight": "encoder.layer.1.output.LayerNorm.weight",
60
+ "encoderLayer.1.ffnLayerNorm.bias": "encoder.layer.1.output.LayerNorm.bias",
61
+ "encoderLayer.2.multiHeadAttention.q.weight": "encoder.layer.2.attention.self.query.weight",
62
+ "encoderLayer.2.multiHeadAttention.q.bias": "encoder.layer.2.attention.self.query.bias",
63
+ "encoderLayer.2.multiHeadAttention.k.weight": "encoder.layer.2.attention.self.key.weight",
64
+ "encoderLayer.2.multiHeadAttention.k.bias": "encoder.layer.2.attention.self.key.bias",
65
+ "encoderLayer.2.multiHeadAttention.v.weight": "encoder.layer.2.attention.self.value.weight",
66
+ "encoderLayer.2.multiHeadAttention.v.bias": "encoder.layer.2.attention.self.value.bias",
67
+ "encoderLayer.2.multiHeadAttention.o.weight": "encoder.layer.2.attention.output.dense.weight",
68
+ "encoderLayer.2.multiHeadAttention.o.bias": "encoder.layer.2.attention.output.dense.bias",
69
+ "encoderLayer.2.attnLayerNorm.weight": "encoder.layer.2.attention.output.LayerNorm.weight",
70
+ "encoderLayer.2.attnLayerNorm.bias": "encoder.layer.2.attention.output.LayerNorm.bias",
71
+ "encoderLayer.2.feedForward.intermediateDense.weight": "encoder.layer.2.intermediate.dense.weight",
72
+ "encoderLayer.2.feedForward.intermediateDense.bias": "encoder.layer.2.intermediate.dense.bias",
73
+ "encoderLayer.2.feedForward.outputDense.weight": "encoder.layer.2.output.dense.weight",
74
+ "encoderLayer.2.feedForward.outputDense.bias": "encoder.layer.2.output.dense.bias",
75
+ "encoderLayer.2.ffnLayerNorm.weight": "encoder.layer.2.output.LayerNorm.weight",
76
+ "encoderLayer.2.ffnLayerNorm.bias": "encoder.layer.2.output.LayerNorm.bias",
77
+ "encoderLayer.3.multiHeadAttention.q.weight": "encoder.layer.3.attention.self.query.weight",
78
+ "encoderLayer.3.multiHeadAttention.q.bias": "encoder.layer.3.attention.self.query.bias",
79
+ "encoderLayer.3.multiHeadAttention.k.weight": "encoder.layer.3.attention.self.key.weight",
80
+ "encoderLayer.3.multiHeadAttention.k.bias": "encoder.layer.3.attention.self.key.bias",
81
+ "encoderLayer.3.multiHeadAttention.v.weight": "encoder.layer.3.attention.self.value.weight",
82
+ "encoderLayer.3.multiHeadAttention.v.bias": "encoder.layer.3.attention.self.value.bias",
83
+ "encoderLayer.3.multiHeadAttention.o.weight": "encoder.layer.3.attention.output.dense.weight",
84
+ "encoderLayer.3.multiHeadAttention.o.bias": "encoder.layer.3.attention.output.dense.bias",
85
+ "encoderLayer.3.attnLayerNorm.weight": "encoder.layer.3.attention.output.LayerNorm.weight",
86
+ "encoderLayer.3.attnLayerNorm.bias": "encoder.layer.3.attention.output.LayerNorm.bias",
87
+ "encoderLayer.3.feedForward.intermediateDense.weight": "encoder.layer.3.intermediate.dense.weight",
88
+ "encoderLayer.3.feedForward.intermediateDense.bias": "encoder.layer.3.intermediate.dense.bias",
89
+ "encoderLayer.3.feedForward.outputDense.weight": "encoder.layer.3.output.dense.weight",
90
+ "encoderLayer.3.feedForward.outputDense.bias": "encoder.layer.3.output.dense.bias",
91
+ "encoderLayer.3.ffnLayerNorm.weight": "encoder.layer.3.output.LayerNorm.weight",
92
+ "encoderLayer.3.ffnLayerNorm.bias": "encoder.layer.3.output.LayerNorm.bias",
93
+ "encoderLayer.4.multiHeadAttention.q.weight": "encoder.layer.4.attention.self.query.weight",
94
+ "encoderLayer.4.multiHeadAttention.q.bias": "encoder.layer.4.attention.self.query.bias",
95
+ "encoderLayer.4.multiHeadAttention.k.weight": "encoder.layer.4.attention.self.key.weight",
96
+ "encoderLayer.4.multiHeadAttention.k.bias": "encoder.layer.4.attention.self.key.bias",
97
+ "encoderLayer.4.multiHeadAttention.v.weight": "encoder.layer.4.attention.self.value.weight",
98
+ "encoderLayer.4.multiHeadAttention.v.bias": "encoder.layer.4.attention.self.value.bias",
99
+ "encoderLayer.4.multiHeadAttention.o.weight": "encoder.layer.4.attention.output.dense.weight",
100
+ "encoderLayer.4.multiHeadAttention.o.bias": "encoder.layer.4.attention.output.dense.bias",
101
+ "encoderLayer.4.attnLayerNorm.weight": "encoder.layer.4.attention.output.LayerNorm.weight",
102
+ "encoderLayer.4.attnLayerNorm.bias": "encoder.layer.4.attention.output.LayerNorm.bias",
103
+ "encoderLayer.4.feedForward.intermediateDense.weight": "encoder.layer.4.intermediate.dense.weight",
104
+ "encoderLayer.4.feedForward.intermediateDense.bias": "encoder.layer.4.intermediate.dense.bias",
105
+ "encoderLayer.4.feedForward.outputDense.weight": "encoder.layer.4.output.dense.weight",
106
+ "encoderLayer.4.feedForward.outputDense.bias": "encoder.layer.4.output.dense.bias",
107
+ "encoderLayer.4.ffnLayerNorm.weight": "encoder.layer.4.output.LayerNorm.weight",
108
+ "encoderLayer.4.ffnLayerNorm.bias": "encoder.layer.4.output.LayerNorm.bias",
109
+ "encoderLayer.5.multiHeadAttention.q.weight": "encoder.layer.5.attention.self.query.weight",
110
+ "encoderLayer.5.multiHeadAttention.q.bias": "encoder.layer.5.attention.self.query.bias",
111
+ "encoderLayer.5.multiHeadAttention.k.weight": "encoder.layer.5.attention.self.key.weight",
112
+ "encoderLayer.5.multiHeadAttention.k.bias": "encoder.layer.5.attention.self.key.bias",
113
+ "encoderLayer.5.multiHeadAttention.v.weight": "encoder.layer.5.attention.self.value.weight",
114
+ "encoderLayer.5.multiHeadAttention.v.bias": "encoder.layer.5.attention.self.value.bias",
115
+ "encoderLayer.5.multiHeadAttention.o.weight": "encoder.layer.5.attention.output.dense.weight",
116
+ "encoderLayer.5.multiHeadAttention.o.bias": "encoder.layer.5.attention.output.dense.bias",
117
+ "encoderLayer.5.attnLayerNorm.weight": "encoder.layer.5.attention.output.LayerNorm.weight",
118
+ "encoderLayer.5.attnLayerNorm.bias": "encoder.layer.5.attention.output.LayerNorm.bias",
119
+ "encoderLayer.5.feedForward.intermediateDense.weight": "encoder.layer.5.intermediate.dense.weight",
120
+ "encoderLayer.5.feedForward.intermediateDense.bias": "encoder.layer.5.intermediate.dense.bias",
121
+ "encoderLayer.5.feedForward.outputDense.weight": "encoder.layer.5.output.dense.weight",
122
+ "encoderLayer.5.feedForward.outputDense.bias": "encoder.layer.5.output.dense.bias",
123
+ "encoderLayer.5.ffnLayerNorm.weight": "encoder.layer.5.output.LayerNorm.weight",
124
+ "encoderLayer.5.ffnLayerNorm.bias": "encoder.layer.5.output.LayerNorm.bias",
125
+ "encoderLayer.6.multiHeadAttention.q.weight": "encoder.layer.6.attention.self.query.weight",
126
+ "encoderLayer.6.multiHeadAttention.q.bias": "encoder.layer.6.attention.self.query.bias",
127
+ "encoderLayer.6.multiHeadAttention.k.weight": "encoder.layer.6.attention.self.key.weight",
128
+ "encoderLayer.6.multiHeadAttention.k.bias": "encoder.layer.6.attention.self.key.bias",
129
+ "encoderLayer.6.multiHeadAttention.v.weight": "encoder.layer.6.attention.self.value.weight",
130
+ "encoderLayer.6.multiHeadAttention.v.bias": "encoder.layer.6.attention.self.value.bias",
131
+ "encoderLayer.6.multiHeadAttention.o.weight": "encoder.layer.6.attention.output.dense.weight",
132
+ "encoderLayer.6.multiHeadAttention.o.bias": "encoder.layer.6.attention.output.dense.bias",
133
+ "encoderLayer.6.attnLayerNorm.weight": "encoder.layer.6.attention.output.LayerNorm.weight",
134
+ "encoderLayer.6.attnLayerNorm.bias": "encoder.layer.6.attention.output.LayerNorm.bias",
135
+ "encoderLayer.6.feedForward.intermediateDense.weight": "encoder.layer.6.intermediate.dense.weight",
136
+ "encoderLayer.6.feedForward.intermediateDense.bias": "encoder.layer.6.intermediate.dense.bias",
137
+ "encoderLayer.6.feedForward.outputDense.weight": "encoder.layer.6.output.dense.weight",
138
+ "encoderLayer.6.feedForward.outputDense.bias": "encoder.layer.6.output.dense.bias",
139
+ "encoderLayer.6.ffnLayerNorm.weight": "encoder.layer.6.output.LayerNorm.weight",
140
+ "encoderLayer.6.ffnLayerNorm.bias": "encoder.layer.6.output.LayerNorm.bias",
141
+ "encoderLayer.7.multiHeadAttention.q.weight": "encoder.layer.7.attention.self.query.weight",
142
+ "encoderLayer.7.multiHeadAttention.q.bias": "encoder.layer.7.attention.self.query.bias",
143
+ "encoderLayer.7.multiHeadAttention.k.weight": "encoder.layer.7.attention.self.key.weight",
144
+ "encoderLayer.7.multiHeadAttention.k.bias": "encoder.layer.7.attention.self.key.bias",
145
+ "encoderLayer.7.multiHeadAttention.v.weight": "encoder.layer.7.attention.self.value.weight",
146
+ "encoderLayer.7.multiHeadAttention.v.bias": "encoder.layer.7.attention.self.value.bias",
147
+ "encoderLayer.7.multiHeadAttention.o.weight": "encoder.layer.7.attention.output.dense.weight",
148
+ "encoderLayer.7.multiHeadAttention.o.bias": "encoder.layer.7.attention.output.dense.bias",
149
+ "encoderLayer.7.attnLayerNorm.weight": "encoder.layer.7.attention.output.LayerNorm.weight",
150
+ "encoderLayer.7.attnLayerNorm.bias": "encoder.layer.7.attention.output.LayerNorm.bias",
151
+ "encoderLayer.7.feedForward.intermediateDense.weight": "encoder.layer.7.intermediate.dense.weight",
152
+ "encoderLayer.7.feedForward.intermediateDense.bias": "encoder.layer.7.intermediate.dense.bias",
153
+ "encoderLayer.7.feedForward.outputDense.weight": "encoder.layer.7.output.dense.weight",
154
+ "encoderLayer.7.feedForward.outputDense.bias": "encoder.layer.7.output.dense.bias",
155
+ "encoderLayer.7.ffnLayerNorm.weight": "encoder.layer.7.output.LayerNorm.weight",
156
+ "encoderLayer.7.ffnLayerNorm.bias": "encoder.layer.7.output.LayerNorm.bias",
157
+ "encoderLayer.8.multiHeadAttention.q.weight": "encoder.layer.8.attention.self.query.weight",
158
+ "encoderLayer.8.multiHeadAttention.q.bias": "encoder.layer.8.attention.self.query.bias",
159
+ "encoderLayer.8.multiHeadAttention.k.weight": "encoder.layer.8.attention.self.key.weight",
160
+ "encoderLayer.8.multiHeadAttention.k.bias": "encoder.layer.8.attention.self.key.bias",
161
+ "encoderLayer.8.multiHeadAttention.v.weight": "encoder.layer.8.attention.self.value.weight",
162
+ "encoderLayer.8.multiHeadAttention.v.bias": "encoder.layer.8.attention.self.value.bias",
163
+ "encoderLayer.8.multiHeadAttention.o.weight": "encoder.layer.8.attention.output.dense.weight",
164
+ "encoderLayer.8.multiHeadAttention.o.bias": "encoder.layer.8.attention.output.dense.bias",
165
+ "encoderLayer.8.attnLayerNorm.weight": "encoder.layer.8.attention.output.LayerNorm.weight",
166
+ "encoderLayer.8.attnLayerNorm.bias": "encoder.layer.8.attention.output.LayerNorm.bias",
167
+ "encoderLayer.8.feedForward.intermediateDense.weight": "encoder.layer.8.intermediate.dense.weight",
168
+ "encoderLayer.8.feedForward.intermediateDense.bias": "encoder.layer.8.intermediate.dense.bias",
169
+ "encoderLayer.8.feedForward.outputDense.weight": "encoder.layer.8.output.dense.weight",
170
+ "encoderLayer.8.feedForward.outputDense.bias": "encoder.layer.8.output.dense.bias",
171
+ "encoderLayer.8.ffnLayerNorm.weight": "encoder.layer.8.output.LayerNorm.weight",
172
+ "encoderLayer.8.ffnLayerNorm.bias": "encoder.layer.8.output.LayerNorm.bias",
173
+ "encoderLayer.9.multiHeadAttention.q.weight": "encoder.layer.9.attention.self.query.weight",
174
+ "encoderLayer.9.multiHeadAttention.q.bias": "encoder.layer.9.attention.self.query.bias",
175
+ "encoderLayer.9.multiHeadAttention.k.weight": "encoder.layer.9.attention.self.key.weight",
176
+ "encoderLayer.9.multiHeadAttention.k.bias": "encoder.layer.9.attention.self.key.bias",
177
+ "encoderLayer.9.multiHeadAttention.v.weight": "encoder.layer.9.attention.self.value.weight",
178
+ "encoderLayer.9.multiHeadAttention.v.bias": "encoder.layer.9.attention.self.value.bias",
179
+ "encoderLayer.9.multiHeadAttention.o.weight": "encoder.layer.9.attention.output.dense.weight",
180
+ "encoderLayer.9.multiHeadAttention.o.bias": "encoder.layer.9.attention.output.dense.bias",
181
+ "encoderLayer.9.attnLayerNorm.weight": "encoder.layer.9.attention.output.LayerNorm.weight",
182
+ "encoderLayer.9.attnLayerNorm.bias": "encoder.layer.9.attention.output.LayerNorm.bias",
183
+ "encoderLayer.9.feedForward.intermediateDense.weight": "encoder.layer.9.intermediate.dense.weight",
184
+ "encoderLayer.9.feedForward.intermediateDense.bias": "encoder.layer.9.intermediate.dense.bias",
185
+ "encoderLayer.9.feedForward.outputDense.weight": "encoder.layer.9.output.dense.weight",
186
+ "encoderLayer.9.feedForward.outputDense.bias": "encoder.layer.9.output.dense.bias",
187
+ "encoderLayer.9.ffnLayerNorm.weight": "encoder.layer.9.output.LayerNorm.weight",
188
+ "encoderLayer.9.ffnLayerNorm.bias": "encoder.layer.9.output.LayerNorm.bias",
189
+ "encoderLayer.10.multiHeadAttention.q.weight": "encoder.layer.10.attention.self.query.weight",
190
+ "encoderLayer.10.multiHeadAttention.q.bias": "encoder.layer.10.attention.self.query.bias",
191
+ "encoderLayer.10.multiHeadAttention.k.weight": "encoder.layer.10.attention.self.key.weight",
192
+ "encoderLayer.10.multiHeadAttention.k.bias": "encoder.layer.10.attention.self.key.bias",
193
+ "encoderLayer.10.multiHeadAttention.v.weight": "encoder.layer.10.attention.self.value.weight",
194
+ "encoderLayer.10.multiHeadAttention.v.bias": "encoder.layer.10.attention.self.value.bias",
195
+ "encoderLayer.10.multiHeadAttention.o.weight": "encoder.layer.10.attention.output.dense.weight",
196
+ "encoderLayer.10.multiHeadAttention.o.bias": "encoder.layer.10.attention.output.dense.bias",
197
+ "encoderLayer.10.attnLayerNorm.weight": "encoder.layer.10.attention.output.LayerNorm.weight",
198
+ "encoderLayer.10.attnLayerNorm.bias": "encoder.layer.10.attention.output.LayerNorm.bias",
199
+ "encoderLayer.10.feedForward.intermediateDense.weight": "encoder.layer.10.intermediate.dense.weight",
200
+ "encoderLayer.10.feedForward.intermediateDense.bias": "encoder.layer.10.intermediate.dense.bias",
201
+ "encoderLayer.10.feedForward.outputDense.weight": "encoder.layer.10.output.dense.weight",
202
+ "encoderLayer.10.feedForward.outputDense.bias": "encoder.layer.10.output.dense.bias",
203
+ "encoderLayer.10.ffnLayerNorm.weight": "encoder.layer.10.output.LayerNorm.weight",
204
+ "encoderLayer.10.ffnLayerNorm.bias": "encoder.layer.10.output.LayerNorm.bias",
205
+ "encoderLayer.11.multiHeadAttention.q.weight": "encoder.layer.11.attention.self.query.weight",
206
+ "encoderLayer.11.multiHeadAttention.q.bias": "encoder.layer.11.attention.self.query.bias",
207
+ "encoderLayer.11.multiHeadAttention.k.weight": "encoder.layer.11.attention.self.key.weight",
208
+ "encoderLayer.11.multiHeadAttention.k.bias": "encoder.layer.11.attention.self.key.bias",
209
+ "encoderLayer.11.multiHeadAttention.v.weight": "encoder.layer.11.attention.self.value.weight",
210
+ "encoderLayer.11.multiHeadAttention.v.bias": "encoder.layer.11.attention.self.value.bias",
211
+ "encoderLayer.11.multiHeadAttention.o.weight": "encoder.layer.11.attention.output.dense.weight",
212
+ "encoderLayer.11.multiHeadAttention.o.bias": "encoder.layer.11.attention.output.dense.bias",
213
+ "encoderLayer.11.attnLayerNorm.weight": "encoder.layer.11.attention.output.LayerNorm.weight",
214
+ "encoderLayer.11.attnLayerNorm.bias": "encoder.layer.11.attention.output.LayerNorm.bias",
215
+ "encoderLayer.11.feedForward.intermediateDense.weight": "encoder.layer.11.intermediate.dense.weight",
216
+ "encoderLayer.11.feedForward.intermediateDense.bias": "encoder.layer.11.intermediate.dense.bias",
217
+ "encoderLayer.11.feedForward.outputDense.weight": "encoder.layer.11.output.dense.weight",
218
+ "encoderLayer.11.feedForward.outputDense.bias": "encoder.layer.11.output.dense.bias",
219
+ "encoderLayer.11.ffnLayerNorm.weight": "encoder.layer.11.output.LayerNorm.weight",
220
+ "encoderLayer.11.ffnLayerNorm.bias": "encoder.layer.11.output.LayerNorm.bias",
221
+ "encoderLayer.12.multiHeadAttention.q.weight": "encoder.layer.12.attention.self.query.weight",
222
+ "encoderLayer.12.multiHeadAttention.q.bias": "encoder.layer.12.attention.self.query.bias",
223
+ "encoderLayer.12.multiHeadAttention.k.weight": "encoder.layer.12.attention.self.key.weight",
224
+ "encoderLayer.12.multiHeadAttention.k.bias": "encoder.layer.12.attention.self.key.bias",
225
+ "encoderLayer.12.multiHeadAttention.v.weight": "encoder.layer.12.attention.self.value.weight",
226
+ "encoderLayer.12.multiHeadAttention.v.bias": "encoder.layer.12.attention.self.value.bias",
227
+ "encoderLayer.12.multiHeadAttention.o.weight": "encoder.layer.12.attention.output.dense.weight",
228
+ "encoderLayer.12.multiHeadAttention.o.bias": "encoder.layer.12.attention.output.dense.bias",
229
+ "encoderLayer.12.attnLayerNorm.weight": "encoder.layer.12.attention.output.LayerNorm.weight",
230
+ "encoderLayer.12.attnLayerNorm.bias": "encoder.layer.12.attention.output.LayerNorm.bias",
231
+ "encoderLayer.12.feedForward.intermediateDense.weight": "encoder.layer.12.intermediate.dense.weight",
232
+ "encoderLayer.12.feedForward.intermediateDense.bias": "encoder.layer.12.intermediate.dense.bias",
233
+ "encoderLayer.12.feedForward.outputDense.weight": "encoder.layer.12.output.dense.weight",
234
+ "encoderLayer.12.feedForward.outputDense.bias": "encoder.layer.12.output.dense.bias",
235
+ "encoderLayer.12.ffnLayerNorm.weight": "encoder.layer.12.output.LayerNorm.weight",
236
+ "encoderLayer.12.ffnLayerNorm.bias": "encoder.layer.12.output.LayerNorm.bias",
237
+ "encoderLayer.13.multiHeadAttention.q.weight": "encoder.layer.13.attention.self.query.weight",
238
+ "encoderLayer.13.multiHeadAttention.q.bias": "encoder.layer.13.attention.self.query.bias",
239
+ "encoderLayer.13.multiHeadAttention.k.weight": "encoder.layer.13.attention.self.key.weight",
240
+ "encoderLayer.13.multiHeadAttention.k.bias": "encoder.layer.13.attention.self.key.bias",
241
+ "encoderLayer.13.multiHeadAttention.v.weight": "encoder.layer.13.attention.self.value.weight",
242
+ "encoderLayer.13.multiHeadAttention.v.bias": "encoder.layer.13.attention.self.value.bias",
243
+ "encoderLayer.13.multiHeadAttention.o.weight": "encoder.layer.13.attention.output.dense.weight",
244
+ "encoderLayer.13.multiHeadAttention.o.bias": "encoder.layer.13.attention.output.dense.bias",
245
+ "encoderLayer.13.attnLayerNorm.weight": "encoder.layer.13.attention.output.LayerNorm.weight",
246
+ "encoderLayer.13.attnLayerNorm.bias": "encoder.layer.13.attention.output.LayerNorm.bias",
247
+ "encoderLayer.13.feedForward.intermediateDense.weight": "encoder.layer.13.intermediate.dense.weight",
248
+ "encoderLayer.13.feedForward.intermediateDense.bias": "encoder.layer.13.intermediate.dense.bias",
249
+ "encoderLayer.13.feedForward.outputDense.weight": "encoder.layer.13.output.dense.weight",
250
+ "encoderLayer.13.feedForward.outputDense.bias": "encoder.layer.13.output.dense.bias",
251
+ "encoderLayer.13.ffnLayerNorm.weight": "encoder.layer.13.output.LayerNorm.weight",
252
+ "encoderLayer.13.ffnLayerNorm.bias": "encoder.layer.13.output.LayerNorm.bias",
253
+ "encoderLayer.14.multiHeadAttention.q.weight": "encoder.layer.14.attention.self.query.weight",
254
+ "encoderLayer.14.multiHeadAttention.q.bias": "encoder.layer.14.attention.self.query.bias",
255
+ "encoderLayer.14.multiHeadAttention.k.weight": "encoder.layer.14.attention.self.key.weight",
256
+ "encoderLayer.14.multiHeadAttention.k.bias": "encoder.layer.14.attention.self.key.bias",
257
+ "encoderLayer.14.multiHeadAttention.v.weight": "encoder.layer.14.attention.self.value.weight",
258
+ "encoderLayer.14.multiHeadAttention.v.bias": "encoder.layer.14.attention.self.value.bias",
259
+ "encoderLayer.14.multiHeadAttention.o.weight": "encoder.layer.14.attention.output.dense.weight",
260
+ "encoderLayer.14.multiHeadAttention.o.bias": "encoder.layer.14.attention.output.dense.bias",
261
+ "encoderLayer.14.attnLayerNorm.weight": "encoder.layer.14.attention.output.LayerNorm.weight",
262
+ "encoderLayer.14.attnLayerNorm.bias": "encoder.layer.14.attention.output.LayerNorm.bias",
263
+ "encoderLayer.14.feedForward.intermediateDense.weight": "encoder.layer.14.intermediate.dense.weight",
264
+ "encoderLayer.14.feedForward.intermediateDense.bias": "encoder.layer.14.intermediate.dense.bias",
265
+ "encoderLayer.14.feedForward.outputDense.weight": "encoder.layer.14.output.dense.weight",
266
+ "encoderLayer.14.feedForward.outputDense.bias": "encoder.layer.14.output.dense.bias",
267
+ "encoderLayer.14.ffnLayerNorm.weight": "encoder.layer.14.output.LayerNorm.weight",
268
+ "encoderLayer.14.ffnLayerNorm.bias": "encoder.layer.14.output.LayerNorm.bias",
269
+ "encoderLayer.15.multiHeadAttention.q.weight": "encoder.layer.15.attention.self.query.weight",
270
+ "encoderLayer.15.multiHeadAttention.q.bias": "encoder.layer.15.attention.self.query.bias",
271
+ "encoderLayer.15.multiHeadAttention.k.weight": "encoder.layer.15.attention.self.key.weight",
272
+ "encoderLayer.15.multiHeadAttention.k.bias": "encoder.layer.15.attention.self.key.bias",
273
+ "encoderLayer.15.multiHeadAttention.v.weight": "encoder.layer.15.attention.self.value.weight",
274
+ "encoderLayer.15.multiHeadAttention.v.bias": "encoder.layer.15.attention.self.value.bias",
275
+ "encoderLayer.15.multiHeadAttention.o.weight": "encoder.layer.15.attention.output.dense.weight",
276
+ "encoderLayer.15.multiHeadAttention.o.bias": "encoder.layer.15.attention.output.dense.bias",
277
+ "encoderLayer.15.attnLayerNorm.weight": "encoder.layer.15.attention.output.LayerNorm.weight",
278
+ "encoderLayer.15.attnLayerNorm.bias": "encoder.layer.15.attention.output.LayerNorm.bias",
279
+ "encoderLayer.15.feedForward.intermediateDense.weight": "encoder.layer.15.intermediate.dense.weight",
280
+ "encoderLayer.15.feedForward.intermediateDense.bias": "encoder.layer.15.intermediate.dense.bias",
281
+ "encoderLayer.15.feedForward.outputDense.weight": "encoder.layer.15.output.dense.weight",
282
+ "encoderLayer.15.feedForward.outputDense.bias": "encoder.layer.15.output.dense.bias",
283
+ "encoderLayer.15.ffnLayerNorm.weight": "encoder.layer.15.output.LayerNorm.weight",
284
+ "encoderLayer.15.ffnLayerNorm.bias": "encoder.layer.15.output.LayerNorm.bias",
285
+ "encoderLayer.16.multiHeadAttention.q.weight": "encoder.layer.16.attention.self.query.weight",
286
+ "encoderLayer.16.multiHeadAttention.q.bias": "encoder.layer.16.attention.self.query.bias",
287
+ "encoderLayer.16.multiHeadAttention.k.weight": "encoder.layer.16.attention.self.key.weight",
288
+ "encoderLayer.16.multiHeadAttention.k.bias": "encoder.layer.16.attention.self.key.bias",
289
+ "encoderLayer.16.multiHeadAttention.v.weight": "encoder.layer.16.attention.self.value.weight",
290
+ "encoderLayer.16.multiHeadAttention.v.bias": "encoder.layer.16.attention.self.value.bias",
291
+ "encoderLayer.16.multiHeadAttention.o.weight": "encoder.layer.16.attention.output.dense.weight",
292
+ "encoderLayer.16.multiHeadAttention.o.bias": "encoder.layer.16.attention.output.dense.bias",
293
+ "encoderLayer.16.attnLayerNorm.weight": "encoder.layer.16.attention.output.LayerNorm.weight",
294
+ "encoderLayer.16.attnLayerNorm.bias": "encoder.layer.16.attention.output.LayerNorm.bias",
295
+ "encoderLayer.16.feedForward.intermediateDense.weight": "encoder.layer.16.intermediate.dense.weight",
296
+ "encoderLayer.16.feedForward.intermediateDense.bias": "encoder.layer.16.intermediate.dense.bias",
297
+ "encoderLayer.16.feedForward.outputDense.weight": "encoder.layer.16.output.dense.weight",
298
+ "encoderLayer.16.feedForward.outputDense.bias": "encoder.layer.16.output.dense.bias",
299
+ "encoderLayer.16.ffnLayerNorm.weight": "encoder.layer.16.output.LayerNorm.weight",
300
+ "encoderLayer.16.ffnLayerNorm.bias": "encoder.layer.16.output.LayerNorm.bias",
301
+ "encoderLayer.17.multiHeadAttention.q.weight": "encoder.layer.17.attention.self.query.weight",
302
+ "encoderLayer.17.multiHeadAttention.q.bias": "encoder.layer.17.attention.self.query.bias",
303
+ "encoderLayer.17.multiHeadAttention.k.weight": "encoder.layer.17.attention.self.key.weight",
304
+ "encoderLayer.17.multiHeadAttention.k.bias": "encoder.layer.17.attention.self.key.bias",
305
+ "encoderLayer.17.multiHeadAttention.v.weight": "encoder.layer.17.attention.self.value.weight",
306
+ "encoderLayer.17.multiHeadAttention.v.bias": "encoder.layer.17.attention.self.value.bias",
307
+ "encoderLayer.17.multiHeadAttention.o.weight": "encoder.layer.17.attention.output.dense.weight",
308
+ "encoderLayer.17.multiHeadAttention.o.bias": "encoder.layer.17.attention.output.dense.bias",
309
+ "encoderLayer.17.attnLayerNorm.weight": "encoder.layer.17.attention.output.LayerNorm.weight",
310
+ "encoderLayer.17.attnLayerNorm.bias": "encoder.layer.17.attention.output.LayerNorm.bias",
311
+ "encoderLayer.17.feedForward.intermediateDense.weight": "encoder.layer.17.intermediate.dense.weight",
312
+ "encoderLayer.17.feedForward.intermediateDense.bias": "encoder.layer.17.intermediate.dense.bias",
313
+ "encoderLayer.17.feedForward.outputDense.weight": "encoder.layer.17.output.dense.weight",
314
+ "encoderLayer.17.feedForward.outputDense.bias": "encoder.layer.17.output.dense.bias",
315
+ "encoderLayer.17.ffnLayerNorm.weight": "encoder.layer.17.output.LayerNorm.weight",
316
+ "encoderLayer.17.ffnLayerNorm.bias": "encoder.layer.17.output.LayerNorm.bias",
317
+ "encoderLayer.18.multiHeadAttention.q.weight": "encoder.layer.18.attention.self.query.weight",
318
+ "encoderLayer.18.multiHeadAttention.q.bias": "encoder.layer.18.attention.self.query.bias",
319
+ "encoderLayer.18.multiHeadAttention.k.weight": "encoder.layer.18.attention.self.key.weight",
320
+ "encoderLayer.18.multiHeadAttention.k.bias": "encoder.layer.18.attention.self.key.bias",
321
+ "encoderLayer.18.multiHeadAttention.v.weight": "encoder.layer.18.attention.self.value.weight",
322
+ "encoderLayer.18.multiHeadAttention.v.bias": "encoder.layer.18.attention.self.value.bias",
323
+ "encoderLayer.18.multiHeadAttention.o.weight": "encoder.layer.18.attention.output.dense.weight",
324
+ "encoderLayer.18.multiHeadAttention.o.bias": "encoder.layer.18.attention.output.dense.bias",
325
+ "encoderLayer.18.attnLayerNorm.weight": "encoder.layer.18.attention.output.LayerNorm.weight",
326
+ "encoderLayer.18.attnLayerNorm.bias": "encoder.layer.18.attention.output.LayerNorm.bias",
327
+ "encoderLayer.18.feedForward.intermediateDense.weight": "encoder.layer.18.intermediate.dense.weight",
328
+ "encoderLayer.18.feedForward.intermediateDense.bias": "encoder.layer.18.intermediate.dense.bias",
329
+ "encoderLayer.18.feedForward.outputDense.weight": "encoder.layer.18.output.dense.weight",
330
+ "encoderLayer.18.feedForward.outputDense.bias": "encoder.layer.18.output.dense.bias",
331
+ "encoderLayer.18.ffnLayerNorm.weight": "encoder.layer.18.output.LayerNorm.weight",
332
+ "encoderLayer.18.ffnLayerNorm.bias": "encoder.layer.18.output.LayerNorm.bias",
333
+ "encoderLayer.19.multiHeadAttention.q.weight": "encoder.layer.19.attention.self.query.weight",
334
+ "encoderLayer.19.multiHeadAttention.q.bias": "encoder.layer.19.attention.self.query.bias",
335
+ "encoderLayer.19.multiHeadAttention.k.weight": "encoder.layer.19.attention.self.key.weight",
336
+ "encoderLayer.19.multiHeadAttention.k.bias": "encoder.layer.19.attention.self.key.bias",
337
+ "encoderLayer.19.multiHeadAttention.v.weight": "encoder.layer.19.attention.self.value.weight",
338
+ "encoderLayer.19.multiHeadAttention.v.bias": "encoder.layer.19.attention.self.value.bias",
339
+ "encoderLayer.19.multiHeadAttention.o.weight": "encoder.layer.19.attention.output.dense.weight",
340
+ "encoderLayer.19.multiHeadAttention.o.bias": "encoder.layer.19.attention.output.dense.bias",
341
+ "encoderLayer.19.attnLayerNorm.weight": "encoder.layer.19.attention.output.LayerNorm.weight",
342
+ "encoderLayer.19.attnLayerNorm.bias": "encoder.layer.19.attention.output.LayerNorm.bias",
343
+ "encoderLayer.19.feedForward.intermediateDense.weight": "encoder.layer.19.intermediate.dense.weight",
344
+ "encoderLayer.19.feedForward.intermediateDense.bias": "encoder.layer.19.intermediate.dense.bias",
345
+ "encoderLayer.19.feedForward.outputDense.weight": "encoder.layer.19.output.dense.weight",
346
+ "encoderLayer.19.feedForward.outputDense.bias": "encoder.layer.19.output.dense.bias",
347
+ "encoderLayer.19.ffnLayerNorm.weight": "encoder.layer.19.output.LayerNorm.weight",
348
+ "encoderLayer.19.ffnLayerNorm.bias": "encoder.layer.19.output.LayerNorm.bias",
349
+ "encoderLayer.20.multiHeadAttention.q.weight": "encoder.layer.20.attention.self.query.weight",
350
+ "encoderLayer.20.multiHeadAttention.q.bias": "encoder.layer.20.attention.self.query.bias",
351
+ "encoderLayer.20.multiHeadAttention.k.weight": "encoder.layer.20.attention.self.key.weight",
352
+ "encoderLayer.20.multiHeadAttention.k.bias": "encoder.layer.20.attention.self.key.bias",
353
+ "encoderLayer.20.multiHeadAttention.v.weight": "encoder.layer.20.attention.self.value.weight",
354
+ "encoderLayer.20.multiHeadAttention.v.bias": "encoder.layer.20.attention.self.value.bias",
355
+ "encoderLayer.20.multiHeadAttention.o.weight": "encoder.layer.20.attention.output.dense.weight",
356
+ "encoderLayer.20.multiHeadAttention.o.bias": "encoder.layer.20.attention.output.dense.bias",
357
+ "encoderLayer.20.attnLayerNorm.weight": "encoder.layer.20.attention.output.LayerNorm.weight",
358
+ "encoderLayer.20.attnLayerNorm.bias": "encoder.layer.20.attention.output.LayerNorm.bias",
359
+ "encoderLayer.20.feedForward.intermediateDense.weight": "encoder.layer.20.intermediate.dense.weight",
360
+ "encoderLayer.20.feedForward.intermediateDense.bias": "encoder.layer.20.intermediate.dense.bias",
361
+ "encoderLayer.20.feedForward.outputDense.weight": "encoder.layer.20.output.dense.weight",
362
+ "encoderLayer.20.feedForward.outputDense.bias": "encoder.layer.20.output.dense.bias",
363
+ "encoderLayer.20.ffnLayerNorm.weight": "encoder.layer.20.output.LayerNorm.weight",
364
+ "encoderLayer.20.ffnLayerNorm.bias": "encoder.layer.20.output.LayerNorm.bias",
365
+ "encoderLayer.21.multiHeadAttention.q.weight": "encoder.layer.21.attention.self.query.weight",
366
+ "encoderLayer.21.multiHeadAttention.q.bias": "encoder.layer.21.attention.self.query.bias",
367
+ "encoderLayer.21.multiHeadAttention.k.weight": "encoder.layer.21.attention.self.key.weight",
368
+ "encoderLayer.21.multiHeadAttention.k.bias": "encoder.layer.21.attention.self.key.bias",
369
+ "encoderLayer.21.multiHeadAttention.v.weight": "encoder.layer.21.attention.self.value.weight",
370
+ "encoderLayer.21.multiHeadAttention.v.bias": "encoder.layer.21.attention.self.value.bias",
371
+ "encoderLayer.21.multiHeadAttention.o.weight": "encoder.layer.21.attention.output.dense.weight",
372
+ "encoderLayer.21.multiHeadAttention.o.bias": "encoder.layer.21.attention.output.dense.bias",
373
+ "encoderLayer.21.attnLayerNorm.weight": "encoder.layer.21.attention.output.LayerNorm.weight",
374
+ "encoderLayer.21.attnLayerNorm.bias": "encoder.layer.21.attention.output.LayerNorm.bias",
375
+ "encoderLayer.21.feedForward.intermediateDense.weight": "encoder.layer.21.intermediate.dense.weight",
376
+ "encoderLayer.21.feedForward.intermediateDense.bias": "encoder.layer.21.intermediate.dense.bias",
377
+ "encoderLayer.21.feedForward.outputDense.weight": "encoder.layer.21.output.dense.weight",
378
+ "encoderLayer.21.feedForward.outputDense.bias": "encoder.layer.21.output.dense.bias",
379
+ "encoderLayer.21.ffnLayerNorm.weight": "encoder.layer.21.output.LayerNorm.weight",
380
+ "encoderLayer.21.ffnLayerNorm.bias": "encoder.layer.21.output.LayerNorm.bias",
381
+ "encoderLayer.22.multiHeadAttention.q.weight": "encoder.layer.22.attention.self.query.weight",
382
+ "encoderLayer.22.multiHeadAttention.q.bias": "encoder.layer.22.attention.self.query.bias",
383
+ "encoderLayer.22.multiHeadAttention.k.weight": "encoder.layer.22.attention.self.key.weight",
384
+ "encoderLayer.22.multiHeadAttention.k.bias": "encoder.layer.22.attention.self.key.bias",
385
+ "encoderLayer.22.multiHeadAttention.v.weight": "encoder.layer.22.attention.self.value.weight",
386
+ "encoderLayer.22.multiHeadAttention.v.bias": "encoder.layer.22.attention.self.value.bias",
387
+ "encoderLayer.22.multiHeadAttention.o.weight": "encoder.layer.22.attention.output.dense.weight",
388
+ "encoderLayer.22.multiHeadAttention.o.bias": "encoder.layer.22.attention.output.dense.bias",
389
+ "encoderLayer.22.attnLayerNorm.weight": "encoder.layer.22.attention.output.LayerNorm.weight",
390
+ "encoderLayer.22.attnLayerNorm.bias": "encoder.layer.22.attention.output.LayerNorm.bias",
391
+ "encoderLayer.22.feedForward.intermediateDense.weight": "encoder.layer.22.intermediate.dense.weight",
392
+ "encoderLayer.22.feedForward.intermediateDense.bias": "encoder.layer.22.intermediate.dense.bias",
393
+ "encoderLayer.22.feedForward.outputDense.weight": "encoder.layer.22.output.dense.weight",
394
+ "encoderLayer.22.feedForward.outputDense.bias": "encoder.layer.22.output.dense.bias",
395
+ "encoderLayer.22.ffnLayerNorm.weight": "encoder.layer.22.output.LayerNorm.weight",
396
+ "encoderLayer.22.ffnLayerNorm.bias": "encoder.layer.22.output.LayerNorm.bias",
397
+ "encoderLayer.23.multiHeadAttention.q.weight": "encoder.layer.23.attention.self.query.weight",
398
+ "encoderLayer.23.multiHeadAttention.q.bias": "encoder.layer.23.attention.self.query.bias",
399
+ "encoderLayer.23.multiHeadAttention.k.weight": "encoder.layer.23.attention.self.key.weight",
400
+ "encoderLayer.23.multiHeadAttention.k.bias": "encoder.layer.23.attention.self.key.bias",
401
+ "encoderLayer.23.multiHeadAttention.v.weight": "encoder.layer.23.attention.self.value.weight",
402
+ "encoderLayer.23.multiHeadAttention.v.bias": "encoder.layer.23.attention.self.value.bias",
403
+ "encoderLayer.23.multiHeadAttention.o.weight": "encoder.layer.23.attention.output.dense.weight",
404
+ "encoderLayer.23.multiHeadAttention.o.bias": "encoder.layer.23.attention.output.dense.bias",
405
+ "encoderLayer.23.attnLayerNorm.weight": "encoder.layer.23.attention.output.LayerNorm.weight",
406
+ "encoderLayer.23.attnLayerNorm.bias": "encoder.layer.23.attention.output.LayerNorm.bias",
407
+ "encoderLayer.23.feedForward.intermediateDense.weight": "encoder.layer.23.intermediate.dense.weight",
408
+ "encoderLayer.23.feedForward.intermediateDense.bias": "encoder.layer.23.intermediate.dense.bias",
409
+ "encoderLayer.23.feedForward.outputDense.weight": "encoder.layer.23.output.dense.weight",
410
+ "encoderLayer.23.feedForward.outputDense.bias": "encoder.layer.23.output.dense.bias",
411
+ "encoderLayer.23.ffnLayerNorm.weight": "encoder.layer.23.output.LayerNorm.weight",
412
+ "encoderLayer.23.ffnLayerNorm.bias": "encoder.layer.23.output.LayerNorm.bias"
413
+ }
414
+ }
BAAI/bge-large-zh-v1.5/bert4torch_config.json CHANGED
@@ -1,415 +1,417 @@
1
  {
2
- "attention_probs_dropout_prob": 0.1,
3
- "hidden_act": "gelu",
4
- "hidden_dropout_prob": 0.1,
5
- "hidden_size": 1024,
6
- "initializer_range": 0.02,
7
- "intermediate_size": 4096,
8
- "layer_norm_eps": 1e-12,
9
- "max_position_embeddings": 512,
10
- "model": "bert",
11
- "num_attention_heads": 16,
12
- "num_hidden_layers": 24,
13
- "bos_token_id": 0,
14
- "eos_token_id": 2,
15
- "pad_token_id": 0,
16
- "torch_dtype": "float32",
17
- "type_vocab_size": 2,
18
- "vocab_size": 21128,
19
- "with_pool": true,
20
- "pooling": {"pool_strategy": "cls"},
21
- "norm_mode": "torch_buildin",
22
- "mapping": {
23
- "embeddings.word_embeddings.weight": "embeddings.word_embeddings.weight",
24
- "embeddings.position_embeddings.weight": "embeddings.position_embeddings.weight",
25
- "embeddings.segment_embeddings.weight": "embeddings.token_type_embeddings.weight",
26
- "embeddings.layerNorm.weight": "embeddings.LayerNorm.weight",
27
- "embeddings.layerNorm.bias": "embeddings.LayerNorm.bias",
28
- "pooler.weight": "pooler.dense.weight",
29
- "pooler.bias": "pooler.dense.bias",
30
- "encoderLayer.0.multiHeadAttention.q.weight": "encoder.layer.0.attention.self.query.weight",
31
- "encoderLayer.0.multiHeadAttention.q.bias": "encoder.layer.0.attention.self.query.bias",
32
- "encoderLayer.0.multiHeadAttention.k.weight": "encoder.layer.0.attention.self.key.weight",
33
- "encoderLayer.0.multiHeadAttention.k.bias": "encoder.layer.0.attention.self.key.bias",
34
- "encoderLayer.0.multiHeadAttention.v.weight": "encoder.layer.0.attention.self.value.weight",
35
- "encoderLayer.0.multiHeadAttention.v.bias": "encoder.layer.0.attention.self.value.bias",
36
- "encoderLayer.0.multiHeadAttention.o.weight": "encoder.layer.0.attention.output.dense.weight",
37
- "encoderLayer.0.multiHeadAttention.o.bias": "encoder.layer.0.attention.output.dense.bias",
38
- "encoderLayer.0.attnLayerNorm.weight": "encoder.layer.0.attention.output.LayerNorm.weight",
39
- "encoderLayer.0.attnLayerNorm.bias": "encoder.layer.0.attention.output.LayerNorm.bias",
40
- "encoderLayer.0.feedForward.intermediateDense.weight": "encoder.layer.0.intermediate.dense.weight",
41
- "encoderLayer.0.feedForward.intermediateDense.bias": "encoder.layer.0.intermediate.dense.bias",
42
- "encoderLayer.0.feedForward.outputDense.weight": "encoder.layer.0.output.dense.weight",
43
- "encoderLayer.0.feedForward.outputDense.bias": "encoder.layer.0.output.dense.bias",
44
- "encoderLayer.0.ffnLayerNorm.weight": "encoder.layer.0.output.LayerNorm.weight",
45
- "encoderLayer.0.ffnLayerNorm.bias": "encoder.layer.0.output.LayerNorm.bias",
46
- "encoderLayer.1.multiHeadAttention.q.weight": "encoder.layer.1.attention.self.query.weight",
47
- "encoderLayer.1.multiHeadAttention.q.bias": "encoder.layer.1.attention.self.query.bias",
48
- "encoderLayer.1.multiHeadAttention.k.weight": "encoder.layer.1.attention.self.key.weight",
49
- "encoderLayer.1.multiHeadAttention.k.bias": "encoder.layer.1.attention.self.key.bias",
50
- "encoderLayer.1.multiHeadAttention.v.weight": "encoder.layer.1.attention.self.value.weight",
51
- "encoderLayer.1.multiHeadAttention.v.bias": "encoder.layer.1.attention.self.value.bias",
52
- "encoderLayer.1.multiHeadAttention.o.weight": "encoder.layer.1.attention.output.dense.weight",
53
- "encoderLayer.1.multiHeadAttention.o.bias": "encoder.layer.1.attention.output.dense.bias",
54
- "encoderLayer.1.attnLayerNorm.weight": "encoder.layer.1.attention.output.LayerNorm.weight",
55
- "encoderLayer.1.attnLayerNorm.bias": "encoder.layer.1.attention.output.LayerNorm.bias",
56
- "encoderLayer.1.feedForward.intermediateDense.weight": "encoder.layer.1.intermediate.dense.weight",
57
- "encoderLayer.1.feedForward.intermediateDense.bias": "encoder.layer.1.intermediate.dense.bias",
58
- "encoderLayer.1.feedForward.outputDense.weight": "encoder.layer.1.output.dense.weight",
59
- "encoderLayer.1.feedForward.outputDense.bias": "encoder.layer.1.output.dense.bias",
60
- "encoderLayer.1.ffnLayerNorm.weight": "encoder.layer.1.output.LayerNorm.weight",
61
- "encoderLayer.1.ffnLayerNorm.bias": "encoder.layer.1.output.LayerNorm.bias",
62
- "encoderLayer.2.multiHeadAttention.q.weight": "encoder.layer.2.attention.self.query.weight",
63
- "encoderLayer.2.multiHeadAttention.q.bias": "encoder.layer.2.attention.self.query.bias",
64
- "encoderLayer.2.multiHeadAttention.k.weight": "encoder.layer.2.attention.self.key.weight",
65
- "encoderLayer.2.multiHeadAttention.k.bias": "encoder.layer.2.attention.self.key.bias",
66
- "encoderLayer.2.multiHeadAttention.v.weight": "encoder.layer.2.attention.self.value.weight",
67
- "encoderLayer.2.multiHeadAttention.v.bias": "encoder.layer.2.attention.self.value.bias",
68
- "encoderLayer.2.multiHeadAttention.o.weight": "encoder.layer.2.attention.output.dense.weight",
69
- "encoderLayer.2.multiHeadAttention.o.bias": "encoder.layer.2.attention.output.dense.bias",
70
- "encoderLayer.2.attnLayerNorm.weight": "encoder.layer.2.attention.output.LayerNorm.weight",
71
- "encoderLayer.2.attnLayerNorm.bias": "encoder.layer.2.attention.output.LayerNorm.bias",
72
- "encoderLayer.2.feedForward.intermediateDense.weight": "encoder.layer.2.intermediate.dense.weight",
73
- "encoderLayer.2.feedForward.intermediateDense.bias": "encoder.layer.2.intermediate.dense.bias",
74
- "encoderLayer.2.feedForward.outputDense.weight": "encoder.layer.2.output.dense.weight",
75
- "encoderLayer.2.feedForward.outputDense.bias": "encoder.layer.2.output.dense.bias",
76
- "encoderLayer.2.ffnLayerNorm.weight": "encoder.layer.2.output.LayerNorm.weight",
77
- "encoderLayer.2.ffnLayerNorm.bias": "encoder.layer.2.output.LayerNorm.bias",
78
- "encoderLayer.3.multiHeadAttention.q.weight": "encoder.layer.3.attention.self.query.weight",
79
- "encoderLayer.3.multiHeadAttention.q.bias": "encoder.layer.3.attention.self.query.bias",
80
- "encoderLayer.3.multiHeadAttention.k.weight": "encoder.layer.3.attention.self.key.weight",
81
- "encoderLayer.3.multiHeadAttention.k.bias": "encoder.layer.3.attention.self.key.bias",
82
- "encoderLayer.3.multiHeadAttention.v.weight": "encoder.layer.3.attention.self.value.weight",
83
- "encoderLayer.3.multiHeadAttention.v.bias": "encoder.layer.3.attention.self.value.bias",
84
- "encoderLayer.3.multiHeadAttention.o.weight": "encoder.layer.3.attention.output.dense.weight",
85
- "encoderLayer.3.multiHeadAttention.o.bias": "encoder.layer.3.attention.output.dense.bias",
86
- "encoderLayer.3.attnLayerNorm.weight": "encoder.layer.3.attention.output.LayerNorm.weight",
87
- "encoderLayer.3.attnLayerNorm.bias": "encoder.layer.3.attention.output.LayerNorm.bias",
88
- "encoderLayer.3.feedForward.intermediateDense.weight": "encoder.layer.3.intermediate.dense.weight",
89
- "encoderLayer.3.feedForward.intermediateDense.bias": "encoder.layer.3.intermediate.dense.bias",
90
- "encoderLayer.3.feedForward.outputDense.weight": "encoder.layer.3.output.dense.weight",
91
- "encoderLayer.3.feedForward.outputDense.bias": "encoder.layer.3.output.dense.bias",
92
- "encoderLayer.3.ffnLayerNorm.weight": "encoder.layer.3.output.LayerNorm.weight",
93
- "encoderLayer.3.ffnLayerNorm.bias": "encoder.layer.3.output.LayerNorm.bias",
94
- "encoderLayer.4.multiHeadAttention.q.weight": "encoder.layer.4.attention.self.query.weight",
95
- "encoderLayer.4.multiHeadAttention.q.bias": "encoder.layer.4.attention.self.query.bias",
96
- "encoderLayer.4.multiHeadAttention.k.weight": "encoder.layer.4.attention.self.key.weight",
97
- "encoderLayer.4.multiHeadAttention.k.bias": "encoder.layer.4.attention.self.key.bias",
98
- "encoderLayer.4.multiHeadAttention.v.weight": "encoder.layer.4.attention.self.value.weight",
99
- "encoderLayer.4.multiHeadAttention.v.bias": "encoder.layer.4.attention.self.value.bias",
100
- "encoderLayer.4.multiHeadAttention.o.weight": "encoder.layer.4.attention.output.dense.weight",
101
- "encoderLayer.4.multiHeadAttention.o.bias": "encoder.layer.4.attention.output.dense.bias",
102
- "encoderLayer.4.attnLayerNorm.weight": "encoder.layer.4.attention.output.LayerNorm.weight",
103
- "encoderLayer.4.attnLayerNorm.bias": "encoder.layer.4.attention.output.LayerNorm.bias",
104
- "encoderLayer.4.feedForward.intermediateDense.weight": "encoder.layer.4.intermediate.dense.weight",
105
- "encoderLayer.4.feedForward.intermediateDense.bias": "encoder.layer.4.intermediate.dense.bias",
106
- "encoderLayer.4.feedForward.outputDense.weight": "encoder.layer.4.output.dense.weight",
107
- "encoderLayer.4.feedForward.outputDense.bias": "encoder.layer.4.output.dense.bias",
108
- "encoderLayer.4.ffnLayerNorm.weight": "encoder.layer.4.output.LayerNorm.weight",
109
- "encoderLayer.4.ffnLayerNorm.bias": "encoder.layer.4.output.LayerNorm.bias",
110
- "encoderLayer.5.multiHeadAttention.q.weight": "encoder.layer.5.attention.self.query.weight",
111
- "encoderLayer.5.multiHeadAttention.q.bias": "encoder.layer.5.attention.self.query.bias",
112
- "encoderLayer.5.multiHeadAttention.k.weight": "encoder.layer.5.attention.self.key.weight",
113
- "encoderLayer.5.multiHeadAttention.k.bias": "encoder.layer.5.attention.self.key.bias",
114
- "encoderLayer.5.multiHeadAttention.v.weight": "encoder.layer.5.attention.self.value.weight",
115
- "encoderLayer.5.multiHeadAttention.v.bias": "encoder.layer.5.attention.self.value.bias",
116
- "encoderLayer.5.multiHeadAttention.o.weight": "encoder.layer.5.attention.output.dense.weight",
117
- "encoderLayer.5.multiHeadAttention.o.bias": "encoder.layer.5.attention.output.dense.bias",
118
- "encoderLayer.5.attnLayerNorm.weight": "encoder.layer.5.attention.output.LayerNorm.weight",
119
- "encoderLayer.5.attnLayerNorm.bias": "encoder.layer.5.attention.output.LayerNorm.bias",
120
- "encoderLayer.5.feedForward.intermediateDense.weight": "encoder.layer.5.intermediate.dense.weight",
121
- "encoderLayer.5.feedForward.intermediateDense.bias": "encoder.layer.5.intermediate.dense.bias",
122
- "encoderLayer.5.feedForward.outputDense.weight": "encoder.layer.5.output.dense.weight",
123
- "encoderLayer.5.feedForward.outputDense.bias": "encoder.layer.5.output.dense.bias",
124
- "encoderLayer.5.ffnLayerNorm.weight": "encoder.layer.5.output.LayerNorm.weight",
125
- "encoderLayer.5.ffnLayerNorm.bias": "encoder.layer.5.output.LayerNorm.bias",
126
- "encoderLayer.6.multiHeadAttention.q.weight": "encoder.layer.6.attention.self.query.weight",
127
- "encoderLayer.6.multiHeadAttention.q.bias": "encoder.layer.6.attention.self.query.bias",
128
- "encoderLayer.6.multiHeadAttention.k.weight": "encoder.layer.6.attention.self.key.weight",
129
- "encoderLayer.6.multiHeadAttention.k.bias": "encoder.layer.6.attention.self.key.bias",
130
- "encoderLayer.6.multiHeadAttention.v.weight": "encoder.layer.6.attention.self.value.weight",
131
- "encoderLayer.6.multiHeadAttention.v.bias": "encoder.layer.6.attention.self.value.bias",
132
- "encoderLayer.6.multiHeadAttention.o.weight": "encoder.layer.6.attention.output.dense.weight",
133
- "encoderLayer.6.multiHeadAttention.o.bias": "encoder.layer.6.attention.output.dense.bias",
134
- "encoderLayer.6.attnLayerNorm.weight": "encoder.layer.6.attention.output.LayerNorm.weight",
135
- "encoderLayer.6.attnLayerNorm.bias": "encoder.layer.6.attention.output.LayerNorm.bias",
136
- "encoderLayer.6.feedForward.intermediateDense.weight": "encoder.layer.6.intermediate.dense.weight",
137
- "encoderLayer.6.feedForward.intermediateDense.bias": "encoder.layer.6.intermediate.dense.bias",
138
- "encoderLayer.6.feedForward.outputDense.weight": "encoder.layer.6.output.dense.weight",
139
- "encoderLayer.6.feedForward.outputDense.bias": "encoder.layer.6.output.dense.bias",
140
- "encoderLayer.6.ffnLayerNorm.weight": "encoder.layer.6.output.LayerNorm.weight",
141
- "encoderLayer.6.ffnLayerNorm.bias": "encoder.layer.6.output.LayerNorm.bias",
142
- "encoderLayer.7.multiHeadAttention.q.weight": "encoder.layer.7.attention.self.query.weight",
143
- "encoderLayer.7.multiHeadAttention.q.bias": "encoder.layer.7.attention.self.query.bias",
144
- "encoderLayer.7.multiHeadAttention.k.weight": "encoder.layer.7.attention.self.key.weight",
145
- "encoderLayer.7.multiHeadAttention.k.bias": "encoder.layer.7.attention.self.key.bias",
146
- "encoderLayer.7.multiHeadAttention.v.weight": "encoder.layer.7.attention.self.value.weight",
147
- "encoderLayer.7.multiHeadAttention.v.bias": "encoder.layer.7.attention.self.value.bias",
148
- "encoderLayer.7.multiHeadAttention.o.weight": "encoder.layer.7.attention.output.dense.weight",
149
- "encoderLayer.7.multiHeadAttention.o.bias": "encoder.layer.7.attention.output.dense.bias",
150
- "encoderLayer.7.attnLayerNorm.weight": "encoder.layer.7.attention.output.LayerNorm.weight",
151
- "encoderLayer.7.attnLayerNorm.bias": "encoder.layer.7.attention.output.LayerNorm.bias",
152
- "encoderLayer.7.feedForward.intermediateDense.weight": "encoder.layer.7.intermediate.dense.weight",
153
- "encoderLayer.7.feedForward.intermediateDense.bias": "encoder.layer.7.intermediate.dense.bias",
154
- "encoderLayer.7.feedForward.outputDense.weight": "encoder.layer.7.output.dense.weight",
155
- "encoderLayer.7.feedForward.outputDense.bias": "encoder.layer.7.output.dense.bias",
156
- "encoderLayer.7.ffnLayerNorm.weight": "encoder.layer.7.output.LayerNorm.weight",
157
- "encoderLayer.7.ffnLayerNorm.bias": "encoder.layer.7.output.LayerNorm.bias",
158
- "encoderLayer.8.multiHeadAttention.q.weight": "encoder.layer.8.attention.self.query.weight",
159
- "encoderLayer.8.multiHeadAttention.q.bias": "encoder.layer.8.attention.self.query.bias",
160
- "encoderLayer.8.multiHeadAttention.k.weight": "encoder.layer.8.attention.self.key.weight",
161
- "encoderLayer.8.multiHeadAttention.k.bias": "encoder.layer.8.attention.self.key.bias",
162
- "encoderLayer.8.multiHeadAttention.v.weight": "encoder.layer.8.attention.self.value.weight",
163
- "encoderLayer.8.multiHeadAttention.v.bias": "encoder.layer.8.attention.self.value.bias",
164
- "encoderLayer.8.multiHeadAttention.o.weight": "encoder.layer.8.attention.output.dense.weight",
165
- "encoderLayer.8.multiHeadAttention.o.bias": "encoder.layer.8.attention.output.dense.bias",
166
- "encoderLayer.8.attnLayerNorm.weight": "encoder.layer.8.attention.output.LayerNorm.weight",
167
- "encoderLayer.8.attnLayerNorm.bias": "encoder.layer.8.attention.output.LayerNorm.bias",
168
- "encoderLayer.8.feedForward.intermediateDense.weight": "encoder.layer.8.intermediate.dense.weight",
169
- "encoderLayer.8.feedForward.intermediateDense.bias": "encoder.layer.8.intermediate.dense.bias",
170
- "encoderLayer.8.feedForward.outputDense.weight": "encoder.layer.8.output.dense.weight",
171
- "encoderLayer.8.feedForward.outputDense.bias": "encoder.layer.8.output.dense.bias",
172
- "encoderLayer.8.ffnLayerNorm.weight": "encoder.layer.8.output.LayerNorm.weight",
173
- "encoderLayer.8.ffnLayerNorm.bias": "encoder.layer.8.output.LayerNorm.bias",
174
- "encoderLayer.9.multiHeadAttention.q.weight": "encoder.layer.9.attention.self.query.weight",
175
- "encoderLayer.9.multiHeadAttention.q.bias": "encoder.layer.9.attention.self.query.bias",
176
- "encoderLayer.9.multiHeadAttention.k.weight": "encoder.layer.9.attention.self.key.weight",
177
- "encoderLayer.9.multiHeadAttention.k.bias": "encoder.layer.9.attention.self.key.bias",
178
- "encoderLayer.9.multiHeadAttention.v.weight": "encoder.layer.9.attention.self.value.weight",
179
- "encoderLayer.9.multiHeadAttention.v.bias": "encoder.layer.9.attention.self.value.bias",
180
- "encoderLayer.9.multiHeadAttention.o.weight": "encoder.layer.9.attention.output.dense.weight",
181
- "encoderLayer.9.multiHeadAttention.o.bias": "encoder.layer.9.attention.output.dense.bias",
182
- "encoderLayer.9.attnLayerNorm.weight": "encoder.layer.9.attention.output.LayerNorm.weight",
183
- "encoderLayer.9.attnLayerNorm.bias": "encoder.layer.9.attention.output.LayerNorm.bias",
184
- "encoderLayer.9.feedForward.intermediateDense.weight": "encoder.layer.9.intermediate.dense.weight",
185
- "encoderLayer.9.feedForward.intermediateDense.bias": "encoder.layer.9.intermediate.dense.bias",
186
- "encoderLayer.9.feedForward.outputDense.weight": "encoder.layer.9.output.dense.weight",
187
- "encoderLayer.9.feedForward.outputDense.bias": "encoder.layer.9.output.dense.bias",
188
- "encoderLayer.9.ffnLayerNorm.weight": "encoder.layer.9.output.LayerNorm.weight",
189
- "encoderLayer.9.ffnLayerNorm.bias": "encoder.layer.9.output.LayerNorm.bias",
190
- "encoderLayer.10.multiHeadAttention.q.weight": "encoder.layer.10.attention.self.query.weight",
191
- "encoderLayer.10.multiHeadAttention.q.bias": "encoder.layer.10.attention.self.query.bias",
192
- "encoderLayer.10.multiHeadAttention.k.weight": "encoder.layer.10.attention.self.key.weight",
193
- "encoderLayer.10.multiHeadAttention.k.bias": "encoder.layer.10.attention.self.key.bias",
194
- "encoderLayer.10.multiHeadAttention.v.weight": "encoder.layer.10.attention.self.value.weight",
195
- "encoderLayer.10.multiHeadAttention.v.bias": "encoder.layer.10.attention.self.value.bias",
196
- "encoderLayer.10.multiHeadAttention.o.weight": "encoder.layer.10.attention.output.dense.weight",
197
- "encoderLayer.10.multiHeadAttention.o.bias": "encoder.layer.10.attention.output.dense.bias",
198
- "encoderLayer.10.attnLayerNorm.weight": "encoder.layer.10.attention.output.LayerNorm.weight",
199
- "encoderLayer.10.attnLayerNorm.bias": "encoder.layer.10.attention.output.LayerNorm.bias",
200
- "encoderLayer.10.feedForward.intermediateDense.weight": "encoder.layer.10.intermediate.dense.weight",
201
- "encoderLayer.10.feedForward.intermediateDense.bias": "encoder.layer.10.intermediate.dense.bias",
202
- "encoderLayer.10.feedForward.outputDense.weight": "encoder.layer.10.output.dense.weight",
203
- "encoderLayer.10.feedForward.outputDense.bias": "encoder.layer.10.output.dense.bias",
204
- "encoderLayer.10.ffnLayerNorm.weight": "encoder.layer.10.output.LayerNorm.weight",
205
- "encoderLayer.10.ffnLayerNorm.bias": "encoder.layer.10.output.LayerNorm.bias",
206
- "encoderLayer.11.multiHeadAttention.q.weight": "encoder.layer.11.attention.self.query.weight",
207
- "encoderLayer.11.multiHeadAttention.q.bias": "encoder.layer.11.attention.self.query.bias",
208
- "encoderLayer.11.multiHeadAttention.k.weight": "encoder.layer.11.attention.self.key.weight",
209
- "encoderLayer.11.multiHeadAttention.k.bias": "encoder.layer.11.attention.self.key.bias",
210
- "encoderLayer.11.multiHeadAttention.v.weight": "encoder.layer.11.attention.self.value.weight",
211
- "encoderLayer.11.multiHeadAttention.v.bias": "encoder.layer.11.attention.self.value.bias",
212
- "encoderLayer.11.multiHeadAttention.o.weight": "encoder.layer.11.attention.output.dense.weight",
213
- "encoderLayer.11.multiHeadAttention.o.bias": "encoder.layer.11.attention.output.dense.bias",
214
- "encoderLayer.11.attnLayerNorm.weight": "encoder.layer.11.attention.output.LayerNorm.weight",
215
- "encoderLayer.11.attnLayerNorm.bias": "encoder.layer.11.attention.output.LayerNorm.bias",
216
- "encoderLayer.11.feedForward.intermediateDense.weight": "encoder.layer.11.intermediate.dense.weight",
217
- "encoderLayer.11.feedForward.intermediateDense.bias": "encoder.layer.11.intermediate.dense.bias",
218
- "encoderLayer.11.feedForward.outputDense.weight": "encoder.layer.11.output.dense.weight",
219
- "encoderLayer.11.feedForward.outputDense.bias": "encoder.layer.11.output.dense.bias",
220
- "encoderLayer.11.ffnLayerNorm.weight": "encoder.layer.11.output.LayerNorm.weight",
221
- "encoderLayer.11.ffnLayerNorm.bias": "encoder.layer.11.output.LayerNorm.bias",
222
- "encoderLayer.12.multiHeadAttention.q.weight": "encoder.layer.12.attention.self.query.weight",
223
- "encoderLayer.12.multiHeadAttention.q.bias": "encoder.layer.12.attention.self.query.bias",
224
- "encoderLayer.12.multiHeadAttention.k.weight": "encoder.layer.12.attention.self.key.weight",
225
- "encoderLayer.12.multiHeadAttention.k.bias": "encoder.layer.12.attention.self.key.bias",
226
- "encoderLayer.12.multiHeadAttention.v.weight": "encoder.layer.12.attention.self.value.weight",
227
- "encoderLayer.12.multiHeadAttention.v.bias": "encoder.layer.12.attention.self.value.bias",
228
- "encoderLayer.12.multiHeadAttention.o.weight": "encoder.layer.12.attention.output.dense.weight",
229
- "encoderLayer.12.multiHeadAttention.o.bias": "encoder.layer.12.attention.output.dense.bias",
230
- "encoderLayer.12.attnLayerNorm.weight": "encoder.layer.12.attention.output.LayerNorm.weight",
231
- "encoderLayer.12.attnLayerNorm.bias": "encoder.layer.12.attention.output.LayerNorm.bias",
232
- "encoderLayer.12.feedForward.intermediateDense.weight": "encoder.layer.12.intermediate.dense.weight",
233
- "encoderLayer.12.feedForward.intermediateDense.bias": "encoder.layer.12.intermediate.dense.bias",
234
- "encoderLayer.12.feedForward.outputDense.weight": "encoder.layer.12.output.dense.weight",
235
- "encoderLayer.12.feedForward.outputDense.bias": "encoder.layer.12.output.dense.bias",
236
- "encoderLayer.12.ffnLayerNorm.weight": "encoder.layer.12.output.LayerNorm.weight",
237
- "encoderLayer.12.ffnLayerNorm.bias": "encoder.layer.12.output.LayerNorm.bias",
238
- "encoderLayer.13.multiHeadAttention.q.weight": "encoder.layer.13.attention.self.query.weight",
239
- "encoderLayer.13.multiHeadAttention.q.bias": "encoder.layer.13.attention.self.query.bias",
240
- "encoderLayer.13.multiHeadAttention.k.weight": "encoder.layer.13.attention.self.key.weight",
241
- "encoderLayer.13.multiHeadAttention.k.bias": "encoder.layer.13.attention.self.key.bias",
242
- "encoderLayer.13.multiHeadAttention.v.weight": "encoder.layer.13.attention.self.value.weight",
243
- "encoderLayer.13.multiHeadAttention.v.bias": "encoder.layer.13.attention.self.value.bias",
244
- "encoderLayer.13.multiHeadAttention.o.weight": "encoder.layer.13.attention.output.dense.weight",
245
- "encoderLayer.13.multiHeadAttention.o.bias": "encoder.layer.13.attention.output.dense.bias",
246
- "encoderLayer.13.attnLayerNorm.weight": "encoder.layer.13.attention.output.LayerNorm.weight",
247
- "encoderLayer.13.attnLayerNorm.bias": "encoder.layer.13.attention.output.LayerNorm.bias",
248
- "encoderLayer.13.feedForward.intermediateDense.weight": "encoder.layer.13.intermediate.dense.weight",
249
- "encoderLayer.13.feedForward.intermediateDense.bias": "encoder.layer.13.intermediate.dense.bias",
250
- "encoderLayer.13.feedForward.outputDense.weight": "encoder.layer.13.output.dense.weight",
251
- "encoderLayer.13.feedForward.outputDense.bias": "encoder.layer.13.output.dense.bias",
252
- "encoderLayer.13.ffnLayerNorm.weight": "encoder.layer.13.output.LayerNorm.weight",
253
- "encoderLayer.13.ffnLayerNorm.bias": "encoder.layer.13.output.LayerNorm.bias",
254
- "encoderLayer.14.multiHeadAttention.q.weight": "encoder.layer.14.attention.self.query.weight",
255
- "encoderLayer.14.multiHeadAttention.q.bias": "encoder.layer.14.attention.self.query.bias",
256
- "encoderLayer.14.multiHeadAttention.k.weight": "encoder.layer.14.attention.self.key.weight",
257
- "encoderLayer.14.multiHeadAttention.k.bias": "encoder.layer.14.attention.self.key.bias",
258
- "encoderLayer.14.multiHeadAttention.v.weight": "encoder.layer.14.attention.self.value.weight",
259
- "encoderLayer.14.multiHeadAttention.v.bias": "encoder.layer.14.attention.self.value.bias",
260
- "encoderLayer.14.multiHeadAttention.o.weight": "encoder.layer.14.attention.output.dense.weight",
261
- "encoderLayer.14.multiHeadAttention.o.bias": "encoder.layer.14.attention.output.dense.bias",
262
- "encoderLayer.14.attnLayerNorm.weight": "encoder.layer.14.attention.output.LayerNorm.weight",
263
- "encoderLayer.14.attnLayerNorm.bias": "encoder.layer.14.attention.output.LayerNorm.bias",
264
- "encoderLayer.14.feedForward.intermediateDense.weight": "encoder.layer.14.intermediate.dense.weight",
265
- "encoderLayer.14.feedForward.intermediateDense.bias": "encoder.layer.14.intermediate.dense.bias",
266
- "encoderLayer.14.feedForward.outputDense.weight": "encoder.layer.14.output.dense.weight",
267
- "encoderLayer.14.feedForward.outputDense.bias": "encoder.layer.14.output.dense.bias",
268
- "encoderLayer.14.ffnLayerNorm.weight": "encoder.layer.14.output.LayerNorm.weight",
269
- "encoderLayer.14.ffnLayerNorm.bias": "encoder.layer.14.output.LayerNorm.bias",
270
- "encoderLayer.15.multiHeadAttention.q.weight": "encoder.layer.15.attention.self.query.weight",
271
- "encoderLayer.15.multiHeadAttention.q.bias": "encoder.layer.15.attention.self.query.bias",
272
- "encoderLayer.15.multiHeadAttention.k.weight": "encoder.layer.15.attention.self.key.weight",
273
- "encoderLayer.15.multiHeadAttention.k.bias": "encoder.layer.15.attention.self.key.bias",
274
- "encoderLayer.15.multiHeadAttention.v.weight": "encoder.layer.15.attention.self.value.weight",
275
- "encoderLayer.15.multiHeadAttention.v.bias": "encoder.layer.15.attention.self.value.bias",
276
- "encoderLayer.15.multiHeadAttention.o.weight": "encoder.layer.15.attention.output.dense.weight",
277
- "encoderLayer.15.multiHeadAttention.o.bias": "encoder.layer.15.attention.output.dense.bias",
278
- "encoderLayer.15.attnLayerNorm.weight": "encoder.layer.15.attention.output.LayerNorm.weight",
279
- "encoderLayer.15.attnLayerNorm.bias": "encoder.layer.15.attention.output.LayerNorm.bias",
280
- "encoderLayer.15.feedForward.intermediateDense.weight": "encoder.layer.15.intermediate.dense.weight",
281
- "encoderLayer.15.feedForward.intermediateDense.bias": "encoder.layer.15.intermediate.dense.bias",
282
- "encoderLayer.15.feedForward.outputDense.weight": "encoder.layer.15.output.dense.weight",
283
- "encoderLayer.15.feedForward.outputDense.bias": "encoder.layer.15.output.dense.bias",
284
- "encoderLayer.15.ffnLayerNorm.weight": "encoder.layer.15.output.LayerNorm.weight",
285
- "encoderLayer.15.ffnLayerNorm.bias": "encoder.layer.15.output.LayerNorm.bias",
286
- "encoderLayer.16.multiHeadAttention.q.weight": "encoder.layer.16.attention.self.query.weight",
287
- "encoderLayer.16.multiHeadAttention.q.bias": "encoder.layer.16.attention.self.query.bias",
288
- "encoderLayer.16.multiHeadAttention.k.weight": "encoder.layer.16.attention.self.key.weight",
289
- "encoderLayer.16.multiHeadAttention.k.bias": "encoder.layer.16.attention.self.key.bias",
290
- "encoderLayer.16.multiHeadAttention.v.weight": "encoder.layer.16.attention.self.value.weight",
291
- "encoderLayer.16.multiHeadAttention.v.bias": "encoder.layer.16.attention.self.value.bias",
292
- "encoderLayer.16.multiHeadAttention.o.weight": "encoder.layer.16.attention.output.dense.weight",
293
- "encoderLayer.16.multiHeadAttention.o.bias": "encoder.layer.16.attention.output.dense.bias",
294
- "encoderLayer.16.attnLayerNorm.weight": "encoder.layer.16.attention.output.LayerNorm.weight",
295
- "encoderLayer.16.attnLayerNorm.bias": "encoder.layer.16.attention.output.LayerNorm.bias",
296
- "encoderLayer.16.feedForward.intermediateDense.weight": "encoder.layer.16.intermediate.dense.weight",
297
- "encoderLayer.16.feedForward.intermediateDense.bias": "encoder.layer.16.intermediate.dense.bias",
298
- "encoderLayer.16.feedForward.outputDense.weight": "encoder.layer.16.output.dense.weight",
299
- "encoderLayer.16.feedForward.outputDense.bias": "encoder.layer.16.output.dense.bias",
300
- "encoderLayer.16.ffnLayerNorm.weight": "encoder.layer.16.output.LayerNorm.weight",
301
- "encoderLayer.16.ffnLayerNorm.bias": "encoder.layer.16.output.LayerNorm.bias",
302
- "encoderLayer.17.multiHeadAttention.q.weight": "encoder.layer.17.attention.self.query.weight",
303
- "encoderLayer.17.multiHeadAttention.q.bias": "encoder.layer.17.attention.self.query.bias",
304
- "encoderLayer.17.multiHeadAttention.k.weight": "encoder.layer.17.attention.self.key.weight",
305
- "encoderLayer.17.multiHeadAttention.k.bias": "encoder.layer.17.attention.self.key.bias",
306
- "encoderLayer.17.multiHeadAttention.v.weight": "encoder.layer.17.attention.self.value.weight",
307
- "encoderLayer.17.multiHeadAttention.v.bias": "encoder.layer.17.attention.self.value.bias",
308
- "encoderLayer.17.multiHeadAttention.o.weight": "encoder.layer.17.attention.output.dense.weight",
309
- "encoderLayer.17.multiHeadAttention.o.bias": "encoder.layer.17.attention.output.dense.bias",
310
- "encoderLayer.17.attnLayerNorm.weight": "encoder.layer.17.attention.output.LayerNorm.weight",
311
- "encoderLayer.17.attnLayerNorm.bias": "encoder.layer.17.attention.output.LayerNorm.bias",
312
- "encoderLayer.17.feedForward.intermediateDense.weight": "encoder.layer.17.intermediate.dense.weight",
313
- "encoderLayer.17.feedForward.intermediateDense.bias": "encoder.layer.17.intermediate.dense.bias",
314
- "encoderLayer.17.feedForward.outputDense.weight": "encoder.layer.17.output.dense.weight",
315
- "encoderLayer.17.feedForward.outputDense.bias": "encoder.layer.17.output.dense.bias",
316
- "encoderLayer.17.ffnLayerNorm.weight": "encoder.layer.17.output.LayerNorm.weight",
317
- "encoderLayer.17.ffnLayerNorm.bias": "encoder.layer.17.output.LayerNorm.bias",
318
- "encoderLayer.18.multiHeadAttention.q.weight": "encoder.layer.18.attention.self.query.weight",
319
- "encoderLayer.18.multiHeadAttention.q.bias": "encoder.layer.18.attention.self.query.bias",
320
- "encoderLayer.18.multiHeadAttention.k.weight": "encoder.layer.18.attention.self.key.weight",
321
- "encoderLayer.18.multiHeadAttention.k.bias": "encoder.layer.18.attention.self.key.bias",
322
- "encoderLayer.18.multiHeadAttention.v.weight": "encoder.layer.18.attention.self.value.weight",
323
- "encoderLayer.18.multiHeadAttention.v.bias": "encoder.layer.18.attention.self.value.bias",
324
- "encoderLayer.18.multiHeadAttention.o.weight": "encoder.layer.18.attention.output.dense.weight",
325
- "encoderLayer.18.multiHeadAttention.o.bias": "encoder.layer.18.attention.output.dense.bias",
326
- "encoderLayer.18.attnLayerNorm.weight": "encoder.layer.18.attention.output.LayerNorm.weight",
327
- "encoderLayer.18.attnLayerNorm.bias": "encoder.layer.18.attention.output.LayerNorm.bias",
328
- "encoderLayer.18.feedForward.intermediateDense.weight": "encoder.layer.18.intermediate.dense.weight",
329
- "encoderLayer.18.feedForward.intermediateDense.bias": "encoder.layer.18.intermediate.dense.bias",
330
- "encoderLayer.18.feedForward.outputDense.weight": "encoder.layer.18.output.dense.weight",
331
- "encoderLayer.18.feedForward.outputDense.bias": "encoder.layer.18.output.dense.bias",
332
- "encoderLayer.18.ffnLayerNorm.weight": "encoder.layer.18.output.LayerNorm.weight",
333
- "encoderLayer.18.ffnLayerNorm.bias": "encoder.layer.18.output.LayerNorm.bias",
334
- "encoderLayer.19.multiHeadAttention.q.weight": "encoder.layer.19.attention.self.query.weight",
335
- "encoderLayer.19.multiHeadAttention.q.bias": "encoder.layer.19.attention.self.query.bias",
336
- "encoderLayer.19.multiHeadAttention.k.weight": "encoder.layer.19.attention.self.key.weight",
337
- "encoderLayer.19.multiHeadAttention.k.bias": "encoder.layer.19.attention.self.key.bias",
338
- "encoderLayer.19.multiHeadAttention.v.weight": "encoder.layer.19.attention.self.value.weight",
339
- "encoderLayer.19.multiHeadAttention.v.bias": "encoder.layer.19.attention.self.value.bias",
340
- "encoderLayer.19.multiHeadAttention.o.weight": "encoder.layer.19.attention.output.dense.weight",
341
- "encoderLayer.19.multiHeadAttention.o.bias": "encoder.layer.19.attention.output.dense.bias",
342
- "encoderLayer.19.attnLayerNorm.weight": "encoder.layer.19.attention.output.LayerNorm.weight",
343
- "encoderLayer.19.attnLayerNorm.bias": "encoder.layer.19.attention.output.LayerNorm.bias",
344
- "encoderLayer.19.feedForward.intermediateDense.weight": "encoder.layer.19.intermediate.dense.weight",
345
- "encoderLayer.19.feedForward.intermediateDense.bias": "encoder.layer.19.intermediate.dense.bias",
346
- "encoderLayer.19.feedForward.outputDense.weight": "encoder.layer.19.output.dense.weight",
347
- "encoderLayer.19.feedForward.outputDense.bias": "encoder.layer.19.output.dense.bias",
348
- "encoderLayer.19.ffnLayerNorm.weight": "encoder.layer.19.output.LayerNorm.weight",
349
- "encoderLayer.19.ffnLayerNorm.bias": "encoder.layer.19.output.LayerNorm.bias",
350
- "encoderLayer.20.multiHeadAttention.q.weight": "encoder.layer.20.attention.self.query.weight",
351
- "encoderLayer.20.multiHeadAttention.q.bias": "encoder.layer.20.attention.self.query.bias",
352
- "encoderLayer.20.multiHeadAttention.k.weight": "encoder.layer.20.attention.self.key.weight",
353
- "encoderLayer.20.multiHeadAttention.k.bias": "encoder.layer.20.attention.self.key.bias",
354
- "encoderLayer.20.multiHeadAttention.v.weight": "encoder.layer.20.attention.self.value.weight",
355
- "encoderLayer.20.multiHeadAttention.v.bias": "encoder.layer.20.attention.self.value.bias",
356
- "encoderLayer.20.multiHeadAttention.o.weight": "encoder.layer.20.attention.output.dense.weight",
357
- "encoderLayer.20.multiHeadAttention.o.bias": "encoder.layer.20.attention.output.dense.bias",
358
- "encoderLayer.20.attnLayerNorm.weight": "encoder.layer.20.attention.output.LayerNorm.weight",
359
- "encoderLayer.20.attnLayerNorm.bias": "encoder.layer.20.attention.output.LayerNorm.bias",
360
- "encoderLayer.20.feedForward.intermediateDense.weight": "encoder.layer.20.intermediate.dense.weight",
361
- "encoderLayer.20.feedForward.intermediateDense.bias": "encoder.layer.20.intermediate.dense.bias",
362
- "encoderLayer.20.feedForward.outputDense.weight": "encoder.layer.20.output.dense.weight",
363
- "encoderLayer.20.feedForward.outputDense.bias": "encoder.layer.20.output.dense.bias",
364
- "encoderLayer.20.ffnLayerNorm.weight": "encoder.layer.20.output.LayerNorm.weight",
365
- "encoderLayer.20.ffnLayerNorm.bias": "encoder.layer.20.output.LayerNorm.bias",
366
- "encoderLayer.21.multiHeadAttention.q.weight": "encoder.layer.21.attention.self.query.weight",
367
- "encoderLayer.21.multiHeadAttention.q.bias": "encoder.layer.21.attention.self.query.bias",
368
- "encoderLayer.21.multiHeadAttention.k.weight": "encoder.layer.21.attention.self.key.weight",
369
- "encoderLayer.21.multiHeadAttention.k.bias": "encoder.layer.21.attention.self.key.bias",
370
- "encoderLayer.21.multiHeadAttention.v.weight": "encoder.layer.21.attention.self.value.weight",
371
- "encoderLayer.21.multiHeadAttention.v.bias": "encoder.layer.21.attention.self.value.bias",
372
- "encoderLayer.21.multiHeadAttention.o.weight": "encoder.layer.21.attention.output.dense.weight",
373
- "encoderLayer.21.multiHeadAttention.o.bias": "encoder.layer.21.attention.output.dense.bias",
374
- "encoderLayer.21.attnLayerNorm.weight": "encoder.layer.21.attention.output.LayerNorm.weight",
375
- "encoderLayer.21.attnLayerNorm.bias": "encoder.layer.21.attention.output.LayerNorm.bias",
376
- "encoderLayer.21.feedForward.intermediateDense.weight": "encoder.layer.21.intermediate.dense.weight",
377
- "encoderLayer.21.feedForward.intermediateDense.bias": "encoder.layer.21.intermediate.dense.bias",
378
- "encoderLayer.21.feedForward.outputDense.weight": "encoder.layer.21.output.dense.weight",
379
- "encoderLayer.21.feedForward.outputDense.bias": "encoder.layer.21.output.dense.bias",
380
- "encoderLayer.21.ffnLayerNorm.weight": "encoder.layer.21.output.LayerNorm.weight",
381
- "encoderLayer.21.ffnLayerNorm.bias": "encoder.layer.21.output.LayerNorm.bias",
382
- "encoderLayer.22.multiHeadAttention.q.weight": "encoder.layer.22.attention.self.query.weight",
383
- "encoderLayer.22.multiHeadAttention.q.bias": "encoder.layer.22.attention.self.query.bias",
384
- "encoderLayer.22.multiHeadAttention.k.weight": "encoder.layer.22.attention.self.key.weight",
385
- "encoderLayer.22.multiHeadAttention.k.bias": "encoder.layer.22.attention.self.key.bias",
386
- "encoderLayer.22.multiHeadAttention.v.weight": "encoder.layer.22.attention.self.value.weight",
387
- "encoderLayer.22.multiHeadAttention.v.bias": "encoder.layer.22.attention.self.value.bias",
388
- "encoderLayer.22.multiHeadAttention.o.weight": "encoder.layer.22.attention.output.dense.weight",
389
- "encoderLayer.22.multiHeadAttention.o.bias": "encoder.layer.22.attention.output.dense.bias",
390
- "encoderLayer.22.attnLayerNorm.weight": "encoder.layer.22.attention.output.LayerNorm.weight",
391
- "encoderLayer.22.attnLayerNorm.bias": "encoder.layer.22.attention.output.LayerNorm.bias",
392
- "encoderLayer.22.feedForward.intermediateDense.weight": "encoder.layer.22.intermediate.dense.weight",
393
- "encoderLayer.22.feedForward.intermediateDense.bias": "encoder.layer.22.intermediate.dense.bias",
394
- "encoderLayer.22.feedForward.outputDense.weight": "encoder.layer.22.output.dense.weight",
395
- "encoderLayer.22.feedForward.outputDense.bias": "encoder.layer.22.output.dense.bias",
396
- "encoderLayer.22.ffnLayerNorm.weight": "encoder.layer.22.output.LayerNorm.weight",
397
- "encoderLayer.22.ffnLayerNorm.bias": "encoder.layer.22.output.LayerNorm.bias",
398
- "encoderLayer.23.multiHeadAttention.q.weight": "encoder.layer.23.attention.self.query.weight",
399
- "encoderLayer.23.multiHeadAttention.q.bias": "encoder.layer.23.attention.self.query.bias",
400
- "encoderLayer.23.multiHeadAttention.k.weight": "encoder.layer.23.attention.self.key.weight",
401
- "encoderLayer.23.multiHeadAttention.k.bias": "encoder.layer.23.attention.self.key.bias",
402
- "encoderLayer.23.multiHeadAttention.v.weight": "encoder.layer.23.attention.self.value.weight",
403
- "encoderLayer.23.multiHeadAttention.v.bias": "encoder.layer.23.attention.self.value.bias",
404
- "encoderLayer.23.multiHeadAttention.o.weight": "encoder.layer.23.attention.output.dense.weight",
405
- "encoderLayer.23.multiHeadAttention.o.bias": "encoder.layer.23.attention.output.dense.bias",
406
- "encoderLayer.23.attnLayerNorm.weight": "encoder.layer.23.attention.output.LayerNorm.weight",
407
- "encoderLayer.23.attnLayerNorm.bias": "encoder.layer.23.attention.output.LayerNorm.bias",
408
- "encoderLayer.23.feedForward.intermediateDense.weight": "encoder.layer.23.intermediate.dense.weight",
409
- "encoderLayer.23.feedForward.intermediateDense.bias": "encoder.layer.23.intermediate.dense.bias",
410
- "encoderLayer.23.feedForward.outputDense.weight": "encoder.layer.23.output.dense.weight",
411
- "encoderLayer.23.feedForward.outputDense.bias": "encoder.layer.23.output.dense.bias",
412
- "encoderLayer.23.ffnLayerNorm.weight": "encoder.layer.23.output.LayerNorm.weight",
413
- "encoderLayer.23.ffnLayerNorm.bias": "encoder.layer.23.output.LayerNorm.bias"
414
- }
415
- }
 
 
 
1
  {
2
+ "attention_probs_dropout_prob": 0.1,
3
+ "hidden_act": "gelu",
4
+ "hidden_dropout_prob": 0.1,
5
+ "hidden_size": 1024,
6
+ "initializer_range": 0.02,
7
+ "intermediate_size": 4096,
8
+ "layer_norm_eps": 1e-12,
9
+ "max_position_embeddings": 512,
10
+ "model": "bert",
11
+ "num_attention_heads": 16,
12
+ "num_hidden_layers": 24,
13
+ "bos_token_id": 0,
14
+ "eos_token_id": 2,
15
+ "pad_token_id": 0,
16
+ "torch_dtype": "float32",
17
+ "type_vocab_size": 2,
18
+ "vocab_size": 21128,
19
+ "with_pool": true,
20
+ "pooling": {
21
+ "pool_strategy": "cls"
22
+ },
23
+ "norm_mode": "torch_buildin",
24
+ "mapping": {
25
+ "embeddings.word_embeddings.weight": "embeddings.word_embeddings.weight",
26
+ "embeddings.position_embeddings.weight": "embeddings.position_embeddings.weight",
27
+ "embeddings.segment_embeddings.weight": "embeddings.token_type_embeddings.weight",
28
+ "embeddings.layerNorm.weight": "embeddings.LayerNorm.weight",
29
+ "embeddings.layerNorm.bias": "embeddings.LayerNorm.bias",
30
+ "pooler.weight": "pooler.dense.weight",
31
+ "pooler.bias": "pooler.dense.bias",
32
+ "encoderLayer.0.multiHeadAttention.q.weight": "encoder.layer.0.attention.self.query.weight",
33
+ "encoderLayer.0.multiHeadAttention.q.bias": "encoder.layer.0.attention.self.query.bias",
34
+ "encoderLayer.0.multiHeadAttention.k.weight": "encoder.layer.0.attention.self.key.weight",
35
+ "encoderLayer.0.multiHeadAttention.k.bias": "encoder.layer.0.attention.self.key.bias",
36
+ "encoderLayer.0.multiHeadAttention.v.weight": "encoder.layer.0.attention.self.value.weight",
37
+ "encoderLayer.0.multiHeadAttention.v.bias": "encoder.layer.0.attention.self.value.bias",
38
+ "encoderLayer.0.multiHeadAttention.o.weight": "encoder.layer.0.attention.output.dense.weight",
39
+ "encoderLayer.0.multiHeadAttention.o.bias": "encoder.layer.0.attention.output.dense.bias",
40
+ "encoderLayer.0.attnLayerNorm.weight": "encoder.layer.0.attention.output.LayerNorm.weight",
41
+ "encoderLayer.0.attnLayerNorm.bias": "encoder.layer.0.attention.output.LayerNorm.bias",
42
+ "encoderLayer.0.feedForward.intermediateDense.weight": "encoder.layer.0.intermediate.dense.weight",
43
+ "encoderLayer.0.feedForward.intermediateDense.bias": "encoder.layer.0.intermediate.dense.bias",
44
+ "encoderLayer.0.feedForward.outputDense.weight": "encoder.layer.0.output.dense.weight",
45
+ "encoderLayer.0.feedForward.outputDense.bias": "encoder.layer.0.output.dense.bias",
46
+ "encoderLayer.0.ffnLayerNorm.weight": "encoder.layer.0.output.LayerNorm.weight",
47
+ "encoderLayer.0.ffnLayerNorm.bias": "encoder.layer.0.output.LayerNorm.bias",
48
+ "encoderLayer.1.multiHeadAttention.q.weight": "encoder.layer.1.attention.self.query.weight",
49
+ "encoderLayer.1.multiHeadAttention.q.bias": "encoder.layer.1.attention.self.query.bias",
50
+ "encoderLayer.1.multiHeadAttention.k.weight": "encoder.layer.1.attention.self.key.weight",
51
+ "encoderLayer.1.multiHeadAttention.k.bias": "encoder.layer.1.attention.self.key.bias",
52
+ "encoderLayer.1.multiHeadAttention.v.weight": "encoder.layer.1.attention.self.value.weight",
53
+ "encoderLayer.1.multiHeadAttention.v.bias": "encoder.layer.1.attention.self.value.bias",
54
+ "encoderLayer.1.multiHeadAttention.o.weight": "encoder.layer.1.attention.output.dense.weight",
55
+ "encoderLayer.1.multiHeadAttention.o.bias": "encoder.layer.1.attention.output.dense.bias",
56
+ "encoderLayer.1.attnLayerNorm.weight": "encoder.layer.1.attention.output.LayerNorm.weight",
57
+ "encoderLayer.1.attnLayerNorm.bias": "encoder.layer.1.attention.output.LayerNorm.bias",
58
+ "encoderLayer.1.feedForward.intermediateDense.weight": "encoder.layer.1.intermediate.dense.weight",
59
+ "encoderLayer.1.feedForward.intermediateDense.bias": "encoder.layer.1.intermediate.dense.bias",
60
+ "encoderLayer.1.feedForward.outputDense.weight": "encoder.layer.1.output.dense.weight",
61
+ "encoderLayer.1.feedForward.outputDense.bias": "encoder.layer.1.output.dense.bias",
62
+ "encoderLayer.1.ffnLayerNorm.weight": "encoder.layer.1.output.LayerNorm.weight",
63
+ "encoderLayer.1.ffnLayerNorm.bias": "encoder.layer.1.output.LayerNorm.bias",
64
+ "encoderLayer.2.multiHeadAttention.q.weight": "encoder.layer.2.attention.self.query.weight",
65
+ "encoderLayer.2.multiHeadAttention.q.bias": "encoder.layer.2.attention.self.query.bias",
66
+ "encoderLayer.2.multiHeadAttention.k.weight": "encoder.layer.2.attention.self.key.weight",
67
+ "encoderLayer.2.multiHeadAttention.k.bias": "encoder.layer.2.attention.self.key.bias",
68
+ "encoderLayer.2.multiHeadAttention.v.weight": "encoder.layer.2.attention.self.value.weight",
69
+ "encoderLayer.2.multiHeadAttention.v.bias": "encoder.layer.2.attention.self.value.bias",
70
+ "encoderLayer.2.multiHeadAttention.o.weight": "encoder.layer.2.attention.output.dense.weight",
71
+ "encoderLayer.2.multiHeadAttention.o.bias": "encoder.layer.2.attention.output.dense.bias",
72
+ "encoderLayer.2.attnLayerNorm.weight": "encoder.layer.2.attention.output.LayerNorm.weight",
73
+ "encoderLayer.2.attnLayerNorm.bias": "encoder.layer.2.attention.output.LayerNorm.bias",
74
+ "encoderLayer.2.feedForward.intermediateDense.weight": "encoder.layer.2.intermediate.dense.weight",
75
+ "encoderLayer.2.feedForward.intermediateDense.bias": "encoder.layer.2.intermediate.dense.bias",
76
+ "encoderLayer.2.feedForward.outputDense.weight": "encoder.layer.2.output.dense.weight",
77
+ "encoderLayer.2.feedForward.outputDense.bias": "encoder.layer.2.output.dense.bias",
78
+ "encoderLayer.2.ffnLayerNorm.weight": "encoder.layer.2.output.LayerNorm.weight",
79
+ "encoderLayer.2.ffnLayerNorm.bias": "encoder.layer.2.output.LayerNorm.bias",
80
+ "encoderLayer.3.multiHeadAttention.q.weight": "encoder.layer.3.attention.self.query.weight",
81
+ "encoderLayer.3.multiHeadAttention.q.bias": "encoder.layer.3.attention.self.query.bias",
82
+ "encoderLayer.3.multiHeadAttention.k.weight": "encoder.layer.3.attention.self.key.weight",
83
+ "encoderLayer.3.multiHeadAttention.k.bias": "encoder.layer.3.attention.self.key.bias",
84
+ "encoderLayer.3.multiHeadAttention.v.weight": "encoder.layer.3.attention.self.value.weight",
85
+ "encoderLayer.3.multiHeadAttention.v.bias": "encoder.layer.3.attention.self.value.bias",
86
+ "encoderLayer.3.multiHeadAttention.o.weight": "encoder.layer.3.attention.output.dense.weight",
87
+ "encoderLayer.3.multiHeadAttention.o.bias": "encoder.layer.3.attention.output.dense.bias",
88
+ "encoderLayer.3.attnLayerNorm.weight": "encoder.layer.3.attention.output.LayerNorm.weight",
89
+ "encoderLayer.3.attnLayerNorm.bias": "encoder.layer.3.attention.output.LayerNorm.bias",
90
+ "encoderLayer.3.feedForward.intermediateDense.weight": "encoder.layer.3.intermediate.dense.weight",
91
+ "encoderLayer.3.feedForward.intermediateDense.bias": "encoder.layer.3.intermediate.dense.bias",
92
+ "encoderLayer.3.feedForward.outputDense.weight": "encoder.layer.3.output.dense.weight",
93
+ "encoderLayer.3.feedForward.outputDense.bias": "encoder.layer.3.output.dense.bias",
94
+ "encoderLayer.3.ffnLayerNorm.weight": "encoder.layer.3.output.LayerNorm.weight",
95
+ "encoderLayer.3.ffnLayerNorm.bias": "encoder.layer.3.output.LayerNorm.bias",
96
+ "encoderLayer.4.multiHeadAttention.q.weight": "encoder.layer.4.attention.self.query.weight",
97
+ "encoderLayer.4.multiHeadAttention.q.bias": "encoder.layer.4.attention.self.query.bias",
98
+ "encoderLayer.4.multiHeadAttention.k.weight": "encoder.layer.4.attention.self.key.weight",
99
+ "encoderLayer.4.multiHeadAttention.k.bias": "encoder.layer.4.attention.self.key.bias",
100
+ "encoderLayer.4.multiHeadAttention.v.weight": "encoder.layer.4.attention.self.value.weight",
101
+ "encoderLayer.4.multiHeadAttention.v.bias": "encoder.layer.4.attention.self.value.bias",
102
+ "encoderLayer.4.multiHeadAttention.o.weight": "encoder.layer.4.attention.output.dense.weight",
103
+ "encoderLayer.4.multiHeadAttention.o.bias": "encoder.layer.4.attention.output.dense.bias",
104
+ "encoderLayer.4.attnLayerNorm.weight": "encoder.layer.4.attention.output.LayerNorm.weight",
105
+ "encoderLayer.4.attnLayerNorm.bias": "encoder.layer.4.attention.output.LayerNorm.bias",
106
+ "encoderLayer.4.feedForward.intermediateDense.weight": "encoder.layer.4.intermediate.dense.weight",
107
+ "encoderLayer.4.feedForward.intermediateDense.bias": "encoder.layer.4.intermediate.dense.bias",
108
+ "encoderLayer.4.feedForward.outputDense.weight": "encoder.layer.4.output.dense.weight",
109
+ "encoderLayer.4.feedForward.outputDense.bias": "encoder.layer.4.output.dense.bias",
110
+ "encoderLayer.4.ffnLayerNorm.weight": "encoder.layer.4.output.LayerNorm.weight",
111
+ "encoderLayer.4.ffnLayerNorm.bias": "encoder.layer.4.output.LayerNorm.bias",
112
+ "encoderLayer.5.multiHeadAttention.q.weight": "encoder.layer.5.attention.self.query.weight",
113
+ "encoderLayer.5.multiHeadAttention.q.bias": "encoder.layer.5.attention.self.query.bias",
114
+ "encoderLayer.5.multiHeadAttention.k.weight": "encoder.layer.5.attention.self.key.weight",
115
+ "encoderLayer.5.multiHeadAttention.k.bias": "encoder.layer.5.attention.self.key.bias",
116
+ "encoderLayer.5.multiHeadAttention.v.weight": "encoder.layer.5.attention.self.value.weight",
117
+ "encoderLayer.5.multiHeadAttention.v.bias": "encoder.layer.5.attention.self.value.bias",
118
+ "encoderLayer.5.multiHeadAttention.o.weight": "encoder.layer.5.attention.output.dense.weight",
119
+ "encoderLayer.5.multiHeadAttention.o.bias": "encoder.layer.5.attention.output.dense.bias",
120
+ "encoderLayer.5.attnLayerNorm.weight": "encoder.layer.5.attention.output.LayerNorm.weight",
121
+ "encoderLayer.5.attnLayerNorm.bias": "encoder.layer.5.attention.output.LayerNorm.bias",
122
+ "encoderLayer.5.feedForward.intermediateDense.weight": "encoder.layer.5.intermediate.dense.weight",
123
+ "encoderLayer.5.feedForward.intermediateDense.bias": "encoder.layer.5.intermediate.dense.bias",
124
+ "encoderLayer.5.feedForward.outputDense.weight": "encoder.layer.5.output.dense.weight",
125
+ "encoderLayer.5.feedForward.outputDense.bias": "encoder.layer.5.output.dense.bias",
126
+ "encoderLayer.5.ffnLayerNorm.weight": "encoder.layer.5.output.LayerNorm.weight",
127
+ "encoderLayer.5.ffnLayerNorm.bias": "encoder.layer.5.output.LayerNorm.bias",
128
+ "encoderLayer.6.multiHeadAttention.q.weight": "encoder.layer.6.attention.self.query.weight",
129
+ "encoderLayer.6.multiHeadAttention.q.bias": "encoder.layer.6.attention.self.query.bias",
130
+ "encoderLayer.6.multiHeadAttention.k.weight": "encoder.layer.6.attention.self.key.weight",
131
+ "encoderLayer.6.multiHeadAttention.k.bias": "encoder.layer.6.attention.self.key.bias",
132
+ "encoderLayer.6.multiHeadAttention.v.weight": "encoder.layer.6.attention.self.value.weight",
133
+ "encoderLayer.6.multiHeadAttention.v.bias": "encoder.layer.6.attention.self.value.bias",
134
+ "encoderLayer.6.multiHeadAttention.o.weight": "encoder.layer.6.attention.output.dense.weight",
135
+ "encoderLayer.6.multiHeadAttention.o.bias": "encoder.layer.6.attention.output.dense.bias",
136
+ "encoderLayer.6.attnLayerNorm.weight": "encoder.layer.6.attention.output.LayerNorm.weight",
137
+ "encoderLayer.6.attnLayerNorm.bias": "encoder.layer.6.attention.output.LayerNorm.bias",
138
+ "encoderLayer.6.feedForward.intermediateDense.weight": "encoder.layer.6.intermediate.dense.weight",
139
+ "encoderLayer.6.feedForward.intermediateDense.bias": "encoder.layer.6.intermediate.dense.bias",
140
+ "encoderLayer.6.feedForward.outputDense.weight": "encoder.layer.6.output.dense.weight",
141
+ "encoderLayer.6.feedForward.outputDense.bias": "encoder.layer.6.output.dense.bias",
142
+ "encoderLayer.6.ffnLayerNorm.weight": "encoder.layer.6.output.LayerNorm.weight",
143
+ "encoderLayer.6.ffnLayerNorm.bias": "encoder.layer.6.output.LayerNorm.bias",
144
+ "encoderLayer.7.multiHeadAttention.q.weight": "encoder.layer.7.attention.self.query.weight",
145
+ "encoderLayer.7.multiHeadAttention.q.bias": "encoder.layer.7.attention.self.query.bias",
146
+ "encoderLayer.7.multiHeadAttention.k.weight": "encoder.layer.7.attention.self.key.weight",
147
+ "encoderLayer.7.multiHeadAttention.k.bias": "encoder.layer.7.attention.self.key.bias",
148
+ "encoderLayer.7.multiHeadAttention.v.weight": "encoder.layer.7.attention.self.value.weight",
149
+ "encoderLayer.7.multiHeadAttention.v.bias": "encoder.layer.7.attention.self.value.bias",
150
+ "encoderLayer.7.multiHeadAttention.o.weight": "encoder.layer.7.attention.output.dense.weight",
151
+ "encoderLayer.7.multiHeadAttention.o.bias": "encoder.layer.7.attention.output.dense.bias",
152
+ "encoderLayer.7.attnLayerNorm.weight": "encoder.layer.7.attention.output.LayerNorm.weight",
153
+ "encoderLayer.7.attnLayerNorm.bias": "encoder.layer.7.attention.output.LayerNorm.bias",
154
+ "encoderLayer.7.feedForward.intermediateDense.weight": "encoder.layer.7.intermediate.dense.weight",
155
+ "encoderLayer.7.feedForward.intermediateDense.bias": "encoder.layer.7.intermediate.dense.bias",
156
+ "encoderLayer.7.feedForward.outputDense.weight": "encoder.layer.7.output.dense.weight",
157
+ "encoderLayer.7.feedForward.outputDense.bias": "encoder.layer.7.output.dense.bias",
158
+ "encoderLayer.7.ffnLayerNorm.weight": "encoder.layer.7.output.LayerNorm.weight",
159
+ "encoderLayer.7.ffnLayerNorm.bias": "encoder.layer.7.output.LayerNorm.bias",
160
+ "encoderLayer.8.multiHeadAttention.q.weight": "encoder.layer.8.attention.self.query.weight",
161
+ "encoderLayer.8.multiHeadAttention.q.bias": "encoder.layer.8.attention.self.query.bias",
162
+ "encoderLayer.8.multiHeadAttention.k.weight": "encoder.layer.8.attention.self.key.weight",
163
+ "encoderLayer.8.multiHeadAttention.k.bias": "encoder.layer.8.attention.self.key.bias",
164
+ "encoderLayer.8.multiHeadAttention.v.weight": "encoder.layer.8.attention.self.value.weight",
165
+ "encoderLayer.8.multiHeadAttention.v.bias": "encoder.layer.8.attention.self.value.bias",
166
+ "encoderLayer.8.multiHeadAttention.o.weight": "encoder.layer.8.attention.output.dense.weight",
167
+ "encoderLayer.8.multiHeadAttention.o.bias": "encoder.layer.8.attention.output.dense.bias",
168
+ "encoderLayer.8.attnLayerNorm.weight": "encoder.layer.8.attention.output.LayerNorm.weight",
169
+ "encoderLayer.8.attnLayerNorm.bias": "encoder.layer.8.attention.output.LayerNorm.bias",
170
+ "encoderLayer.8.feedForward.intermediateDense.weight": "encoder.layer.8.intermediate.dense.weight",
171
+ "encoderLayer.8.feedForward.intermediateDense.bias": "encoder.layer.8.intermediate.dense.bias",
172
+ "encoderLayer.8.feedForward.outputDense.weight": "encoder.layer.8.output.dense.weight",
173
+ "encoderLayer.8.feedForward.outputDense.bias": "encoder.layer.8.output.dense.bias",
174
+ "encoderLayer.8.ffnLayerNorm.weight": "encoder.layer.8.output.LayerNorm.weight",
175
+ "encoderLayer.8.ffnLayerNorm.bias": "encoder.layer.8.output.LayerNorm.bias",
176
+ "encoderLayer.9.multiHeadAttention.q.weight": "encoder.layer.9.attention.self.query.weight",
177
+ "encoderLayer.9.multiHeadAttention.q.bias": "encoder.layer.9.attention.self.query.bias",
178
+ "encoderLayer.9.multiHeadAttention.k.weight": "encoder.layer.9.attention.self.key.weight",
179
+ "encoderLayer.9.multiHeadAttention.k.bias": "encoder.layer.9.attention.self.key.bias",
180
+ "encoderLayer.9.multiHeadAttention.v.weight": "encoder.layer.9.attention.self.value.weight",
181
+ "encoderLayer.9.multiHeadAttention.v.bias": "encoder.layer.9.attention.self.value.bias",
182
+ "encoderLayer.9.multiHeadAttention.o.weight": "encoder.layer.9.attention.output.dense.weight",
183
+ "encoderLayer.9.multiHeadAttention.o.bias": "encoder.layer.9.attention.output.dense.bias",
184
+ "encoderLayer.9.attnLayerNorm.weight": "encoder.layer.9.attention.output.LayerNorm.weight",
185
+ "encoderLayer.9.attnLayerNorm.bias": "encoder.layer.9.attention.output.LayerNorm.bias",
186
+ "encoderLayer.9.feedForward.intermediateDense.weight": "encoder.layer.9.intermediate.dense.weight",
187
+ "encoderLayer.9.feedForward.intermediateDense.bias": "encoder.layer.9.intermediate.dense.bias",
188
+ "encoderLayer.9.feedForward.outputDense.weight": "encoder.layer.9.output.dense.weight",
189
+ "encoderLayer.9.feedForward.outputDense.bias": "encoder.layer.9.output.dense.bias",
190
+ "encoderLayer.9.ffnLayerNorm.weight": "encoder.layer.9.output.LayerNorm.weight",
191
+ "encoderLayer.9.ffnLayerNorm.bias": "encoder.layer.9.output.LayerNorm.bias",
192
+ "encoderLayer.10.multiHeadAttention.q.weight": "encoder.layer.10.attention.self.query.weight",
193
+ "encoderLayer.10.multiHeadAttention.q.bias": "encoder.layer.10.attention.self.query.bias",
194
+ "encoderLayer.10.multiHeadAttention.k.weight": "encoder.layer.10.attention.self.key.weight",
195
+ "encoderLayer.10.multiHeadAttention.k.bias": "encoder.layer.10.attention.self.key.bias",
196
+ "encoderLayer.10.multiHeadAttention.v.weight": "encoder.layer.10.attention.self.value.weight",
197
+ "encoderLayer.10.multiHeadAttention.v.bias": "encoder.layer.10.attention.self.value.bias",
198
+ "encoderLayer.10.multiHeadAttention.o.weight": "encoder.layer.10.attention.output.dense.weight",
199
+ "encoderLayer.10.multiHeadAttention.o.bias": "encoder.layer.10.attention.output.dense.bias",
200
+ "encoderLayer.10.attnLayerNorm.weight": "encoder.layer.10.attention.output.LayerNorm.weight",
201
+ "encoderLayer.10.attnLayerNorm.bias": "encoder.layer.10.attention.output.LayerNorm.bias",
202
+ "encoderLayer.10.feedForward.intermediateDense.weight": "encoder.layer.10.intermediate.dense.weight",
203
+ "encoderLayer.10.feedForward.intermediateDense.bias": "encoder.layer.10.intermediate.dense.bias",
204
+ "encoderLayer.10.feedForward.outputDense.weight": "encoder.layer.10.output.dense.weight",
205
+ "encoderLayer.10.feedForward.outputDense.bias": "encoder.layer.10.output.dense.bias",
206
+ "encoderLayer.10.ffnLayerNorm.weight": "encoder.layer.10.output.LayerNorm.weight",
207
+ "encoderLayer.10.ffnLayerNorm.bias": "encoder.layer.10.output.LayerNorm.bias",
208
+ "encoderLayer.11.multiHeadAttention.q.weight": "encoder.layer.11.attention.self.query.weight",
209
+ "encoderLayer.11.multiHeadAttention.q.bias": "encoder.layer.11.attention.self.query.bias",
210
+ "encoderLayer.11.multiHeadAttention.k.weight": "encoder.layer.11.attention.self.key.weight",
211
+ "encoderLayer.11.multiHeadAttention.k.bias": "encoder.layer.11.attention.self.key.bias",
212
+ "encoderLayer.11.multiHeadAttention.v.weight": "encoder.layer.11.attention.self.value.weight",
213
+ "encoderLayer.11.multiHeadAttention.v.bias": "encoder.layer.11.attention.self.value.bias",
214
+ "encoderLayer.11.multiHeadAttention.o.weight": "encoder.layer.11.attention.output.dense.weight",
215
+ "encoderLayer.11.multiHeadAttention.o.bias": "encoder.layer.11.attention.output.dense.bias",
216
+ "encoderLayer.11.attnLayerNorm.weight": "encoder.layer.11.attention.output.LayerNorm.weight",
217
+ "encoderLayer.11.attnLayerNorm.bias": "encoder.layer.11.attention.output.LayerNorm.bias",
218
+ "encoderLayer.11.feedForward.intermediateDense.weight": "encoder.layer.11.intermediate.dense.weight",
219
+ "encoderLayer.11.feedForward.intermediateDense.bias": "encoder.layer.11.intermediate.dense.bias",
220
+ "encoderLayer.11.feedForward.outputDense.weight": "encoder.layer.11.output.dense.weight",
221
+ "encoderLayer.11.feedForward.outputDense.bias": "encoder.layer.11.output.dense.bias",
222
+ "encoderLayer.11.ffnLayerNorm.weight": "encoder.layer.11.output.LayerNorm.weight",
223
+ "encoderLayer.11.ffnLayerNorm.bias": "encoder.layer.11.output.LayerNorm.bias",
224
+ "encoderLayer.12.multiHeadAttention.q.weight": "encoder.layer.12.attention.self.query.weight",
225
+ "encoderLayer.12.multiHeadAttention.q.bias": "encoder.layer.12.attention.self.query.bias",
226
+ "encoderLayer.12.multiHeadAttention.k.weight": "encoder.layer.12.attention.self.key.weight",
227
+ "encoderLayer.12.multiHeadAttention.k.bias": "encoder.layer.12.attention.self.key.bias",
228
+ "encoderLayer.12.multiHeadAttention.v.weight": "encoder.layer.12.attention.self.value.weight",
229
+ "encoderLayer.12.multiHeadAttention.v.bias": "encoder.layer.12.attention.self.value.bias",
230
+ "encoderLayer.12.multiHeadAttention.o.weight": "encoder.layer.12.attention.output.dense.weight",
231
+ "encoderLayer.12.multiHeadAttention.o.bias": "encoder.layer.12.attention.output.dense.bias",
232
+ "encoderLayer.12.attnLayerNorm.weight": "encoder.layer.12.attention.output.LayerNorm.weight",
233
+ "encoderLayer.12.attnLayerNorm.bias": "encoder.layer.12.attention.output.LayerNorm.bias",
234
+ "encoderLayer.12.feedForward.intermediateDense.weight": "encoder.layer.12.intermediate.dense.weight",
235
+ "encoderLayer.12.feedForward.intermediateDense.bias": "encoder.layer.12.intermediate.dense.bias",
236
+ "encoderLayer.12.feedForward.outputDense.weight": "encoder.layer.12.output.dense.weight",
237
+ "encoderLayer.12.feedForward.outputDense.bias": "encoder.layer.12.output.dense.bias",
238
+ "encoderLayer.12.ffnLayerNorm.weight": "encoder.layer.12.output.LayerNorm.weight",
239
+ "encoderLayer.12.ffnLayerNorm.bias": "encoder.layer.12.output.LayerNorm.bias",
240
+ "encoderLayer.13.multiHeadAttention.q.weight": "encoder.layer.13.attention.self.query.weight",
241
+ "encoderLayer.13.multiHeadAttention.q.bias": "encoder.layer.13.attention.self.query.bias",
242
+ "encoderLayer.13.multiHeadAttention.k.weight": "encoder.layer.13.attention.self.key.weight",
243
+ "encoderLayer.13.multiHeadAttention.k.bias": "encoder.layer.13.attention.self.key.bias",
244
+ "encoderLayer.13.multiHeadAttention.v.weight": "encoder.layer.13.attention.self.value.weight",
245
+ "encoderLayer.13.multiHeadAttention.v.bias": "encoder.layer.13.attention.self.value.bias",
246
+ "encoderLayer.13.multiHeadAttention.o.weight": "encoder.layer.13.attention.output.dense.weight",
247
+ "encoderLayer.13.multiHeadAttention.o.bias": "encoder.layer.13.attention.output.dense.bias",
248
+ "encoderLayer.13.attnLayerNorm.weight": "encoder.layer.13.attention.output.LayerNorm.weight",
249
+ "encoderLayer.13.attnLayerNorm.bias": "encoder.layer.13.attention.output.LayerNorm.bias",
250
+ "encoderLayer.13.feedForward.intermediateDense.weight": "encoder.layer.13.intermediate.dense.weight",
251
+ "encoderLayer.13.feedForward.intermediateDense.bias": "encoder.layer.13.intermediate.dense.bias",
252
+ "encoderLayer.13.feedForward.outputDense.weight": "encoder.layer.13.output.dense.weight",
253
+ "encoderLayer.13.feedForward.outputDense.bias": "encoder.layer.13.output.dense.bias",
254
+ "encoderLayer.13.ffnLayerNorm.weight": "encoder.layer.13.output.LayerNorm.weight",
255
+ "encoderLayer.13.ffnLayerNorm.bias": "encoder.layer.13.output.LayerNorm.bias",
256
+ "encoderLayer.14.multiHeadAttention.q.weight": "encoder.layer.14.attention.self.query.weight",
257
+ "encoderLayer.14.multiHeadAttention.q.bias": "encoder.layer.14.attention.self.query.bias",
258
+ "encoderLayer.14.multiHeadAttention.k.weight": "encoder.layer.14.attention.self.key.weight",
259
+ "encoderLayer.14.multiHeadAttention.k.bias": "encoder.layer.14.attention.self.key.bias",
260
+ "encoderLayer.14.multiHeadAttention.v.weight": "encoder.layer.14.attention.self.value.weight",
261
+ "encoderLayer.14.multiHeadAttention.v.bias": "encoder.layer.14.attention.self.value.bias",
262
+ "encoderLayer.14.multiHeadAttention.o.weight": "encoder.layer.14.attention.output.dense.weight",
263
+ "encoderLayer.14.multiHeadAttention.o.bias": "encoder.layer.14.attention.output.dense.bias",
264
+ "encoderLayer.14.attnLayerNorm.weight": "encoder.layer.14.attention.output.LayerNorm.weight",
265
+ "encoderLayer.14.attnLayerNorm.bias": "encoder.layer.14.attention.output.LayerNorm.bias",
266
+ "encoderLayer.14.feedForward.intermediateDense.weight": "encoder.layer.14.intermediate.dense.weight",
267
+ "encoderLayer.14.feedForward.intermediateDense.bias": "encoder.layer.14.intermediate.dense.bias",
268
+ "encoderLayer.14.feedForward.outputDense.weight": "encoder.layer.14.output.dense.weight",
269
+ "encoderLayer.14.feedForward.outputDense.bias": "encoder.layer.14.output.dense.bias",
270
+ "encoderLayer.14.ffnLayerNorm.weight": "encoder.layer.14.output.LayerNorm.weight",
271
+ "encoderLayer.14.ffnLayerNorm.bias": "encoder.layer.14.output.LayerNorm.bias",
272
+ "encoderLayer.15.multiHeadAttention.q.weight": "encoder.layer.15.attention.self.query.weight",
273
+ "encoderLayer.15.multiHeadAttention.q.bias": "encoder.layer.15.attention.self.query.bias",
274
+ "encoderLayer.15.multiHeadAttention.k.weight": "encoder.layer.15.attention.self.key.weight",
275
+ "encoderLayer.15.multiHeadAttention.k.bias": "encoder.layer.15.attention.self.key.bias",
276
+ "encoderLayer.15.multiHeadAttention.v.weight": "encoder.layer.15.attention.self.value.weight",
277
+ "encoderLayer.15.multiHeadAttention.v.bias": "encoder.layer.15.attention.self.value.bias",
278
+ "encoderLayer.15.multiHeadAttention.o.weight": "encoder.layer.15.attention.output.dense.weight",
279
+ "encoderLayer.15.multiHeadAttention.o.bias": "encoder.layer.15.attention.output.dense.bias",
280
+ "encoderLayer.15.attnLayerNorm.weight": "encoder.layer.15.attention.output.LayerNorm.weight",
281
+ "encoderLayer.15.attnLayerNorm.bias": "encoder.layer.15.attention.output.LayerNorm.bias",
282
+ "encoderLayer.15.feedForward.intermediateDense.weight": "encoder.layer.15.intermediate.dense.weight",
283
+ "encoderLayer.15.feedForward.intermediateDense.bias": "encoder.layer.15.intermediate.dense.bias",
284
+ "encoderLayer.15.feedForward.outputDense.weight": "encoder.layer.15.output.dense.weight",
285
+ "encoderLayer.15.feedForward.outputDense.bias": "encoder.layer.15.output.dense.bias",
286
+ "encoderLayer.15.ffnLayerNorm.weight": "encoder.layer.15.output.LayerNorm.weight",
287
+ "encoderLayer.15.ffnLayerNorm.bias": "encoder.layer.15.output.LayerNorm.bias",
288
+ "encoderLayer.16.multiHeadAttention.q.weight": "encoder.layer.16.attention.self.query.weight",
289
+ "encoderLayer.16.multiHeadAttention.q.bias": "encoder.layer.16.attention.self.query.bias",
290
+ "encoderLayer.16.multiHeadAttention.k.weight": "encoder.layer.16.attention.self.key.weight",
291
+ "encoderLayer.16.multiHeadAttention.k.bias": "encoder.layer.16.attention.self.key.bias",
292
+ "encoderLayer.16.multiHeadAttention.v.weight": "encoder.layer.16.attention.self.value.weight",
293
+ "encoderLayer.16.multiHeadAttention.v.bias": "encoder.layer.16.attention.self.value.bias",
294
+ "encoderLayer.16.multiHeadAttention.o.weight": "encoder.layer.16.attention.output.dense.weight",
295
+ "encoderLayer.16.multiHeadAttention.o.bias": "encoder.layer.16.attention.output.dense.bias",
296
+ "encoderLayer.16.attnLayerNorm.weight": "encoder.layer.16.attention.output.LayerNorm.weight",
297
+ "encoderLayer.16.attnLayerNorm.bias": "encoder.layer.16.attention.output.LayerNorm.bias",
298
+ "encoderLayer.16.feedForward.intermediateDense.weight": "encoder.layer.16.intermediate.dense.weight",
299
+ "encoderLayer.16.feedForward.intermediateDense.bias": "encoder.layer.16.intermediate.dense.bias",
300
+ "encoderLayer.16.feedForward.outputDense.weight": "encoder.layer.16.output.dense.weight",
301
+ "encoderLayer.16.feedForward.outputDense.bias": "encoder.layer.16.output.dense.bias",
302
+ "encoderLayer.16.ffnLayerNorm.weight": "encoder.layer.16.output.LayerNorm.weight",
303
+ "encoderLayer.16.ffnLayerNorm.bias": "encoder.layer.16.output.LayerNorm.bias",
304
+ "encoderLayer.17.multiHeadAttention.q.weight": "encoder.layer.17.attention.self.query.weight",
305
+ "encoderLayer.17.multiHeadAttention.q.bias": "encoder.layer.17.attention.self.query.bias",
306
+ "encoderLayer.17.multiHeadAttention.k.weight": "encoder.layer.17.attention.self.key.weight",
307
+ "encoderLayer.17.multiHeadAttention.k.bias": "encoder.layer.17.attention.self.key.bias",
308
+ "encoderLayer.17.multiHeadAttention.v.weight": "encoder.layer.17.attention.self.value.weight",
309
+ "encoderLayer.17.multiHeadAttention.v.bias": "encoder.layer.17.attention.self.value.bias",
310
+ "encoderLayer.17.multiHeadAttention.o.weight": "encoder.layer.17.attention.output.dense.weight",
311
+ "encoderLayer.17.multiHeadAttention.o.bias": "encoder.layer.17.attention.output.dense.bias",
312
+ "encoderLayer.17.attnLayerNorm.weight": "encoder.layer.17.attention.output.LayerNorm.weight",
313
+ "encoderLayer.17.attnLayerNorm.bias": "encoder.layer.17.attention.output.LayerNorm.bias",
314
+ "encoderLayer.17.feedForward.intermediateDense.weight": "encoder.layer.17.intermediate.dense.weight",
315
+ "encoderLayer.17.feedForward.intermediateDense.bias": "encoder.layer.17.intermediate.dense.bias",
316
+ "encoderLayer.17.feedForward.outputDense.weight": "encoder.layer.17.output.dense.weight",
317
+ "encoderLayer.17.feedForward.outputDense.bias": "encoder.layer.17.output.dense.bias",
318
+ "encoderLayer.17.ffnLayerNorm.weight": "encoder.layer.17.output.LayerNorm.weight",
319
+ "encoderLayer.17.ffnLayerNorm.bias": "encoder.layer.17.output.LayerNorm.bias",
320
+ "encoderLayer.18.multiHeadAttention.q.weight": "encoder.layer.18.attention.self.query.weight",
321
+ "encoderLayer.18.multiHeadAttention.q.bias": "encoder.layer.18.attention.self.query.bias",
322
+ "encoderLayer.18.multiHeadAttention.k.weight": "encoder.layer.18.attention.self.key.weight",
323
+ "encoderLayer.18.multiHeadAttention.k.bias": "encoder.layer.18.attention.self.key.bias",
324
+ "encoderLayer.18.multiHeadAttention.v.weight": "encoder.layer.18.attention.self.value.weight",
325
+ "encoderLayer.18.multiHeadAttention.v.bias": "encoder.layer.18.attention.self.value.bias",
326
+ "encoderLayer.18.multiHeadAttention.o.weight": "encoder.layer.18.attention.output.dense.weight",
327
+ "encoderLayer.18.multiHeadAttention.o.bias": "encoder.layer.18.attention.output.dense.bias",
328
+ "encoderLayer.18.attnLayerNorm.weight": "encoder.layer.18.attention.output.LayerNorm.weight",
329
+ "encoderLayer.18.attnLayerNorm.bias": "encoder.layer.18.attention.output.LayerNorm.bias",
330
+ "encoderLayer.18.feedForward.intermediateDense.weight": "encoder.layer.18.intermediate.dense.weight",
331
+ "encoderLayer.18.feedForward.intermediateDense.bias": "encoder.layer.18.intermediate.dense.bias",
332
+ "encoderLayer.18.feedForward.outputDense.weight": "encoder.layer.18.output.dense.weight",
333
+ "encoderLayer.18.feedForward.outputDense.bias": "encoder.layer.18.output.dense.bias",
334
+ "encoderLayer.18.ffnLayerNorm.weight": "encoder.layer.18.output.LayerNorm.weight",
335
+ "encoderLayer.18.ffnLayerNorm.bias": "encoder.layer.18.output.LayerNorm.bias",
336
+ "encoderLayer.19.multiHeadAttention.q.weight": "encoder.layer.19.attention.self.query.weight",
337
+ "encoderLayer.19.multiHeadAttention.q.bias": "encoder.layer.19.attention.self.query.bias",
338
+ "encoderLayer.19.multiHeadAttention.k.weight": "encoder.layer.19.attention.self.key.weight",
339
+ "encoderLayer.19.multiHeadAttention.k.bias": "encoder.layer.19.attention.self.key.bias",
340
+ "encoderLayer.19.multiHeadAttention.v.weight": "encoder.layer.19.attention.self.value.weight",
341
+ "encoderLayer.19.multiHeadAttention.v.bias": "encoder.layer.19.attention.self.value.bias",
342
+ "encoderLayer.19.multiHeadAttention.o.weight": "encoder.layer.19.attention.output.dense.weight",
343
+ "encoderLayer.19.multiHeadAttention.o.bias": "encoder.layer.19.attention.output.dense.bias",
344
+ "encoderLayer.19.attnLayerNorm.weight": "encoder.layer.19.attention.output.LayerNorm.weight",
345
+ "encoderLayer.19.attnLayerNorm.bias": "encoder.layer.19.attention.output.LayerNorm.bias",
346
+ "encoderLayer.19.feedForward.intermediateDense.weight": "encoder.layer.19.intermediate.dense.weight",
347
+ "encoderLayer.19.feedForward.intermediateDense.bias": "encoder.layer.19.intermediate.dense.bias",
348
+ "encoderLayer.19.feedForward.outputDense.weight": "encoder.layer.19.output.dense.weight",
349
+ "encoderLayer.19.feedForward.outputDense.bias": "encoder.layer.19.output.dense.bias",
350
+ "encoderLayer.19.ffnLayerNorm.weight": "encoder.layer.19.output.LayerNorm.weight",
351
+ "encoderLayer.19.ffnLayerNorm.bias": "encoder.layer.19.output.LayerNorm.bias",
352
+ "encoderLayer.20.multiHeadAttention.q.weight": "encoder.layer.20.attention.self.query.weight",
353
+ "encoderLayer.20.multiHeadAttention.q.bias": "encoder.layer.20.attention.self.query.bias",
354
+ "encoderLayer.20.multiHeadAttention.k.weight": "encoder.layer.20.attention.self.key.weight",
355
+ "encoderLayer.20.multiHeadAttention.k.bias": "encoder.layer.20.attention.self.key.bias",
356
+ "encoderLayer.20.multiHeadAttention.v.weight": "encoder.layer.20.attention.self.value.weight",
357
+ "encoderLayer.20.multiHeadAttention.v.bias": "encoder.layer.20.attention.self.value.bias",
358
+ "encoderLayer.20.multiHeadAttention.o.weight": "encoder.layer.20.attention.output.dense.weight",
359
+ "encoderLayer.20.multiHeadAttention.o.bias": "encoder.layer.20.attention.output.dense.bias",
360
+ "encoderLayer.20.attnLayerNorm.weight": "encoder.layer.20.attention.output.LayerNorm.weight",
361
+ "encoderLayer.20.attnLayerNorm.bias": "encoder.layer.20.attention.output.LayerNorm.bias",
362
+ "encoderLayer.20.feedForward.intermediateDense.weight": "encoder.layer.20.intermediate.dense.weight",
363
+ "encoderLayer.20.feedForward.intermediateDense.bias": "encoder.layer.20.intermediate.dense.bias",
364
+ "encoderLayer.20.feedForward.outputDense.weight": "encoder.layer.20.output.dense.weight",
365
+ "encoderLayer.20.feedForward.outputDense.bias": "encoder.layer.20.output.dense.bias",
366
+ "encoderLayer.20.ffnLayerNorm.weight": "encoder.layer.20.output.LayerNorm.weight",
367
+ "encoderLayer.20.ffnLayerNorm.bias": "encoder.layer.20.output.LayerNorm.bias",
368
+ "encoderLayer.21.multiHeadAttention.q.weight": "encoder.layer.21.attention.self.query.weight",
369
+ "encoderLayer.21.multiHeadAttention.q.bias": "encoder.layer.21.attention.self.query.bias",
370
+ "encoderLayer.21.multiHeadAttention.k.weight": "encoder.layer.21.attention.self.key.weight",
371
+ "encoderLayer.21.multiHeadAttention.k.bias": "encoder.layer.21.attention.self.key.bias",
372
+ "encoderLayer.21.multiHeadAttention.v.weight": "encoder.layer.21.attention.self.value.weight",
373
+ "encoderLayer.21.multiHeadAttention.v.bias": "encoder.layer.21.attention.self.value.bias",
374
+ "encoderLayer.21.multiHeadAttention.o.weight": "encoder.layer.21.attention.output.dense.weight",
375
+ "encoderLayer.21.multiHeadAttention.o.bias": "encoder.layer.21.attention.output.dense.bias",
376
+ "encoderLayer.21.attnLayerNorm.weight": "encoder.layer.21.attention.output.LayerNorm.weight",
377
+ "encoderLayer.21.attnLayerNorm.bias": "encoder.layer.21.attention.output.LayerNorm.bias",
378
+ "encoderLayer.21.feedForward.intermediateDense.weight": "encoder.layer.21.intermediate.dense.weight",
379
+ "encoderLayer.21.feedForward.intermediateDense.bias": "encoder.layer.21.intermediate.dense.bias",
380
+ "encoderLayer.21.feedForward.outputDense.weight": "encoder.layer.21.output.dense.weight",
381
+ "encoderLayer.21.feedForward.outputDense.bias": "encoder.layer.21.output.dense.bias",
382
+ "encoderLayer.21.ffnLayerNorm.weight": "encoder.layer.21.output.LayerNorm.weight",
383
+ "encoderLayer.21.ffnLayerNorm.bias": "encoder.layer.21.output.LayerNorm.bias",
384
+ "encoderLayer.22.multiHeadAttention.q.weight": "encoder.layer.22.attention.self.query.weight",
385
+ "encoderLayer.22.multiHeadAttention.q.bias": "encoder.layer.22.attention.self.query.bias",
386
+ "encoderLayer.22.multiHeadAttention.k.weight": "encoder.layer.22.attention.self.key.weight",
387
+ "encoderLayer.22.multiHeadAttention.k.bias": "encoder.layer.22.attention.self.key.bias",
388
+ "encoderLayer.22.multiHeadAttention.v.weight": "encoder.layer.22.attention.self.value.weight",
389
+ "encoderLayer.22.multiHeadAttention.v.bias": "encoder.layer.22.attention.self.value.bias",
390
+ "encoderLayer.22.multiHeadAttention.o.weight": "encoder.layer.22.attention.output.dense.weight",
391
+ "encoderLayer.22.multiHeadAttention.o.bias": "encoder.layer.22.attention.output.dense.bias",
392
+ "encoderLayer.22.attnLayerNorm.weight": "encoder.layer.22.attention.output.LayerNorm.weight",
393
+ "encoderLayer.22.attnLayerNorm.bias": "encoder.layer.22.attention.output.LayerNorm.bias",
394
+ "encoderLayer.22.feedForward.intermediateDense.weight": "encoder.layer.22.intermediate.dense.weight",
395
+ "encoderLayer.22.feedForward.intermediateDense.bias": "encoder.layer.22.intermediate.dense.bias",
396
+ "encoderLayer.22.feedForward.outputDense.weight": "encoder.layer.22.output.dense.weight",
397
+ "encoderLayer.22.feedForward.outputDense.bias": "encoder.layer.22.output.dense.bias",
398
+ "encoderLayer.22.ffnLayerNorm.weight": "encoder.layer.22.output.LayerNorm.weight",
399
+ "encoderLayer.22.ffnLayerNorm.bias": "encoder.layer.22.output.LayerNorm.bias",
400
+ "encoderLayer.23.multiHeadAttention.q.weight": "encoder.layer.23.attention.self.query.weight",
401
+ "encoderLayer.23.multiHeadAttention.q.bias": "encoder.layer.23.attention.self.query.bias",
402
+ "encoderLayer.23.multiHeadAttention.k.weight": "encoder.layer.23.attention.self.key.weight",
403
+ "encoderLayer.23.multiHeadAttention.k.bias": "encoder.layer.23.attention.self.key.bias",
404
+ "encoderLayer.23.multiHeadAttention.v.weight": "encoder.layer.23.attention.self.value.weight",
405
+ "encoderLayer.23.multiHeadAttention.v.bias": "encoder.layer.23.attention.self.value.bias",
406
+ "encoderLayer.23.multiHeadAttention.o.weight": "encoder.layer.23.attention.output.dense.weight",
407
+ "encoderLayer.23.multiHeadAttention.o.bias": "encoder.layer.23.attention.output.dense.bias",
408
+ "encoderLayer.23.attnLayerNorm.weight": "encoder.layer.23.attention.output.LayerNorm.weight",
409
+ "encoderLayer.23.attnLayerNorm.bias": "encoder.layer.23.attention.output.LayerNorm.bias",
410
+ "encoderLayer.23.feedForward.intermediateDense.weight": "encoder.layer.23.intermediate.dense.weight",
411
+ "encoderLayer.23.feedForward.intermediateDense.bias": "encoder.layer.23.intermediate.dense.bias",
412
+ "encoderLayer.23.feedForward.outputDense.weight": "encoder.layer.23.output.dense.weight",
413
+ "encoderLayer.23.feedForward.outputDense.bias": "encoder.layer.23.output.dense.bias",
414
+ "encoderLayer.23.ffnLayerNorm.weight": "encoder.layer.23.output.LayerNorm.weight",
415
+ "encoderLayer.23.ffnLayerNorm.bias": "encoder.layer.23.output.LayerNorm.bias"
416
+ }
417
+ }
BAAI/bge-small-en-v1.5/bert4torch_config.json CHANGED
@@ -1,221 +1,223 @@
1
  {
2
- "attention_probs_dropout_prob": 0.1,
3
- "hidden_act": "gelu",
4
- "hidden_dropout_prob": 0.1,
5
- "hidden_size": 384,
6
- "initializer_range": 0.02,
7
- "intermediate_size": 1536,
8
- "layer_norm_eps": 1e-12,
9
- "max_position_embeddings": 512,
10
- "model": "bert",
11
- "num_attention_heads": 12,
12
- "num_hidden_layers": 12,
13
- "pad_token_id": 0,
14
- "torch_dtype": "float32",
15
- "type_vocab_size": 2,
16
- "vocab_size": 30522,
17
- "with_pool": true,
18
- "pooling": {"pool_strategy": "cls"},
19
- "norm_mode": "torch_buildin",
20
- "mapping": {
21
- "embeddings.word_embeddings.weight": "embeddings.word_embeddings.weight",
22
- "embeddings.position_embeddings.weight": "embeddings.position_embeddings.weight",
23
- "embeddings.segment_embeddings.weight": "embeddings.token_type_embeddings.weight",
24
- "embeddings.layerNorm.weight": "embeddings.LayerNorm.weight",
25
- "embeddings.layerNorm.bias": "embeddings.LayerNorm.bias",
26
- "pooler.weight": "pooler.dense.weight",
27
- "pooler.bias": "pooler.dense.bias",
28
- "encoderLayer.0.multiHeadAttention.q.weight": "encoder.layer.0.attention.self.query.weight",
29
- "encoderLayer.0.multiHeadAttention.q.bias": "encoder.layer.0.attention.self.query.bias",
30
- "encoderLayer.0.multiHeadAttention.k.weight": "encoder.layer.0.attention.self.key.weight",
31
- "encoderLayer.0.multiHeadAttention.k.bias": "encoder.layer.0.attention.self.key.bias",
32
- "encoderLayer.0.multiHeadAttention.v.weight": "encoder.layer.0.attention.self.value.weight",
33
- "encoderLayer.0.multiHeadAttention.v.bias": "encoder.layer.0.attention.self.value.bias",
34
- "encoderLayer.0.multiHeadAttention.o.weight": "encoder.layer.0.attention.output.dense.weight",
35
- "encoderLayer.0.multiHeadAttention.o.bias": "encoder.layer.0.attention.output.dense.bias",
36
- "encoderLayer.0.attnLayerNorm.weight": "encoder.layer.0.attention.output.LayerNorm.weight",
37
- "encoderLayer.0.attnLayerNorm.bias": "encoder.layer.0.attention.output.LayerNorm.bias",
38
- "encoderLayer.0.feedForward.intermediateDense.weight": "encoder.layer.0.intermediate.dense.weight",
39
- "encoderLayer.0.feedForward.intermediateDense.bias": "encoder.layer.0.intermediate.dense.bias",
40
- "encoderLayer.0.feedForward.outputDense.weight": "encoder.layer.0.output.dense.weight",
41
- "encoderLayer.0.feedForward.outputDense.bias": "encoder.layer.0.output.dense.bias",
42
- "encoderLayer.0.ffnLayerNorm.weight": "encoder.layer.0.output.LayerNorm.weight",
43
- "encoderLayer.0.ffnLayerNorm.bias": "encoder.layer.0.output.LayerNorm.bias",
44
- "encoderLayer.1.multiHeadAttention.q.weight": "encoder.layer.1.attention.self.query.weight",
45
- "encoderLayer.1.multiHeadAttention.q.bias": "encoder.layer.1.attention.self.query.bias",
46
- "encoderLayer.1.multiHeadAttention.k.weight": "encoder.layer.1.attention.self.key.weight",
47
- "encoderLayer.1.multiHeadAttention.k.bias": "encoder.layer.1.attention.self.key.bias",
48
- "encoderLayer.1.multiHeadAttention.v.weight": "encoder.layer.1.attention.self.value.weight",
49
- "encoderLayer.1.multiHeadAttention.v.bias": "encoder.layer.1.attention.self.value.bias",
50
- "encoderLayer.1.multiHeadAttention.o.weight": "encoder.layer.1.attention.output.dense.weight",
51
- "encoderLayer.1.multiHeadAttention.o.bias": "encoder.layer.1.attention.output.dense.bias",
52
- "encoderLayer.1.attnLayerNorm.weight": "encoder.layer.1.attention.output.LayerNorm.weight",
53
- "encoderLayer.1.attnLayerNorm.bias": "encoder.layer.1.attention.output.LayerNorm.bias",
54
- "encoderLayer.1.feedForward.intermediateDense.weight": "encoder.layer.1.intermediate.dense.weight",
55
- "encoderLayer.1.feedForward.intermediateDense.bias": "encoder.layer.1.intermediate.dense.bias",
56
- "encoderLayer.1.feedForward.outputDense.weight": "encoder.layer.1.output.dense.weight",
57
- "encoderLayer.1.feedForward.outputDense.bias": "encoder.layer.1.output.dense.bias",
58
- "encoderLayer.1.ffnLayerNorm.weight": "encoder.layer.1.output.LayerNorm.weight",
59
- "encoderLayer.1.ffnLayerNorm.bias": "encoder.layer.1.output.LayerNorm.bias",
60
- "encoderLayer.2.multiHeadAttention.q.weight": "encoder.layer.2.attention.self.query.weight",
61
- "encoderLayer.2.multiHeadAttention.q.bias": "encoder.layer.2.attention.self.query.bias",
62
- "encoderLayer.2.multiHeadAttention.k.weight": "encoder.layer.2.attention.self.key.weight",
63
- "encoderLayer.2.multiHeadAttention.k.bias": "encoder.layer.2.attention.self.key.bias",
64
- "encoderLayer.2.multiHeadAttention.v.weight": "encoder.layer.2.attention.self.value.weight",
65
- "encoderLayer.2.multiHeadAttention.v.bias": "encoder.layer.2.attention.self.value.bias",
66
- "encoderLayer.2.multiHeadAttention.o.weight": "encoder.layer.2.attention.output.dense.weight",
67
- "encoderLayer.2.multiHeadAttention.o.bias": "encoder.layer.2.attention.output.dense.bias",
68
- "encoderLayer.2.attnLayerNorm.weight": "encoder.layer.2.attention.output.LayerNorm.weight",
69
- "encoderLayer.2.attnLayerNorm.bias": "encoder.layer.2.attention.output.LayerNorm.bias",
70
- "encoderLayer.2.feedForward.intermediateDense.weight": "encoder.layer.2.intermediate.dense.weight",
71
- "encoderLayer.2.feedForward.intermediateDense.bias": "encoder.layer.2.intermediate.dense.bias",
72
- "encoderLayer.2.feedForward.outputDense.weight": "encoder.layer.2.output.dense.weight",
73
- "encoderLayer.2.feedForward.outputDense.bias": "encoder.layer.2.output.dense.bias",
74
- "encoderLayer.2.ffnLayerNorm.weight": "encoder.layer.2.output.LayerNorm.weight",
75
- "encoderLayer.2.ffnLayerNorm.bias": "encoder.layer.2.output.LayerNorm.bias",
76
- "encoderLayer.3.multiHeadAttention.q.weight": "encoder.layer.3.attention.self.query.weight",
77
- "encoderLayer.3.multiHeadAttention.q.bias": "encoder.layer.3.attention.self.query.bias",
78
- "encoderLayer.3.multiHeadAttention.k.weight": "encoder.layer.3.attention.self.key.weight",
79
- "encoderLayer.3.multiHeadAttention.k.bias": "encoder.layer.3.attention.self.key.bias",
80
- "encoderLayer.3.multiHeadAttention.v.weight": "encoder.layer.3.attention.self.value.weight",
81
- "encoderLayer.3.multiHeadAttention.v.bias": "encoder.layer.3.attention.self.value.bias",
82
- "encoderLayer.3.multiHeadAttention.o.weight": "encoder.layer.3.attention.output.dense.weight",
83
- "encoderLayer.3.multiHeadAttention.o.bias": "encoder.layer.3.attention.output.dense.bias",
84
- "encoderLayer.3.attnLayerNorm.weight": "encoder.layer.3.attention.output.LayerNorm.weight",
85
- "encoderLayer.3.attnLayerNorm.bias": "encoder.layer.3.attention.output.LayerNorm.bias",
86
- "encoderLayer.3.feedForward.intermediateDense.weight": "encoder.layer.3.intermediate.dense.weight",
87
- "encoderLayer.3.feedForward.intermediateDense.bias": "encoder.layer.3.intermediate.dense.bias",
88
- "encoderLayer.3.feedForward.outputDense.weight": "encoder.layer.3.output.dense.weight",
89
- "encoderLayer.3.feedForward.outputDense.bias": "encoder.layer.3.output.dense.bias",
90
- "encoderLayer.3.ffnLayerNorm.weight": "encoder.layer.3.output.LayerNorm.weight",
91
- "encoderLayer.3.ffnLayerNorm.bias": "encoder.layer.3.output.LayerNorm.bias",
92
- "encoderLayer.4.multiHeadAttention.q.weight": "encoder.layer.4.attention.self.query.weight",
93
- "encoderLayer.4.multiHeadAttention.q.bias": "encoder.layer.4.attention.self.query.bias",
94
- "encoderLayer.4.multiHeadAttention.k.weight": "encoder.layer.4.attention.self.key.weight",
95
- "encoderLayer.4.multiHeadAttention.k.bias": "encoder.layer.4.attention.self.key.bias",
96
- "encoderLayer.4.multiHeadAttention.v.weight": "encoder.layer.4.attention.self.value.weight",
97
- "encoderLayer.4.multiHeadAttention.v.bias": "encoder.layer.4.attention.self.value.bias",
98
- "encoderLayer.4.multiHeadAttention.o.weight": "encoder.layer.4.attention.output.dense.weight",
99
- "encoderLayer.4.multiHeadAttention.o.bias": "encoder.layer.4.attention.output.dense.bias",
100
- "encoderLayer.4.attnLayerNorm.weight": "encoder.layer.4.attention.output.LayerNorm.weight",
101
- "encoderLayer.4.attnLayerNorm.bias": "encoder.layer.4.attention.output.LayerNorm.bias",
102
- "encoderLayer.4.feedForward.intermediateDense.weight": "encoder.layer.4.intermediate.dense.weight",
103
- "encoderLayer.4.feedForward.intermediateDense.bias": "encoder.layer.4.intermediate.dense.bias",
104
- "encoderLayer.4.feedForward.outputDense.weight": "encoder.layer.4.output.dense.weight",
105
- "encoderLayer.4.feedForward.outputDense.bias": "encoder.layer.4.output.dense.bias",
106
- "encoderLayer.4.ffnLayerNorm.weight": "encoder.layer.4.output.LayerNorm.weight",
107
- "encoderLayer.4.ffnLayerNorm.bias": "encoder.layer.4.output.LayerNorm.bias",
108
- "encoderLayer.5.multiHeadAttention.q.weight": "encoder.layer.5.attention.self.query.weight",
109
- "encoderLayer.5.multiHeadAttention.q.bias": "encoder.layer.5.attention.self.query.bias",
110
- "encoderLayer.5.multiHeadAttention.k.weight": "encoder.layer.5.attention.self.key.weight",
111
- "encoderLayer.5.multiHeadAttention.k.bias": "encoder.layer.5.attention.self.key.bias",
112
- "encoderLayer.5.multiHeadAttention.v.weight": "encoder.layer.5.attention.self.value.weight",
113
- "encoderLayer.5.multiHeadAttention.v.bias": "encoder.layer.5.attention.self.value.bias",
114
- "encoderLayer.5.multiHeadAttention.o.weight": "encoder.layer.5.attention.output.dense.weight",
115
- "encoderLayer.5.multiHeadAttention.o.bias": "encoder.layer.5.attention.output.dense.bias",
116
- "encoderLayer.5.attnLayerNorm.weight": "encoder.layer.5.attention.output.LayerNorm.weight",
117
- "encoderLayer.5.attnLayerNorm.bias": "encoder.layer.5.attention.output.LayerNorm.bias",
118
- "encoderLayer.5.feedForward.intermediateDense.weight": "encoder.layer.5.intermediate.dense.weight",
119
- "encoderLayer.5.feedForward.intermediateDense.bias": "encoder.layer.5.intermediate.dense.bias",
120
- "encoderLayer.5.feedForward.outputDense.weight": "encoder.layer.5.output.dense.weight",
121
- "encoderLayer.5.feedForward.outputDense.bias": "encoder.layer.5.output.dense.bias",
122
- "encoderLayer.5.ffnLayerNorm.weight": "encoder.layer.5.output.LayerNorm.weight",
123
- "encoderLayer.5.ffnLayerNorm.bias": "encoder.layer.5.output.LayerNorm.bias",
124
- "encoderLayer.6.multiHeadAttention.q.weight": "encoder.layer.6.attention.self.query.weight",
125
- "encoderLayer.6.multiHeadAttention.q.bias": "encoder.layer.6.attention.self.query.bias",
126
- "encoderLayer.6.multiHeadAttention.k.weight": "encoder.layer.6.attention.self.key.weight",
127
- "encoderLayer.6.multiHeadAttention.k.bias": "encoder.layer.6.attention.self.key.bias",
128
- "encoderLayer.6.multiHeadAttention.v.weight": "encoder.layer.6.attention.self.value.weight",
129
- "encoderLayer.6.multiHeadAttention.v.bias": "encoder.layer.6.attention.self.value.bias",
130
- "encoderLayer.6.multiHeadAttention.o.weight": "encoder.layer.6.attention.output.dense.weight",
131
- "encoderLayer.6.multiHeadAttention.o.bias": "encoder.layer.6.attention.output.dense.bias",
132
- "encoderLayer.6.attnLayerNorm.weight": "encoder.layer.6.attention.output.LayerNorm.weight",
133
- "encoderLayer.6.attnLayerNorm.bias": "encoder.layer.6.attention.output.LayerNorm.bias",
134
- "encoderLayer.6.feedForward.intermediateDense.weight": "encoder.layer.6.intermediate.dense.weight",
135
- "encoderLayer.6.feedForward.intermediateDense.bias": "encoder.layer.6.intermediate.dense.bias",
136
- "encoderLayer.6.feedForward.outputDense.weight": "encoder.layer.6.output.dense.weight",
137
- "encoderLayer.6.feedForward.outputDense.bias": "encoder.layer.6.output.dense.bias",
138
- "encoderLayer.6.ffnLayerNorm.weight": "encoder.layer.6.output.LayerNorm.weight",
139
- "encoderLayer.6.ffnLayerNorm.bias": "encoder.layer.6.output.LayerNorm.bias",
140
- "encoderLayer.7.multiHeadAttention.q.weight": "encoder.layer.7.attention.self.query.weight",
141
- "encoderLayer.7.multiHeadAttention.q.bias": "encoder.layer.7.attention.self.query.bias",
142
- "encoderLayer.7.multiHeadAttention.k.weight": "encoder.layer.7.attention.self.key.weight",
143
- "encoderLayer.7.multiHeadAttention.k.bias": "encoder.layer.7.attention.self.key.bias",
144
- "encoderLayer.7.multiHeadAttention.v.weight": "encoder.layer.7.attention.self.value.weight",
145
- "encoderLayer.7.multiHeadAttention.v.bias": "encoder.layer.7.attention.self.value.bias",
146
- "encoderLayer.7.multiHeadAttention.o.weight": "encoder.layer.7.attention.output.dense.weight",
147
- "encoderLayer.7.multiHeadAttention.o.bias": "encoder.layer.7.attention.output.dense.bias",
148
- "encoderLayer.7.attnLayerNorm.weight": "encoder.layer.7.attention.output.LayerNorm.weight",
149
- "encoderLayer.7.attnLayerNorm.bias": "encoder.layer.7.attention.output.LayerNorm.bias",
150
- "encoderLayer.7.feedForward.intermediateDense.weight": "encoder.layer.7.intermediate.dense.weight",
151
- "encoderLayer.7.feedForward.intermediateDense.bias": "encoder.layer.7.intermediate.dense.bias",
152
- "encoderLayer.7.feedForward.outputDense.weight": "encoder.layer.7.output.dense.weight",
153
- "encoderLayer.7.feedForward.outputDense.bias": "encoder.layer.7.output.dense.bias",
154
- "encoderLayer.7.ffnLayerNorm.weight": "encoder.layer.7.output.LayerNorm.weight",
155
- "encoderLayer.7.ffnLayerNorm.bias": "encoder.layer.7.output.LayerNorm.bias",
156
- "encoderLayer.8.multiHeadAttention.q.weight": "encoder.layer.8.attention.self.query.weight",
157
- "encoderLayer.8.multiHeadAttention.q.bias": "encoder.layer.8.attention.self.query.bias",
158
- "encoderLayer.8.multiHeadAttention.k.weight": "encoder.layer.8.attention.self.key.weight",
159
- "encoderLayer.8.multiHeadAttention.k.bias": "encoder.layer.8.attention.self.key.bias",
160
- "encoderLayer.8.multiHeadAttention.v.weight": "encoder.layer.8.attention.self.value.weight",
161
- "encoderLayer.8.multiHeadAttention.v.bias": "encoder.layer.8.attention.self.value.bias",
162
- "encoderLayer.8.multiHeadAttention.o.weight": "encoder.layer.8.attention.output.dense.weight",
163
- "encoderLayer.8.multiHeadAttention.o.bias": "encoder.layer.8.attention.output.dense.bias",
164
- "encoderLayer.8.attnLayerNorm.weight": "encoder.layer.8.attention.output.LayerNorm.weight",
165
- "encoderLayer.8.attnLayerNorm.bias": "encoder.layer.8.attention.output.LayerNorm.bias",
166
- "encoderLayer.8.feedForward.intermediateDense.weight": "encoder.layer.8.intermediate.dense.weight",
167
- "encoderLayer.8.feedForward.intermediateDense.bias": "encoder.layer.8.intermediate.dense.bias",
168
- "encoderLayer.8.feedForward.outputDense.weight": "encoder.layer.8.output.dense.weight",
169
- "encoderLayer.8.feedForward.outputDense.bias": "encoder.layer.8.output.dense.bias",
170
- "encoderLayer.8.ffnLayerNorm.weight": "encoder.layer.8.output.LayerNorm.weight",
171
- "encoderLayer.8.ffnLayerNorm.bias": "encoder.layer.8.output.LayerNorm.bias",
172
- "encoderLayer.9.multiHeadAttention.q.weight": "encoder.layer.9.attention.self.query.weight",
173
- "encoderLayer.9.multiHeadAttention.q.bias": "encoder.layer.9.attention.self.query.bias",
174
- "encoderLayer.9.multiHeadAttention.k.weight": "encoder.layer.9.attention.self.key.weight",
175
- "encoderLayer.9.multiHeadAttention.k.bias": "encoder.layer.9.attention.self.key.bias",
176
- "encoderLayer.9.multiHeadAttention.v.weight": "encoder.layer.9.attention.self.value.weight",
177
- "encoderLayer.9.multiHeadAttention.v.bias": "encoder.layer.9.attention.self.value.bias",
178
- "encoderLayer.9.multiHeadAttention.o.weight": "encoder.layer.9.attention.output.dense.weight",
179
- "encoderLayer.9.multiHeadAttention.o.bias": "encoder.layer.9.attention.output.dense.bias",
180
- "encoderLayer.9.attnLayerNorm.weight": "encoder.layer.9.attention.output.LayerNorm.weight",
181
- "encoderLayer.9.attnLayerNorm.bias": "encoder.layer.9.attention.output.LayerNorm.bias",
182
- "encoderLayer.9.feedForward.intermediateDense.weight": "encoder.layer.9.intermediate.dense.weight",
183
- "encoderLayer.9.feedForward.intermediateDense.bias": "encoder.layer.9.intermediate.dense.bias",
184
- "encoderLayer.9.feedForward.outputDense.weight": "encoder.layer.9.output.dense.weight",
185
- "encoderLayer.9.feedForward.outputDense.bias": "encoder.layer.9.output.dense.bias",
186
- "encoderLayer.9.ffnLayerNorm.weight": "encoder.layer.9.output.LayerNorm.weight",
187
- "encoderLayer.9.ffnLayerNorm.bias": "encoder.layer.9.output.LayerNorm.bias",
188
- "encoderLayer.10.multiHeadAttention.q.weight": "encoder.layer.10.attention.self.query.weight",
189
- "encoderLayer.10.multiHeadAttention.q.bias": "encoder.layer.10.attention.self.query.bias",
190
- "encoderLayer.10.multiHeadAttention.k.weight": "encoder.layer.10.attention.self.key.weight",
191
- "encoderLayer.10.multiHeadAttention.k.bias": "encoder.layer.10.attention.self.key.bias",
192
- "encoderLayer.10.multiHeadAttention.v.weight": "encoder.layer.10.attention.self.value.weight",
193
- "encoderLayer.10.multiHeadAttention.v.bias": "encoder.layer.10.attention.self.value.bias",
194
- "encoderLayer.10.multiHeadAttention.o.weight": "encoder.layer.10.attention.output.dense.weight",
195
- "encoderLayer.10.multiHeadAttention.o.bias": "encoder.layer.10.attention.output.dense.bias",
196
- "encoderLayer.10.attnLayerNorm.weight": "encoder.layer.10.attention.output.LayerNorm.weight",
197
- "encoderLayer.10.attnLayerNorm.bias": "encoder.layer.10.attention.output.LayerNorm.bias",
198
- "encoderLayer.10.feedForward.intermediateDense.weight": "encoder.layer.10.intermediate.dense.weight",
199
- "encoderLayer.10.feedForward.intermediateDense.bias": "encoder.layer.10.intermediate.dense.bias",
200
- "encoderLayer.10.feedForward.outputDense.weight": "encoder.layer.10.output.dense.weight",
201
- "encoderLayer.10.feedForward.outputDense.bias": "encoder.layer.10.output.dense.bias",
202
- "encoderLayer.10.ffnLayerNorm.weight": "encoder.layer.10.output.LayerNorm.weight",
203
- "encoderLayer.10.ffnLayerNorm.bias": "encoder.layer.10.output.LayerNorm.bias",
204
- "encoderLayer.11.multiHeadAttention.q.weight": "encoder.layer.11.attention.self.query.weight",
205
- "encoderLayer.11.multiHeadAttention.q.bias": "encoder.layer.11.attention.self.query.bias",
206
- "encoderLayer.11.multiHeadAttention.k.weight": "encoder.layer.11.attention.self.key.weight",
207
- "encoderLayer.11.multiHeadAttention.k.bias": "encoder.layer.11.attention.self.key.bias",
208
- "encoderLayer.11.multiHeadAttention.v.weight": "encoder.layer.11.attention.self.value.weight",
209
- "encoderLayer.11.multiHeadAttention.v.bias": "encoder.layer.11.attention.self.value.bias",
210
- "encoderLayer.11.multiHeadAttention.o.weight": "encoder.layer.11.attention.output.dense.weight",
211
- "encoderLayer.11.multiHeadAttention.o.bias": "encoder.layer.11.attention.output.dense.bias",
212
- "encoderLayer.11.attnLayerNorm.weight": "encoder.layer.11.attention.output.LayerNorm.weight",
213
- "encoderLayer.11.attnLayerNorm.bias": "encoder.layer.11.attention.output.LayerNorm.bias",
214
- "encoderLayer.11.feedForward.intermediateDense.weight": "encoder.layer.11.intermediate.dense.weight",
215
- "encoderLayer.11.feedForward.intermediateDense.bias": "encoder.layer.11.intermediate.dense.bias",
216
- "encoderLayer.11.feedForward.outputDense.weight": "encoder.layer.11.output.dense.weight",
217
- "encoderLayer.11.feedForward.outputDense.bias": "encoder.layer.11.output.dense.bias",
218
- "encoderLayer.11.ffnLayerNorm.weight": "encoder.layer.11.output.LayerNorm.weight",
219
- "encoderLayer.11.ffnLayerNorm.bias": "encoder.layer.11.output.LayerNorm.bias"
220
- }
221
- }
 
 
 
1
  {
2
+ "attention_probs_dropout_prob": 0.1,
3
+ "hidden_act": "gelu",
4
+ "hidden_dropout_prob": 0.1,
5
+ "hidden_size": 384,
6
+ "initializer_range": 0.02,
7
+ "intermediate_size": 1536,
8
+ "layer_norm_eps": 1e-12,
9
+ "max_position_embeddings": 512,
10
+ "model": "bert",
11
+ "num_attention_heads": 12,
12
+ "num_hidden_layers": 12,
13
+ "pad_token_id": 0,
14
+ "torch_dtype": "float32",
15
+ "type_vocab_size": 2,
16
+ "vocab_size": 30522,
17
+ "with_pool": true,
18
+ "pooling": {
19
+ "pool_strategy": "cls"
20
+ },
21
+ "norm_mode": "torch_buildin",
22
+ "mapping": {
23
+ "embeddings.word_embeddings.weight": "embeddings.word_embeddings.weight",
24
+ "embeddings.position_embeddings.weight": "embeddings.position_embeddings.weight",
25
+ "embeddings.segment_embeddings.weight": "embeddings.token_type_embeddings.weight",
26
+ "embeddings.layerNorm.weight": "embeddings.LayerNorm.weight",
27
+ "embeddings.layerNorm.bias": "embeddings.LayerNorm.bias",
28
+ "pooler.weight": "pooler.dense.weight",
29
+ "pooler.bias": "pooler.dense.bias",
30
+ "encoderLayer.0.multiHeadAttention.q.weight": "encoder.layer.0.attention.self.query.weight",
31
+ "encoderLayer.0.multiHeadAttention.q.bias": "encoder.layer.0.attention.self.query.bias",
32
+ "encoderLayer.0.multiHeadAttention.k.weight": "encoder.layer.0.attention.self.key.weight",
33
+ "encoderLayer.0.multiHeadAttention.k.bias": "encoder.layer.0.attention.self.key.bias",
34
+ "encoderLayer.0.multiHeadAttention.v.weight": "encoder.layer.0.attention.self.value.weight",
35
+ "encoderLayer.0.multiHeadAttention.v.bias": "encoder.layer.0.attention.self.value.bias",
36
+ "encoderLayer.0.multiHeadAttention.o.weight": "encoder.layer.0.attention.output.dense.weight",
37
+ "encoderLayer.0.multiHeadAttention.o.bias": "encoder.layer.0.attention.output.dense.bias",
38
+ "encoderLayer.0.attnLayerNorm.weight": "encoder.layer.0.attention.output.LayerNorm.weight",
39
+ "encoderLayer.0.attnLayerNorm.bias": "encoder.layer.0.attention.output.LayerNorm.bias",
40
+ "encoderLayer.0.feedForward.intermediateDense.weight": "encoder.layer.0.intermediate.dense.weight",
41
+ "encoderLayer.0.feedForward.intermediateDense.bias": "encoder.layer.0.intermediate.dense.bias",
42
+ "encoderLayer.0.feedForward.outputDense.weight": "encoder.layer.0.output.dense.weight",
43
+ "encoderLayer.0.feedForward.outputDense.bias": "encoder.layer.0.output.dense.bias",
44
+ "encoderLayer.0.ffnLayerNorm.weight": "encoder.layer.0.output.LayerNorm.weight",
45
+ "encoderLayer.0.ffnLayerNorm.bias": "encoder.layer.0.output.LayerNorm.bias",
46
+ "encoderLayer.1.multiHeadAttention.q.weight": "encoder.layer.1.attention.self.query.weight",
47
+ "encoderLayer.1.multiHeadAttention.q.bias": "encoder.layer.1.attention.self.query.bias",
48
+ "encoderLayer.1.multiHeadAttention.k.weight": "encoder.layer.1.attention.self.key.weight",
49
+ "encoderLayer.1.multiHeadAttention.k.bias": "encoder.layer.1.attention.self.key.bias",
50
+ "encoderLayer.1.multiHeadAttention.v.weight": "encoder.layer.1.attention.self.value.weight",
51
+ "encoderLayer.1.multiHeadAttention.v.bias": "encoder.layer.1.attention.self.value.bias",
52
+ "encoderLayer.1.multiHeadAttention.o.weight": "encoder.layer.1.attention.output.dense.weight",
53
+ "encoderLayer.1.multiHeadAttention.o.bias": "encoder.layer.1.attention.output.dense.bias",
54
+ "encoderLayer.1.attnLayerNorm.weight": "encoder.layer.1.attention.output.LayerNorm.weight",
55
+ "encoderLayer.1.attnLayerNorm.bias": "encoder.layer.1.attention.output.LayerNorm.bias",
56
+ "encoderLayer.1.feedForward.intermediateDense.weight": "encoder.layer.1.intermediate.dense.weight",
57
+ "encoderLayer.1.feedForward.intermediateDense.bias": "encoder.layer.1.intermediate.dense.bias",
58
+ "encoderLayer.1.feedForward.outputDense.weight": "encoder.layer.1.output.dense.weight",
59
+ "encoderLayer.1.feedForward.outputDense.bias": "encoder.layer.1.output.dense.bias",
60
+ "encoderLayer.1.ffnLayerNorm.weight": "encoder.layer.1.output.LayerNorm.weight",
61
+ "encoderLayer.1.ffnLayerNorm.bias": "encoder.layer.1.output.LayerNorm.bias",
62
+ "encoderLayer.2.multiHeadAttention.q.weight": "encoder.layer.2.attention.self.query.weight",
63
+ "encoderLayer.2.multiHeadAttention.q.bias": "encoder.layer.2.attention.self.query.bias",
64
+ "encoderLayer.2.multiHeadAttention.k.weight": "encoder.layer.2.attention.self.key.weight",
65
+ "encoderLayer.2.multiHeadAttention.k.bias": "encoder.layer.2.attention.self.key.bias",
66
+ "encoderLayer.2.multiHeadAttention.v.weight": "encoder.layer.2.attention.self.value.weight",
67
+ "encoderLayer.2.multiHeadAttention.v.bias": "encoder.layer.2.attention.self.value.bias",
68
+ "encoderLayer.2.multiHeadAttention.o.weight": "encoder.layer.2.attention.output.dense.weight",
69
+ "encoderLayer.2.multiHeadAttention.o.bias": "encoder.layer.2.attention.output.dense.bias",
70
+ "encoderLayer.2.attnLayerNorm.weight": "encoder.layer.2.attention.output.LayerNorm.weight",
71
+ "encoderLayer.2.attnLayerNorm.bias": "encoder.layer.2.attention.output.LayerNorm.bias",
72
+ "encoderLayer.2.feedForward.intermediateDense.weight": "encoder.layer.2.intermediate.dense.weight",
73
+ "encoderLayer.2.feedForward.intermediateDense.bias": "encoder.layer.2.intermediate.dense.bias",
74
+ "encoderLayer.2.feedForward.outputDense.weight": "encoder.layer.2.output.dense.weight",
75
+ "encoderLayer.2.feedForward.outputDense.bias": "encoder.layer.2.output.dense.bias",
76
+ "encoderLayer.2.ffnLayerNorm.weight": "encoder.layer.2.output.LayerNorm.weight",
77
+ "encoderLayer.2.ffnLayerNorm.bias": "encoder.layer.2.output.LayerNorm.bias",
78
+ "encoderLayer.3.multiHeadAttention.q.weight": "encoder.layer.3.attention.self.query.weight",
79
+ "encoderLayer.3.multiHeadAttention.q.bias": "encoder.layer.3.attention.self.query.bias",
80
+ "encoderLayer.3.multiHeadAttention.k.weight": "encoder.layer.3.attention.self.key.weight",
81
+ "encoderLayer.3.multiHeadAttention.k.bias": "encoder.layer.3.attention.self.key.bias",
82
+ "encoderLayer.3.multiHeadAttention.v.weight": "encoder.layer.3.attention.self.value.weight",
83
+ "encoderLayer.3.multiHeadAttention.v.bias": "encoder.layer.3.attention.self.value.bias",
84
+ "encoderLayer.3.multiHeadAttention.o.weight": "encoder.layer.3.attention.output.dense.weight",
85
+ "encoderLayer.3.multiHeadAttention.o.bias": "encoder.layer.3.attention.output.dense.bias",
86
+ "encoderLayer.3.attnLayerNorm.weight": "encoder.layer.3.attention.output.LayerNorm.weight",
87
+ "encoderLayer.3.attnLayerNorm.bias": "encoder.layer.3.attention.output.LayerNorm.bias",
88
+ "encoderLayer.3.feedForward.intermediateDense.weight": "encoder.layer.3.intermediate.dense.weight",
89
+ "encoderLayer.3.feedForward.intermediateDense.bias": "encoder.layer.3.intermediate.dense.bias",
90
+ "encoderLayer.3.feedForward.outputDense.weight": "encoder.layer.3.output.dense.weight",
91
+ "encoderLayer.3.feedForward.outputDense.bias": "encoder.layer.3.output.dense.bias",
92
+ "encoderLayer.3.ffnLayerNorm.weight": "encoder.layer.3.output.LayerNorm.weight",
93
+ "encoderLayer.3.ffnLayerNorm.bias": "encoder.layer.3.output.LayerNorm.bias",
94
+ "encoderLayer.4.multiHeadAttention.q.weight": "encoder.layer.4.attention.self.query.weight",
95
+ "encoderLayer.4.multiHeadAttention.q.bias": "encoder.layer.4.attention.self.query.bias",
96
+ "encoderLayer.4.multiHeadAttention.k.weight": "encoder.layer.4.attention.self.key.weight",
97
+ "encoderLayer.4.multiHeadAttention.k.bias": "encoder.layer.4.attention.self.key.bias",
98
+ "encoderLayer.4.multiHeadAttention.v.weight": "encoder.layer.4.attention.self.value.weight",
99
+ "encoderLayer.4.multiHeadAttention.v.bias": "encoder.layer.4.attention.self.value.bias",
100
+ "encoderLayer.4.multiHeadAttention.o.weight": "encoder.layer.4.attention.output.dense.weight",
101
+ "encoderLayer.4.multiHeadAttention.o.bias": "encoder.layer.4.attention.output.dense.bias",
102
+ "encoderLayer.4.attnLayerNorm.weight": "encoder.layer.4.attention.output.LayerNorm.weight",
103
+ "encoderLayer.4.attnLayerNorm.bias": "encoder.layer.4.attention.output.LayerNorm.bias",
104
+ "encoderLayer.4.feedForward.intermediateDense.weight": "encoder.layer.4.intermediate.dense.weight",
105
+ "encoderLayer.4.feedForward.intermediateDense.bias": "encoder.layer.4.intermediate.dense.bias",
106
+ "encoderLayer.4.feedForward.outputDense.weight": "encoder.layer.4.output.dense.weight",
107
+ "encoderLayer.4.feedForward.outputDense.bias": "encoder.layer.4.output.dense.bias",
108
+ "encoderLayer.4.ffnLayerNorm.weight": "encoder.layer.4.output.LayerNorm.weight",
109
+ "encoderLayer.4.ffnLayerNorm.bias": "encoder.layer.4.output.LayerNorm.bias",
110
+ "encoderLayer.5.multiHeadAttention.q.weight": "encoder.layer.5.attention.self.query.weight",
111
+ "encoderLayer.5.multiHeadAttention.q.bias": "encoder.layer.5.attention.self.query.bias",
112
+ "encoderLayer.5.multiHeadAttention.k.weight": "encoder.layer.5.attention.self.key.weight",
113
+ "encoderLayer.5.multiHeadAttention.k.bias": "encoder.layer.5.attention.self.key.bias",
114
+ "encoderLayer.5.multiHeadAttention.v.weight": "encoder.layer.5.attention.self.value.weight",
115
+ "encoderLayer.5.multiHeadAttention.v.bias": "encoder.layer.5.attention.self.value.bias",
116
+ "encoderLayer.5.multiHeadAttention.o.weight": "encoder.layer.5.attention.output.dense.weight",
117
+ "encoderLayer.5.multiHeadAttention.o.bias": "encoder.layer.5.attention.output.dense.bias",
118
+ "encoderLayer.5.attnLayerNorm.weight": "encoder.layer.5.attention.output.LayerNorm.weight",
119
+ "encoderLayer.5.attnLayerNorm.bias": "encoder.layer.5.attention.output.LayerNorm.bias",
120
+ "encoderLayer.5.feedForward.intermediateDense.weight": "encoder.layer.5.intermediate.dense.weight",
121
+ "encoderLayer.5.feedForward.intermediateDense.bias": "encoder.layer.5.intermediate.dense.bias",
122
+ "encoderLayer.5.feedForward.outputDense.weight": "encoder.layer.5.output.dense.weight",
123
+ "encoderLayer.5.feedForward.outputDense.bias": "encoder.layer.5.output.dense.bias",
124
+ "encoderLayer.5.ffnLayerNorm.weight": "encoder.layer.5.output.LayerNorm.weight",
125
+ "encoderLayer.5.ffnLayerNorm.bias": "encoder.layer.5.output.LayerNorm.bias",
126
+ "encoderLayer.6.multiHeadAttention.q.weight": "encoder.layer.6.attention.self.query.weight",
127
+ "encoderLayer.6.multiHeadAttention.q.bias": "encoder.layer.6.attention.self.query.bias",
128
+ "encoderLayer.6.multiHeadAttention.k.weight": "encoder.layer.6.attention.self.key.weight",
129
+ "encoderLayer.6.multiHeadAttention.k.bias": "encoder.layer.6.attention.self.key.bias",
130
+ "encoderLayer.6.multiHeadAttention.v.weight": "encoder.layer.6.attention.self.value.weight",
131
+ "encoderLayer.6.multiHeadAttention.v.bias": "encoder.layer.6.attention.self.value.bias",
132
+ "encoderLayer.6.multiHeadAttention.o.weight": "encoder.layer.6.attention.output.dense.weight",
133
+ "encoderLayer.6.multiHeadAttention.o.bias": "encoder.layer.6.attention.output.dense.bias",
134
+ "encoderLayer.6.attnLayerNorm.weight": "encoder.layer.6.attention.output.LayerNorm.weight",
135
+ "encoderLayer.6.attnLayerNorm.bias": "encoder.layer.6.attention.output.LayerNorm.bias",
136
+ "encoderLayer.6.feedForward.intermediateDense.weight": "encoder.layer.6.intermediate.dense.weight",
137
+ "encoderLayer.6.feedForward.intermediateDense.bias": "encoder.layer.6.intermediate.dense.bias",
138
+ "encoderLayer.6.feedForward.outputDense.weight": "encoder.layer.6.output.dense.weight",
139
+ "encoderLayer.6.feedForward.outputDense.bias": "encoder.layer.6.output.dense.bias",
140
+ "encoderLayer.6.ffnLayerNorm.weight": "encoder.layer.6.output.LayerNorm.weight",
141
+ "encoderLayer.6.ffnLayerNorm.bias": "encoder.layer.6.output.LayerNorm.bias",
142
+ "encoderLayer.7.multiHeadAttention.q.weight": "encoder.layer.7.attention.self.query.weight",
143
+ "encoderLayer.7.multiHeadAttention.q.bias": "encoder.layer.7.attention.self.query.bias",
144
+ "encoderLayer.7.multiHeadAttention.k.weight": "encoder.layer.7.attention.self.key.weight",
145
+ "encoderLayer.7.multiHeadAttention.k.bias": "encoder.layer.7.attention.self.key.bias",
146
+ "encoderLayer.7.multiHeadAttention.v.weight": "encoder.layer.7.attention.self.value.weight",
147
+ "encoderLayer.7.multiHeadAttention.v.bias": "encoder.layer.7.attention.self.value.bias",
148
+ "encoderLayer.7.multiHeadAttention.o.weight": "encoder.layer.7.attention.output.dense.weight",
149
+ "encoderLayer.7.multiHeadAttention.o.bias": "encoder.layer.7.attention.output.dense.bias",
150
+ "encoderLayer.7.attnLayerNorm.weight": "encoder.layer.7.attention.output.LayerNorm.weight",
151
+ "encoderLayer.7.attnLayerNorm.bias": "encoder.layer.7.attention.output.LayerNorm.bias",
152
+ "encoderLayer.7.feedForward.intermediateDense.weight": "encoder.layer.7.intermediate.dense.weight",
153
+ "encoderLayer.7.feedForward.intermediateDense.bias": "encoder.layer.7.intermediate.dense.bias",
154
+ "encoderLayer.7.feedForward.outputDense.weight": "encoder.layer.7.output.dense.weight",
155
+ "encoderLayer.7.feedForward.outputDense.bias": "encoder.layer.7.output.dense.bias",
156
+ "encoderLayer.7.ffnLayerNorm.weight": "encoder.layer.7.output.LayerNorm.weight",
157
+ "encoderLayer.7.ffnLayerNorm.bias": "encoder.layer.7.output.LayerNorm.bias",
158
+ "encoderLayer.8.multiHeadAttention.q.weight": "encoder.layer.8.attention.self.query.weight",
159
+ "encoderLayer.8.multiHeadAttention.q.bias": "encoder.layer.8.attention.self.query.bias",
160
+ "encoderLayer.8.multiHeadAttention.k.weight": "encoder.layer.8.attention.self.key.weight",
161
+ "encoderLayer.8.multiHeadAttention.k.bias": "encoder.layer.8.attention.self.key.bias",
162
+ "encoderLayer.8.multiHeadAttention.v.weight": "encoder.layer.8.attention.self.value.weight",
163
+ "encoderLayer.8.multiHeadAttention.v.bias": "encoder.layer.8.attention.self.value.bias",
164
+ "encoderLayer.8.multiHeadAttention.o.weight": "encoder.layer.8.attention.output.dense.weight",
165
+ "encoderLayer.8.multiHeadAttention.o.bias": "encoder.layer.8.attention.output.dense.bias",
166
+ "encoderLayer.8.attnLayerNorm.weight": "encoder.layer.8.attention.output.LayerNorm.weight",
167
+ "encoderLayer.8.attnLayerNorm.bias": "encoder.layer.8.attention.output.LayerNorm.bias",
168
+ "encoderLayer.8.feedForward.intermediateDense.weight": "encoder.layer.8.intermediate.dense.weight",
169
+ "encoderLayer.8.feedForward.intermediateDense.bias": "encoder.layer.8.intermediate.dense.bias",
170
+ "encoderLayer.8.feedForward.outputDense.weight": "encoder.layer.8.output.dense.weight",
171
+ "encoderLayer.8.feedForward.outputDense.bias": "encoder.layer.8.output.dense.bias",
172
+ "encoderLayer.8.ffnLayerNorm.weight": "encoder.layer.8.output.LayerNorm.weight",
173
+ "encoderLayer.8.ffnLayerNorm.bias": "encoder.layer.8.output.LayerNorm.bias",
174
+ "encoderLayer.9.multiHeadAttention.q.weight": "encoder.layer.9.attention.self.query.weight",
175
+ "encoderLayer.9.multiHeadAttention.q.bias": "encoder.layer.9.attention.self.query.bias",
176
+ "encoderLayer.9.multiHeadAttention.k.weight": "encoder.layer.9.attention.self.key.weight",
177
+ "encoderLayer.9.multiHeadAttention.k.bias": "encoder.layer.9.attention.self.key.bias",
178
+ "encoderLayer.9.multiHeadAttention.v.weight": "encoder.layer.9.attention.self.value.weight",
179
+ "encoderLayer.9.multiHeadAttention.v.bias": "encoder.layer.9.attention.self.value.bias",
180
+ "encoderLayer.9.multiHeadAttention.o.weight": "encoder.layer.9.attention.output.dense.weight",
181
+ "encoderLayer.9.multiHeadAttention.o.bias": "encoder.layer.9.attention.output.dense.bias",
182
+ "encoderLayer.9.attnLayerNorm.weight": "encoder.layer.9.attention.output.LayerNorm.weight",
183
+ "encoderLayer.9.attnLayerNorm.bias": "encoder.layer.9.attention.output.LayerNorm.bias",
184
+ "encoderLayer.9.feedForward.intermediateDense.weight": "encoder.layer.9.intermediate.dense.weight",
185
+ "encoderLayer.9.feedForward.intermediateDense.bias": "encoder.layer.9.intermediate.dense.bias",
186
+ "encoderLayer.9.feedForward.outputDense.weight": "encoder.layer.9.output.dense.weight",
187
+ "encoderLayer.9.feedForward.outputDense.bias": "encoder.layer.9.output.dense.bias",
188
+ "encoderLayer.9.ffnLayerNorm.weight": "encoder.layer.9.output.LayerNorm.weight",
189
+ "encoderLayer.9.ffnLayerNorm.bias": "encoder.layer.9.output.LayerNorm.bias",
190
+ "encoderLayer.10.multiHeadAttention.q.weight": "encoder.layer.10.attention.self.query.weight",
191
+ "encoderLayer.10.multiHeadAttention.q.bias": "encoder.layer.10.attention.self.query.bias",
192
+ "encoderLayer.10.multiHeadAttention.k.weight": "encoder.layer.10.attention.self.key.weight",
193
+ "encoderLayer.10.multiHeadAttention.k.bias": "encoder.layer.10.attention.self.key.bias",
194
+ "encoderLayer.10.multiHeadAttention.v.weight": "encoder.layer.10.attention.self.value.weight",
195
+ "encoderLayer.10.multiHeadAttention.v.bias": "encoder.layer.10.attention.self.value.bias",
196
+ "encoderLayer.10.multiHeadAttention.o.weight": "encoder.layer.10.attention.output.dense.weight",
197
+ "encoderLayer.10.multiHeadAttention.o.bias": "encoder.layer.10.attention.output.dense.bias",
198
+ "encoderLayer.10.attnLayerNorm.weight": "encoder.layer.10.attention.output.LayerNorm.weight",
199
+ "encoderLayer.10.attnLayerNorm.bias": "encoder.layer.10.attention.output.LayerNorm.bias",
200
+ "encoderLayer.10.feedForward.intermediateDense.weight": "encoder.layer.10.intermediate.dense.weight",
201
+ "encoderLayer.10.feedForward.intermediateDense.bias": "encoder.layer.10.intermediate.dense.bias",
202
+ "encoderLayer.10.feedForward.outputDense.weight": "encoder.layer.10.output.dense.weight",
203
+ "encoderLayer.10.feedForward.outputDense.bias": "encoder.layer.10.output.dense.bias",
204
+ "encoderLayer.10.ffnLayerNorm.weight": "encoder.layer.10.output.LayerNorm.weight",
205
+ "encoderLayer.10.ffnLayerNorm.bias": "encoder.layer.10.output.LayerNorm.bias",
206
+ "encoderLayer.11.multiHeadAttention.q.weight": "encoder.layer.11.attention.self.query.weight",
207
+ "encoderLayer.11.multiHeadAttention.q.bias": "encoder.layer.11.attention.self.query.bias",
208
+ "encoderLayer.11.multiHeadAttention.k.weight": "encoder.layer.11.attention.self.key.weight",
209
+ "encoderLayer.11.multiHeadAttention.k.bias": "encoder.layer.11.attention.self.key.bias",
210
+ "encoderLayer.11.multiHeadAttention.v.weight": "encoder.layer.11.attention.self.value.weight",
211
+ "encoderLayer.11.multiHeadAttention.v.bias": "encoder.layer.11.attention.self.value.bias",
212
+ "encoderLayer.11.multiHeadAttention.o.weight": "encoder.layer.11.attention.output.dense.weight",
213
+ "encoderLayer.11.multiHeadAttention.o.bias": "encoder.layer.11.attention.output.dense.bias",
214
+ "encoderLayer.11.attnLayerNorm.weight": "encoder.layer.11.attention.output.LayerNorm.weight",
215
+ "encoderLayer.11.attnLayerNorm.bias": "encoder.layer.11.attention.output.LayerNorm.bias",
216
+ "encoderLayer.11.feedForward.intermediateDense.weight": "encoder.layer.11.intermediate.dense.weight",
217
+ "encoderLayer.11.feedForward.intermediateDense.bias": "encoder.layer.11.intermediate.dense.bias",
218
+ "encoderLayer.11.feedForward.outputDense.weight": "encoder.layer.11.output.dense.weight",
219
+ "encoderLayer.11.feedForward.outputDense.bias": "encoder.layer.11.output.dense.bias",
220
+ "encoderLayer.11.ffnLayerNorm.weight": "encoder.layer.11.output.LayerNorm.weight",
221
+ "encoderLayer.11.ffnLayerNorm.bias": "encoder.layer.11.output.LayerNorm.bias"
222
+ }
223
+ }
BAAI/bge-small-zh-v1.5/bert4torch_config.json CHANGED
@@ -1,221 +1,223 @@
1
  {
2
- "attention_probs_dropout_prob": 0.1,
3
- "hidden_act": "gelu",
4
- "hidden_dropout_prob": 0.1,
5
- "hidden_size": 512,
6
- "initializer_range": 0.02,
7
- "intermediate_size": 2048,
8
- "layer_norm_eps": 1e-12,
9
- "max_position_embeddings": 512,
10
- "model": "bert",
11
- "num_attention_heads": 8,
12
- "num_hidden_layers": 4,
13
- "pad_token_id": 0,
14
- "torch_dtype": "float32",
15
- "type_vocab_size": 2,
16
- "vocab_size": 21128,
17
- "with_pool": true,
18
- "pooling": {"pool_strategy": "cls"},
19
- "norm_mode": "torch_buildin",
20
- "mapping": {
21
- "embeddings.word_embeddings.weight": "embeddings.word_embeddings.weight",
22
- "embeddings.position_embeddings.weight": "embeddings.position_embeddings.weight",
23
- "embeddings.segment_embeddings.weight": "embeddings.token_type_embeddings.weight",
24
- "embeddings.layerNorm.weight": "embeddings.LayerNorm.weight",
25
- "embeddings.layerNorm.bias": "embeddings.LayerNorm.bias",
26
- "pooler.weight": "pooler.dense.weight",
27
- "pooler.bias": "pooler.dense.bias",
28
- "encoderLayer.0.multiHeadAttention.q.weight": "encoder.layer.0.attention.self.query.weight",
29
- "encoderLayer.0.multiHeadAttention.q.bias": "encoder.layer.0.attention.self.query.bias",
30
- "encoderLayer.0.multiHeadAttention.k.weight": "encoder.layer.0.attention.self.key.weight",
31
- "encoderLayer.0.multiHeadAttention.k.bias": "encoder.layer.0.attention.self.key.bias",
32
- "encoderLayer.0.multiHeadAttention.v.weight": "encoder.layer.0.attention.self.value.weight",
33
- "encoderLayer.0.multiHeadAttention.v.bias": "encoder.layer.0.attention.self.value.bias",
34
- "encoderLayer.0.multiHeadAttention.o.weight": "encoder.layer.0.attention.output.dense.weight",
35
- "encoderLayer.0.multiHeadAttention.o.bias": "encoder.layer.0.attention.output.dense.bias",
36
- "encoderLayer.0.attnLayerNorm.weight": "encoder.layer.0.attention.output.LayerNorm.weight",
37
- "encoderLayer.0.attnLayerNorm.bias": "encoder.layer.0.attention.output.LayerNorm.bias",
38
- "encoderLayer.0.feedForward.intermediateDense.weight": "encoder.layer.0.intermediate.dense.weight",
39
- "encoderLayer.0.feedForward.intermediateDense.bias": "encoder.layer.0.intermediate.dense.bias",
40
- "encoderLayer.0.feedForward.outputDense.weight": "encoder.layer.0.output.dense.weight",
41
- "encoderLayer.0.feedForward.outputDense.bias": "encoder.layer.0.output.dense.bias",
42
- "encoderLayer.0.ffnLayerNorm.weight": "encoder.layer.0.output.LayerNorm.weight",
43
- "encoderLayer.0.ffnLayerNorm.bias": "encoder.layer.0.output.LayerNorm.bias",
44
- "encoderLayer.1.multiHeadAttention.q.weight": "encoder.layer.1.attention.self.query.weight",
45
- "encoderLayer.1.multiHeadAttention.q.bias": "encoder.layer.1.attention.self.query.bias",
46
- "encoderLayer.1.multiHeadAttention.k.weight": "encoder.layer.1.attention.self.key.weight",
47
- "encoderLayer.1.multiHeadAttention.k.bias": "encoder.layer.1.attention.self.key.bias",
48
- "encoderLayer.1.multiHeadAttention.v.weight": "encoder.layer.1.attention.self.value.weight",
49
- "encoderLayer.1.multiHeadAttention.v.bias": "encoder.layer.1.attention.self.value.bias",
50
- "encoderLayer.1.multiHeadAttention.o.weight": "encoder.layer.1.attention.output.dense.weight",
51
- "encoderLayer.1.multiHeadAttention.o.bias": "encoder.layer.1.attention.output.dense.bias",
52
- "encoderLayer.1.attnLayerNorm.weight": "encoder.layer.1.attention.output.LayerNorm.weight",
53
- "encoderLayer.1.attnLayerNorm.bias": "encoder.layer.1.attention.output.LayerNorm.bias",
54
- "encoderLayer.1.feedForward.intermediateDense.weight": "encoder.layer.1.intermediate.dense.weight",
55
- "encoderLayer.1.feedForward.intermediateDense.bias": "encoder.layer.1.intermediate.dense.bias",
56
- "encoderLayer.1.feedForward.outputDense.weight": "encoder.layer.1.output.dense.weight",
57
- "encoderLayer.1.feedForward.outputDense.bias": "encoder.layer.1.output.dense.bias",
58
- "encoderLayer.1.ffnLayerNorm.weight": "encoder.layer.1.output.LayerNorm.weight",
59
- "encoderLayer.1.ffnLayerNorm.bias": "encoder.layer.1.output.LayerNorm.bias",
60
- "encoderLayer.2.multiHeadAttention.q.weight": "encoder.layer.2.attention.self.query.weight",
61
- "encoderLayer.2.multiHeadAttention.q.bias": "encoder.layer.2.attention.self.query.bias",
62
- "encoderLayer.2.multiHeadAttention.k.weight": "encoder.layer.2.attention.self.key.weight",
63
- "encoderLayer.2.multiHeadAttention.k.bias": "encoder.layer.2.attention.self.key.bias",
64
- "encoderLayer.2.multiHeadAttention.v.weight": "encoder.layer.2.attention.self.value.weight",
65
- "encoderLayer.2.multiHeadAttention.v.bias": "encoder.layer.2.attention.self.value.bias",
66
- "encoderLayer.2.multiHeadAttention.o.weight": "encoder.layer.2.attention.output.dense.weight",
67
- "encoderLayer.2.multiHeadAttention.o.bias": "encoder.layer.2.attention.output.dense.bias",
68
- "encoderLayer.2.attnLayerNorm.weight": "encoder.layer.2.attention.output.LayerNorm.weight",
69
- "encoderLayer.2.attnLayerNorm.bias": "encoder.layer.2.attention.output.LayerNorm.bias",
70
- "encoderLayer.2.feedForward.intermediateDense.weight": "encoder.layer.2.intermediate.dense.weight",
71
- "encoderLayer.2.feedForward.intermediateDense.bias": "encoder.layer.2.intermediate.dense.bias",
72
- "encoderLayer.2.feedForward.outputDense.weight": "encoder.layer.2.output.dense.weight",
73
- "encoderLayer.2.feedForward.outputDense.bias": "encoder.layer.2.output.dense.bias",
74
- "encoderLayer.2.ffnLayerNorm.weight": "encoder.layer.2.output.LayerNorm.weight",
75
- "encoderLayer.2.ffnLayerNorm.bias": "encoder.layer.2.output.LayerNorm.bias",
76
- "encoderLayer.3.multiHeadAttention.q.weight": "encoder.layer.3.attention.self.query.weight",
77
- "encoderLayer.3.multiHeadAttention.q.bias": "encoder.layer.3.attention.self.query.bias",
78
- "encoderLayer.3.multiHeadAttention.k.weight": "encoder.layer.3.attention.self.key.weight",
79
- "encoderLayer.3.multiHeadAttention.k.bias": "encoder.layer.3.attention.self.key.bias",
80
- "encoderLayer.3.multiHeadAttention.v.weight": "encoder.layer.3.attention.self.value.weight",
81
- "encoderLayer.3.multiHeadAttention.v.bias": "encoder.layer.3.attention.self.value.bias",
82
- "encoderLayer.3.multiHeadAttention.o.weight": "encoder.layer.3.attention.output.dense.weight",
83
- "encoderLayer.3.multiHeadAttention.o.bias": "encoder.layer.3.attention.output.dense.bias",
84
- "encoderLayer.3.attnLayerNorm.weight": "encoder.layer.3.attention.output.LayerNorm.weight",
85
- "encoderLayer.3.attnLayerNorm.bias": "encoder.layer.3.attention.output.LayerNorm.bias",
86
- "encoderLayer.3.feedForward.intermediateDense.weight": "encoder.layer.3.intermediate.dense.weight",
87
- "encoderLayer.3.feedForward.intermediateDense.bias": "encoder.layer.3.intermediate.dense.bias",
88
- "encoderLayer.3.feedForward.outputDense.weight": "encoder.layer.3.output.dense.weight",
89
- "encoderLayer.3.feedForward.outputDense.bias": "encoder.layer.3.output.dense.bias",
90
- "encoderLayer.3.ffnLayerNorm.weight": "encoder.layer.3.output.LayerNorm.weight",
91
- "encoderLayer.3.ffnLayerNorm.bias": "encoder.layer.3.output.LayerNorm.bias",
92
- "encoderLayer.4.multiHeadAttention.q.weight": "encoder.layer.4.attention.self.query.weight",
93
- "encoderLayer.4.multiHeadAttention.q.bias": "encoder.layer.4.attention.self.query.bias",
94
- "encoderLayer.4.multiHeadAttention.k.weight": "encoder.layer.4.attention.self.key.weight",
95
- "encoderLayer.4.multiHeadAttention.k.bias": "encoder.layer.4.attention.self.key.bias",
96
- "encoderLayer.4.multiHeadAttention.v.weight": "encoder.layer.4.attention.self.value.weight",
97
- "encoderLayer.4.multiHeadAttention.v.bias": "encoder.layer.4.attention.self.value.bias",
98
- "encoderLayer.4.multiHeadAttention.o.weight": "encoder.layer.4.attention.output.dense.weight",
99
- "encoderLayer.4.multiHeadAttention.o.bias": "encoder.layer.4.attention.output.dense.bias",
100
- "encoderLayer.4.attnLayerNorm.weight": "encoder.layer.4.attention.output.LayerNorm.weight",
101
- "encoderLayer.4.attnLayerNorm.bias": "encoder.layer.4.attention.output.LayerNorm.bias",
102
- "encoderLayer.4.feedForward.intermediateDense.weight": "encoder.layer.4.intermediate.dense.weight",
103
- "encoderLayer.4.feedForward.intermediateDense.bias": "encoder.layer.4.intermediate.dense.bias",
104
- "encoderLayer.4.feedForward.outputDense.weight": "encoder.layer.4.output.dense.weight",
105
- "encoderLayer.4.feedForward.outputDense.bias": "encoder.layer.4.output.dense.bias",
106
- "encoderLayer.4.ffnLayerNorm.weight": "encoder.layer.4.output.LayerNorm.weight",
107
- "encoderLayer.4.ffnLayerNorm.bias": "encoder.layer.4.output.LayerNorm.bias",
108
- "encoderLayer.5.multiHeadAttention.q.weight": "encoder.layer.5.attention.self.query.weight",
109
- "encoderLayer.5.multiHeadAttention.q.bias": "encoder.layer.5.attention.self.query.bias",
110
- "encoderLayer.5.multiHeadAttention.k.weight": "encoder.layer.5.attention.self.key.weight",
111
- "encoderLayer.5.multiHeadAttention.k.bias": "encoder.layer.5.attention.self.key.bias",
112
- "encoderLayer.5.multiHeadAttention.v.weight": "encoder.layer.5.attention.self.value.weight",
113
- "encoderLayer.5.multiHeadAttention.v.bias": "encoder.layer.5.attention.self.value.bias",
114
- "encoderLayer.5.multiHeadAttention.o.weight": "encoder.layer.5.attention.output.dense.weight",
115
- "encoderLayer.5.multiHeadAttention.o.bias": "encoder.layer.5.attention.output.dense.bias",
116
- "encoderLayer.5.attnLayerNorm.weight": "encoder.layer.5.attention.output.LayerNorm.weight",
117
- "encoderLayer.5.attnLayerNorm.bias": "encoder.layer.5.attention.output.LayerNorm.bias",
118
- "encoderLayer.5.feedForward.intermediateDense.weight": "encoder.layer.5.intermediate.dense.weight",
119
- "encoderLayer.5.feedForward.intermediateDense.bias": "encoder.layer.5.intermediate.dense.bias",
120
- "encoderLayer.5.feedForward.outputDense.weight": "encoder.layer.5.output.dense.weight",
121
- "encoderLayer.5.feedForward.outputDense.bias": "encoder.layer.5.output.dense.bias",
122
- "encoderLayer.5.ffnLayerNorm.weight": "encoder.layer.5.output.LayerNorm.weight",
123
- "encoderLayer.5.ffnLayerNorm.bias": "encoder.layer.5.output.LayerNorm.bias",
124
- "encoderLayer.6.multiHeadAttention.q.weight": "encoder.layer.6.attention.self.query.weight",
125
- "encoderLayer.6.multiHeadAttention.q.bias": "encoder.layer.6.attention.self.query.bias",
126
- "encoderLayer.6.multiHeadAttention.k.weight": "encoder.layer.6.attention.self.key.weight",
127
- "encoderLayer.6.multiHeadAttention.k.bias": "encoder.layer.6.attention.self.key.bias",
128
- "encoderLayer.6.multiHeadAttention.v.weight": "encoder.layer.6.attention.self.value.weight",
129
- "encoderLayer.6.multiHeadAttention.v.bias": "encoder.layer.6.attention.self.value.bias",
130
- "encoderLayer.6.multiHeadAttention.o.weight": "encoder.layer.6.attention.output.dense.weight",
131
- "encoderLayer.6.multiHeadAttention.o.bias": "encoder.layer.6.attention.output.dense.bias",
132
- "encoderLayer.6.attnLayerNorm.weight": "encoder.layer.6.attention.output.LayerNorm.weight",
133
- "encoderLayer.6.attnLayerNorm.bias": "encoder.layer.6.attention.output.LayerNorm.bias",
134
- "encoderLayer.6.feedForward.intermediateDense.weight": "encoder.layer.6.intermediate.dense.weight",
135
- "encoderLayer.6.feedForward.intermediateDense.bias": "encoder.layer.6.intermediate.dense.bias",
136
- "encoderLayer.6.feedForward.outputDense.weight": "encoder.layer.6.output.dense.weight",
137
- "encoderLayer.6.feedForward.outputDense.bias": "encoder.layer.6.output.dense.bias",
138
- "encoderLayer.6.ffnLayerNorm.weight": "encoder.layer.6.output.LayerNorm.weight",
139
- "encoderLayer.6.ffnLayerNorm.bias": "encoder.layer.6.output.LayerNorm.bias",
140
- "encoderLayer.7.multiHeadAttention.q.weight": "encoder.layer.7.attention.self.query.weight",
141
- "encoderLayer.7.multiHeadAttention.q.bias": "encoder.layer.7.attention.self.query.bias",
142
- "encoderLayer.7.multiHeadAttention.k.weight": "encoder.layer.7.attention.self.key.weight",
143
- "encoderLayer.7.multiHeadAttention.k.bias": "encoder.layer.7.attention.self.key.bias",
144
- "encoderLayer.7.multiHeadAttention.v.weight": "encoder.layer.7.attention.self.value.weight",
145
- "encoderLayer.7.multiHeadAttention.v.bias": "encoder.layer.7.attention.self.value.bias",
146
- "encoderLayer.7.multiHeadAttention.o.weight": "encoder.layer.7.attention.output.dense.weight",
147
- "encoderLayer.7.multiHeadAttention.o.bias": "encoder.layer.7.attention.output.dense.bias",
148
- "encoderLayer.7.attnLayerNorm.weight": "encoder.layer.7.attention.output.LayerNorm.weight",
149
- "encoderLayer.7.attnLayerNorm.bias": "encoder.layer.7.attention.output.LayerNorm.bias",
150
- "encoderLayer.7.feedForward.intermediateDense.weight": "encoder.layer.7.intermediate.dense.weight",
151
- "encoderLayer.7.feedForward.intermediateDense.bias": "encoder.layer.7.intermediate.dense.bias",
152
- "encoderLayer.7.feedForward.outputDense.weight": "encoder.layer.7.output.dense.weight",
153
- "encoderLayer.7.feedForward.outputDense.bias": "encoder.layer.7.output.dense.bias",
154
- "encoderLayer.7.ffnLayerNorm.weight": "encoder.layer.7.output.LayerNorm.weight",
155
- "encoderLayer.7.ffnLayerNorm.bias": "encoder.layer.7.output.LayerNorm.bias",
156
- "encoderLayer.8.multiHeadAttention.q.weight": "encoder.layer.8.attention.self.query.weight",
157
- "encoderLayer.8.multiHeadAttention.q.bias": "encoder.layer.8.attention.self.query.bias",
158
- "encoderLayer.8.multiHeadAttention.k.weight": "encoder.layer.8.attention.self.key.weight",
159
- "encoderLayer.8.multiHeadAttention.k.bias": "encoder.layer.8.attention.self.key.bias",
160
- "encoderLayer.8.multiHeadAttention.v.weight": "encoder.layer.8.attention.self.value.weight",
161
- "encoderLayer.8.multiHeadAttention.v.bias": "encoder.layer.8.attention.self.value.bias",
162
- "encoderLayer.8.multiHeadAttention.o.weight": "encoder.layer.8.attention.output.dense.weight",
163
- "encoderLayer.8.multiHeadAttention.o.bias": "encoder.layer.8.attention.output.dense.bias",
164
- "encoderLayer.8.attnLayerNorm.weight": "encoder.layer.8.attention.output.LayerNorm.weight",
165
- "encoderLayer.8.attnLayerNorm.bias": "encoder.layer.8.attention.output.LayerNorm.bias",
166
- "encoderLayer.8.feedForward.intermediateDense.weight": "encoder.layer.8.intermediate.dense.weight",
167
- "encoderLayer.8.feedForward.intermediateDense.bias": "encoder.layer.8.intermediate.dense.bias",
168
- "encoderLayer.8.feedForward.outputDense.weight": "encoder.layer.8.output.dense.weight",
169
- "encoderLayer.8.feedForward.outputDense.bias": "encoder.layer.8.output.dense.bias",
170
- "encoderLayer.8.ffnLayerNorm.weight": "encoder.layer.8.output.LayerNorm.weight",
171
- "encoderLayer.8.ffnLayerNorm.bias": "encoder.layer.8.output.LayerNorm.bias",
172
- "encoderLayer.9.multiHeadAttention.q.weight": "encoder.layer.9.attention.self.query.weight",
173
- "encoderLayer.9.multiHeadAttention.q.bias": "encoder.layer.9.attention.self.query.bias",
174
- "encoderLayer.9.multiHeadAttention.k.weight": "encoder.layer.9.attention.self.key.weight",
175
- "encoderLayer.9.multiHeadAttention.k.bias": "encoder.layer.9.attention.self.key.bias",
176
- "encoderLayer.9.multiHeadAttention.v.weight": "encoder.layer.9.attention.self.value.weight",
177
- "encoderLayer.9.multiHeadAttention.v.bias": "encoder.layer.9.attention.self.value.bias",
178
- "encoderLayer.9.multiHeadAttention.o.weight": "encoder.layer.9.attention.output.dense.weight",
179
- "encoderLayer.9.multiHeadAttention.o.bias": "encoder.layer.9.attention.output.dense.bias",
180
- "encoderLayer.9.attnLayerNorm.weight": "encoder.layer.9.attention.output.LayerNorm.weight",
181
- "encoderLayer.9.attnLayerNorm.bias": "encoder.layer.9.attention.output.LayerNorm.bias",
182
- "encoderLayer.9.feedForward.intermediateDense.weight": "encoder.layer.9.intermediate.dense.weight",
183
- "encoderLayer.9.feedForward.intermediateDense.bias": "encoder.layer.9.intermediate.dense.bias",
184
- "encoderLayer.9.feedForward.outputDense.weight": "encoder.layer.9.output.dense.weight",
185
- "encoderLayer.9.feedForward.outputDense.bias": "encoder.layer.9.output.dense.bias",
186
- "encoderLayer.9.ffnLayerNorm.weight": "encoder.layer.9.output.LayerNorm.weight",
187
- "encoderLayer.9.ffnLayerNorm.bias": "encoder.layer.9.output.LayerNorm.bias",
188
- "encoderLayer.10.multiHeadAttention.q.weight": "encoder.layer.10.attention.self.query.weight",
189
- "encoderLayer.10.multiHeadAttention.q.bias": "encoder.layer.10.attention.self.query.bias",
190
- "encoderLayer.10.multiHeadAttention.k.weight": "encoder.layer.10.attention.self.key.weight",
191
- "encoderLayer.10.multiHeadAttention.k.bias": "encoder.layer.10.attention.self.key.bias",
192
- "encoderLayer.10.multiHeadAttention.v.weight": "encoder.layer.10.attention.self.value.weight",
193
- "encoderLayer.10.multiHeadAttention.v.bias": "encoder.layer.10.attention.self.value.bias",
194
- "encoderLayer.10.multiHeadAttention.o.weight": "encoder.layer.10.attention.output.dense.weight",
195
- "encoderLayer.10.multiHeadAttention.o.bias": "encoder.layer.10.attention.output.dense.bias",
196
- "encoderLayer.10.attnLayerNorm.weight": "encoder.layer.10.attention.output.LayerNorm.weight",
197
- "encoderLayer.10.attnLayerNorm.bias": "encoder.layer.10.attention.output.LayerNorm.bias",
198
- "encoderLayer.10.feedForward.intermediateDense.weight": "encoder.layer.10.intermediate.dense.weight",
199
- "encoderLayer.10.feedForward.intermediateDense.bias": "encoder.layer.10.intermediate.dense.bias",
200
- "encoderLayer.10.feedForward.outputDense.weight": "encoder.layer.10.output.dense.weight",
201
- "encoderLayer.10.feedForward.outputDense.bias": "encoder.layer.10.output.dense.bias",
202
- "encoderLayer.10.ffnLayerNorm.weight": "encoder.layer.10.output.LayerNorm.weight",
203
- "encoderLayer.10.ffnLayerNorm.bias": "encoder.layer.10.output.LayerNorm.bias",
204
- "encoderLayer.11.multiHeadAttention.q.weight": "encoder.layer.11.attention.self.query.weight",
205
- "encoderLayer.11.multiHeadAttention.q.bias": "encoder.layer.11.attention.self.query.bias",
206
- "encoderLayer.11.multiHeadAttention.k.weight": "encoder.layer.11.attention.self.key.weight",
207
- "encoderLayer.11.multiHeadAttention.k.bias": "encoder.layer.11.attention.self.key.bias",
208
- "encoderLayer.11.multiHeadAttention.v.weight": "encoder.layer.11.attention.self.value.weight",
209
- "encoderLayer.11.multiHeadAttention.v.bias": "encoder.layer.11.attention.self.value.bias",
210
- "encoderLayer.11.multiHeadAttention.o.weight": "encoder.layer.11.attention.output.dense.weight",
211
- "encoderLayer.11.multiHeadAttention.o.bias": "encoder.layer.11.attention.output.dense.bias",
212
- "encoderLayer.11.attnLayerNorm.weight": "encoder.layer.11.attention.output.LayerNorm.weight",
213
- "encoderLayer.11.attnLayerNorm.bias": "encoder.layer.11.attention.output.LayerNorm.bias",
214
- "encoderLayer.11.feedForward.intermediateDense.weight": "encoder.layer.11.intermediate.dense.weight",
215
- "encoderLayer.11.feedForward.intermediateDense.bias": "encoder.layer.11.intermediate.dense.bias",
216
- "encoderLayer.11.feedForward.outputDense.weight": "encoder.layer.11.output.dense.weight",
217
- "encoderLayer.11.feedForward.outputDense.bias": "encoder.layer.11.output.dense.bias",
218
- "encoderLayer.11.ffnLayerNorm.weight": "encoder.layer.11.output.LayerNorm.weight",
219
- "encoderLayer.11.ffnLayerNorm.bias": "encoder.layer.11.output.LayerNorm.bias"
220
- }
221
- }
 
 
 
1
  {
2
+ "attention_probs_dropout_prob": 0.1,
3
+ "hidden_act": "gelu",
4
+ "hidden_dropout_prob": 0.1,
5
+ "hidden_size": 512,
6
+ "initializer_range": 0.02,
7
+ "intermediate_size": 2048,
8
+ "layer_norm_eps": 1e-12,
9
+ "max_position_embeddings": 512,
10
+ "model": "bert",
11
+ "num_attention_heads": 8,
12
+ "num_hidden_layers": 4,
13
+ "pad_token_id": 0,
14
+ "torch_dtype": "float32",
15
+ "type_vocab_size": 2,
16
+ "vocab_size": 21128,
17
+ "with_pool": true,
18
+ "pooling": {
19
+ "pool_strategy": "cls"
20
+ },
21
+ "norm_mode": "torch_buildin",
22
+ "mapping": {
23
+ "embeddings.word_embeddings.weight": "embeddings.word_embeddings.weight",
24
+ "embeddings.position_embeddings.weight": "embeddings.position_embeddings.weight",
25
+ "embeddings.segment_embeddings.weight": "embeddings.token_type_embeddings.weight",
26
+ "embeddings.layerNorm.weight": "embeddings.LayerNorm.weight",
27
+ "embeddings.layerNorm.bias": "embeddings.LayerNorm.bias",
28
+ "pooler.weight": "pooler.dense.weight",
29
+ "pooler.bias": "pooler.dense.bias",
30
+ "encoderLayer.0.multiHeadAttention.q.weight": "encoder.layer.0.attention.self.query.weight",
31
+ "encoderLayer.0.multiHeadAttention.q.bias": "encoder.layer.0.attention.self.query.bias",
32
+ "encoderLayer.0.multiHeadAttention.k.weight": "encoder.layer.0.attention.self.key.weight",
33
+ "encoderLayer.0.multiHeadAttention.k.bias": "encoder.layer.0.attention.self.key.bias",
34
+ "encoderLayer.0.multiHeadAttention.v.weight": "encoder.layer.0.attention.self.value.weight",
35
+ "encoderLayer.0.multiHeadAttention.v.bias": "encoder.layer.0.attention.self.value.bias",
36
+ "encoderLayer.0.multiHeadAttention.o.weight": "encoder.layer.0.attention.output.dense.weight",
37
+ "encoderLayer.0.multiHeadAttention.o.bias": "encoder.layer.0.attention.output.dense.bias",
38
+ "encoderLayer.0.attnLayerNorm.weight": "encoder.layer.0.attention.output.LayerNorm.weight",
39
+ "encoderLayer.0.attnLayerNorm.bias": "encoder.layer.0.attention.output.LayerNorm.bias",
40
+ "encoderLayer.0.feedForward.intermediateDense.weight": "encoder.layer.0.intermediate.dense.weight",
41
+ "encoderLayer.0.feedForward.intermediateDense.bias": "encoder.layer.0.intermediate.dense.bias",
42
+ "encoderLayer.0.feedForward.outputDense.weight": "encoder.layer.0.output.dense.weight",
43
+ "encoderLayer.0.feedForward.outputDense.bias": "encoder.layer.0.output.dense.bias",
44
+ "encoderLayer.0.ffnLayerNorm.weight": "encoder.layer.0.output.LayerNorm.weight",
45
+ "encoderLayer.0.ffnLayerNorm.bias": "encoder.layer.0.output.LayerNorm.bias",
46
+ "encoderLayer.1.multiHeadAttention.q.weight": "encoder.layer.1.attention.self.query.weight",
47
+ "encoderLayer.1.multiHeadAttention.q.bias": "encoder.layer.1.attention.self.query.bias",
48
+ "encoderLayer.1.multiHeadAttention.k.weight": "encoder.layer.1.attention.self.key.weight",
49
+ "encoderLayer.1.multiHeadAttention.k.bias": "encoder.layer.1.attention.self.key.bias",
50
+ "encoderLayer.1.multiHeadAttention.v.weight": "encoder.layer.1.attention.self.value.weight",
51
+ "encoderLayer.1.multiHeadAttention.v.bias": "encoder.layer.1.attention.self.value.bias",
52
+ "encoderLayer.1.multiHeadAttention.o.weight": "encoder.layer.1.attention.output.dense.weight",
53
+ "encoderLayer.1.multiHeadAttention.o.bias": "encoder.layer.1.attention.output.dense.bias",
54
+ "encoderLayer.1.attnLayerNorm.weight": "encoder.layer.1.attention.output.LayerNorm.weight",
55
+ "encoderLayer.1.attnLayerNorm.bias": "encoder.layer.1.attention.output.LayerNorm.bias",
56
+ "encoderLayer.1.feedForward.intermediateDense.weight": "encoder.layer.1.intermediate.dense.weight",
57
+ "encoderLayer.1.feedForward.intermediateDense.bias": "encoder.layer.1.intermediate.dense.bias",
58
+ "encoderLayer.1.feedForward.outputDense.weight": "encoder.layer.1.output.dense.weight",
59
+ "encoderLayer.1.feedForward.outputDense.bias": "encoder.layer.1.output.dense.bias",
60
+ "encoderLayer.1.ffnLayerNorm.weight": "encoder.layer.1.output.LayerNorm.weight",
61
+ "encoderLayer.1.ffnLayerNorm.bias": "encoder.layer.1.output.LayerNorm.bias",
62
+ "encoderLayer.2.multiHeadAttention.q.weight": "encoder.layer.2.attention.self.query.weight",
63
+ "encoderLayer.2.multiHeadAttention.q.bias": "encoder.layer.2.attention.self.query.bias",
64
+ "encoderLayer.2.multiHeadAttention.k.weight": "encoder.layer.2.attention.self.key.weight",
65
+ "encoderLayer.2.multiHeadAttention.k.bias": "encoder.layer.2.attention.self.key.bias",
66
+ "encoderLayer.2.multiHeadAttention.v.weight": "encoder.layer.2.attention.self.value.weight",
67
+ "encoderLayer.2.multiHeadAttention.v.bias": "encoder.layer.2.attention.self.value.bias",
68
+ "encoderLayer.2.multiHeadAttention.o.weight": "encoder.layer.2.attention.output.dense.weight",
69
+ "encoderLayer.2.multiHeadAttention.o.bias": "encoder.layer.2.attention.output.dense.bias",
70
+ "encoderLayer.2.attnLayerNorm.weight": "encoder.layer.2.attention.output.LayerNorm.weight",
71
+ "encoderLayer.2.attnLayerNorm.bias": "encoder.layer.2.attention.output.LayerNorm.bias",
72
+ "encoderLayer.2.feedForward.intermediateDense.weight": "encoder.layer.2.intermediate.dense.weight",
73
+ "encoderLayer.2.feedForward.intermediateDense.bias": "encoder.layer.2.intermediate.dense.bias",
74
+ "encoderLayer.2.feedForward.outputDense.weight": "encoder.layer.2.output.dense.weight",
75
+ "encoderLayer.2.feedForward.outputDense.bias": "encoder.layer.2.output.dense.bias",
76
+ "encoderLayer.2.ffnLayerNorm.weight": "encoder.layer.2.output.LayerNorm.weight",
77
+ "encoderLayer.2.ffnLayerNorm.bias": "encoder.layer.2.output.LayerNorm.bias",
78
+ "encoderLayer.3.multiHeadAttention.q.weight": "encoder.layer.3.attention.self.query.weight",
79
+ "encoderLayer.3.multiHeadAttention.q.bias": "encoder.layer.3.attention.self.query.bias",
80
+ "encoderLayer.3.multiHeadAttention.k.weight": "encoder.layer.3.attention.self.key.weight",
81
+ "encoderLayer.3.multiHeadAttention.k.bias": "encoder.layer.3.attention.self.key.bias",
82
+ "encoderLayer.3.multiHeadAttention.v.weight": "encoder.layer.3.attention.self.value.weight",
83
+ "encoderLayer.3.multiHeadAttention.v.bias": "encoder.layer.3.attention.self.value.bias",
84
+ "encoderLayer.3.multiHeadAttention.o.weight": "encoder.layer.3.attention.output.dense.weight",
85
+ "encoderLayer.3.multiHeadAttention.o.bias": "encoder.layer.3.attention.output.dense.bias",
86
+ "encoderLayer.3.attnLayerNorm.weight": "encoder.layer.3.attention.output.LayerNorm.weight",
87
+ "encoderLayer.3.attnLayerNorm.bias": "encoder.layer.3.attention.output.LayerNorm.bias",
88
+ "encoderLayer.3.feedForward.intermediateDense.weight": "encoder.layer.3.intermediate.dense.weight",
89
+ "encoderLayer.3.feedForward.intermediateDense.bias": "encoder.layer.3.intermediate.dense.bias",
90
+ "encoderLayer.3.feedForward.outputDense.weight": "encoder.layer.3.output.dense.weight",
91
+ "encoderLayer.3.feedForward.outputDense.bias": "encoder.layer.3.output.dense.bias",
92
+ "encoderLayer.3.ffnLayerNorm.weight": "encoder.layer.3.output.LayerNorm.weight",
93
+ "encoderLayer.3.ffnLayerNorm.bias": "encoder.layer.3.output.LayerNorm.bias",
94
+ "encoderLayer.4.multiHeadAttention.q.weight": "encoder.layer.4.attention.self.query.weight",
95
+ "encoderLayer.4.multiHeadAttention.q.bias": "encoder.layer.4.attention.self.query.bias",
96
+ "encoderLayer.4.multiHeadAttention.k.weight": "encoder.layer.4.attention.self.key.weight",
97
+ "encoderLayer.4.multiHeadAttention.k.bias": "encoder.layer.4.attention.self.key.bias",
98
+ "encoderLayer.4.multiHeadAttention.v.weight": "encoder.layer.4.attention.self.value.weight",
99
+ "encoderLayer.4.multiHeadAttention.v.bias": "encoder.layer.4.attention.self.value.bias",
100
+ "encoderLayer.4.multiHeadAttention.o.weight": "encoder.layer.4.attention.output.dense.weight",
101
+ "encoderLayer.4.multiHeadAttention.o.bias": "encoder.layer.4.attention.output.dense.bias",
102
+ "encoderLayer.4.attnLayerNorm.weight": "encoder.layer.4.attention.output.LayerNorm.weight",
103
+ "encoderLayer.4.attnLayerNorm.bias": "encoder.layer.4.attention.output.LayerNorm.bias",
104
+ "encoderLayer.4.feedForward.intermediateDense.weight": "encoder.layer.4.intermediate.dense.weight",
105
+ "encoderLayer.4.feedForward.intermediateDense.bias": "encoder.layer.4.intermediate.dense.bias",
106
+ "encoderLayer.4.feedForward.outputDense.weight": "encoder.layer.4.output.dense.weight",
107
+ "encoderLayer.4.feedForward.outputDense.bias": "encoder.layer.4.output.dense.bias",
108
+ "encoderLayer.4.ffnLayerNorm.weight": "encoder.layer.4.output.LayerNorm.weight",
109
+ "encoderLayer.4.ffnLayerNorm.bias": "encoder.layer.4.output.LayerNorm.bias",
110
+ "encoderLayer.5.multiHeadAttention.q.weight": "encoder.layer.5.attention.self.query.weight",
111
+ "encoderLayer.5.multiHeadAttention.q.bias": "encoder.layer.5.attention.self.query.bias",
112
+ "encoderLayer.5.multiHeadAttention.k.weight": "encoder.layer.5.attention.self.key.weight",
113
+ "encoderLayer.5.multiHeadAttention.k.bias": "encoder.layer.5.attention.self.key.bias",
114
+ "encoderLayer.5.multiHeadAttention.v.weight": "encoder.layer.5.attention.self.value.weight",
115
+ "encoderLayer.5.multiHeadAttention.v.bias": "encoder.layer.5.attention.self.value.bias",
116
+ "encoderLayer.5.multiHeadAttention.o.weight": "encoder.layer.5.attention.output.dense.weight",
117
+ "encoderLayer.5.multiHeadAttention.o.bias": "encoder.layer.5.attention.output.dense.bias",
118
+ "encoderLayer.5.attnLayerNorm.weight": "encoder.layer.5.attention.output.LayerNorm.weight",
119
+ "encoderLayer.5.attnLayerNorm.bias": "encoder.layer.5.attention.output.LayerNorm.bias",
120
+ "encoderLayer.5.feedForward.intermediateDense.weight": "encoder.layer.5.intermediate.dense.weight",
121
+ "encoderLayer.5.feedForward.intermediateDense.bias": "encoder.layer.5.intermediate.dense.bias",
122
+ "encoderLayer.5.feedForward.outputDense.weight": "encoder.layer.5.output.dense.weight",
123
+ "encoderLayer.5.feedForward.outputDense.bias": "encoder.layer.5.output.dense.bias",
124
+ "encoderLayer.5.ffnLayerNorm.weight": "encoder.layer.5.output.LayerNorm.weight",
125
+ "encoderLayer.5.ffnLayerNorm.bias": "encoder.layer.5.output.LayerNorm.bias",
126
+ "encoderLayer.6.multiHeadAttention.q.weight": "encoder.layer.6.attention.self.query.weight",
127
+ "encoderLayer.6.multiHeadAttention.q.bias": "encoder.layer.6.attention.self.query.bias",
128
+ "encoderLayer.6.multiHeadAttention.k.weight": "encoder.layer.6.attention.self.key.weight",
129
+ "encoderLayer.6.multiHeadAttention.k.bias": "encoder.layer.6.attention.self.key.bias",
130
+ "encoderLayer.6.multiHeadAttention.v.weight": "encoder.layer.6.attention.self.value.weight",
131
+ "encoderLayer.6.multiHeadAttention.v.bias": "encoder.layer.6.attention.self.value.bias",
132
+ "encoderLayer.6.multiHeadAttention.o.weight": "encoder.layer.6.attention.output.dense.weight",
133
+ "encoderLayer.6.multiHeadAttention.o.bias": "encoder.layer.6.attention.output.dense.bias",
134
+ "encoderLayer.6.attnLayerNorm.weight": "encoder.layer.6.attention.output.LayerNorm.weight",
135
+ "encoderLayer.6.attnLayerNorm.bias": "encoder.layer.6.attention.output.LayerNorm.bias",
136
+ "encoderLayer.6.feedForward.intermediateDense.weight": "encoder.layer.6.intermediate.dense.weight",
137
+ "encoderLayer.6.feedForward.intermediateDense.bias": "encoder.layer.6.intermediate.dense.bias",
138
+ "encoderLayer.6.feedForward.outputDense.weight": "encoder.layer.6.output.dense.weight",
139
+ "encoderLayer.6.feedForward.outputDense.bias": "encoder.layer.6.output.dense.bias",
140
+ "encoderLayer.6.ffnLayerNorm.weight": "encoder.layer.6.output.LayerNorm.weight",
141
+ "encoderLayer.6.ffnLayerNorm.bias": "encoder.layer.6.output.LayerNorm.bias",
142
+ "encoderLayer.7.multiHeadAttention.q.weight": "encoder.layer.7.attention.self.query.weight",
143
+ "encoderLayer.7.multiHeadAttention.q.bias": "encoder.layer.7.attention.self.query.bias",
144
+ "encoderLayer.7.multiHeadAttention.k.weight": "encoder.layer.7.attention.self.key.weight",
145
+ "encoderLayer.7.multiHeadAttention.k.bias": "encoder.layer.7.attention.self.key.bias",
146
+ "encoderLayer.7.multiHeadAttention.v.weight": "encoder.layer.7.attention.self.value.weight",
147
+ "encoderLayer.7.multiHeadAttention.v.bias": "encoder.layer.7.attention.self.value.bias",
148
+ "encoderLayer.7.multiHeadAttention.o.weight": "encoder.layer.7.attention.output.dense.weight",
149
+ "encoderLayer.7.multiHeadAttention.o.bias": "encoder.layer.7.attention.output.dense.bias",
150
+ "encoderLayer.7.attnLayerNorm.weight": "encoder.layer.7.attention.output.LayerNorm.weight",
151
+ "encoderLayer.7.attnLayerNorm.bias": "encoder.layer.7.attention.output.LayerNorm.bias",
152
+ "encoderLayer.7.feedForward.intermediateDense.weight": "encoder.layer.7.intermediate.dense.weight",
153
+ "encoderLayer.7.feedForward.intermediateDense.bias": "encoder.layer.7.intermediate.dense.bias",
154
+ "encoderLayer.7.feedForward.outputDense.weight": "encoder.layer.7.output.dense.weight",
155
+ "encoderLayer.7.feedForward.outputDense.bias": "encoder.layer.7.output.dense.bias",
156
+ "encoderLayer.7.ffnLayerNorm.weight": "encoder.layer.7.output.LayerNorm.weight",
157
+ "encoderLayer.7.ffnLayerNorm.bias": "encoder.layer.7.output.LayerNorm.bias",
158
+ "encoderLayer.8.multiHeadAttention.q.weight": "encoder.layer.8.attention.self.query.weight",
159
+ "encoderLayer.8.multiHeadAttention.q.bias": "encoder.layer.8.attention.self.query.bias",
160
+ "encoderLayer.8.multiHeadAttention.k.weight": "encoder.layer.8.attention.self.key.weight",
161
+ "encoderLayer.8.multiHeadAttention.k.bias": "encoder.layer.8.attention.self.key.bias",
162
+ "encoderLayer.8.multiHeadAttention.v.weight": "encoder.layer.8.attention.self.value.weight",
163
+ "encoderLayer.8.multiHeadAttention.v.bias": "encoder.layer.8.attention.self.value.bias",
164
+ "encoderLayer.8.multiHeadAttention.o.weight": "encoder.layer.8.attention.output.dense.weight",
165
+ "encoderLayer.8.multiHeadAttention.o.bias": "encoder.layer.8.attention.output.dense.bias",
166
+ "encoderLayer.8.attnLayerNorm.weight": "encoder.layer.8.attention.output.LayerNorm.weight",
167
+ "encoderLayer.8.attnLayerNorm.bias": "encoder.layer.8.attention.output.LayerNorm.bias",
168
+ "encoderLayer.8.feedForward.intermediateDense.weight": "encoder.layer.8.intermediate.dense.weight",
169
+ "encoderLayer.8.feedForward.intermediateDense.bias": "encoder.layer.8.intermediate.dense.bias",
170
+ "encoderLayer.8.feedForward.outputDense.weight": "encoder.layer.8.output.dense.weight",
171
+ "encoderLayer.8.feedForward.outputDense.bias": "encoder.layer.8.output.dense.bias",
172
+ "encoderLayer.8.ffnLayerNorm.weight": "encoder.layer.8.output.LayerNorm.weight",
173
+ "encoderLayer.8.ffnLayerNorm.bias": "encoder.layer.8.output.LayerNorm.bias",
174
+ "encoderLayer.9.multiHeadAttention.q.weight": "encoder.layer.9.attention.self.query.weight",
175
+ "encoderLayer.9.multiHeadAttention.q.bias": "encoder.layer.9.attention.self.query.bias",
176
+ "encoderLayer.9.multiHeadAttention.k.weight": "encoder.layer.9.attention.self.key.weight",
177
+ "encoderLayer.9.multiHeadAttention.k.bias": "encoder.layer.9.attention.self.key.bias",
178
+ "encoderLayer.9.multiHeadAttention.v.weight": "encoder.layer.9.attention.self.value.weight",
179
+ "encoderLayer.9.multiHeadAttention.v.bias": "encoder.layer.9.attention.self.value.bias",
180
+ "encoderLayer.9.multiHeadAttention.o.weight": "encoder.layer.9.attention.output.dense.weight",
181
+ "encoderLayer.9.multiHeadAttention.o.bias": "encoder.layer.9.attention.output.dense.bias",
182
+ "encoderLayer.9.attnLayerNorm.weight": "encoder.layer.9.attention.output.LayerNorm.weight",
183
+ "encoderLayer.9.attnLayerNorm.bias": "encoder.layer.9.attention.output.LayerNorm.bias",
184
+ "encoderLayer.9.feedForward.intermediateDense.weight": "encoder.layer.9.intermediate.dense.weight",
185
+ "encoderLayer.9.feedForward.intermediateDense.bias": "encoder.layer.9.intermediate.dense.bias",
186
+ "encoderLayer.9.feedForward.outputDense.weight": "encoder.layer.9.output.dense.weight",
187
+ "encoderLayer.9.feedForward.outputDense.bias": "encoder.layer.9.output.dense.bias",
188
+ "encoderLayer.9.ffnLayerNorm.weight": "encoder.layer.9.output.LayerNorm.weight",
189
+ "encoderLayer.9.ffnLayerNorm.bias": "encoder.layer.9.output.LayerNorm.bias",
190
+ "encoderLayer.10.multiHeadAttention.q.weight": "encoder.layer.10.attention.self.query.weight",
191
+ "encoderLayer.10.multiHeadAttention.q.bias": "encoder.layer.10.attention.self.query.bias",
192
+ "encoderLayer.10.multiHeadAttention.k.weight": "encoder.layer.10.attention.self.key.weight",
193
+ "encoderLayer.10.multiHeadAttention.k.bias": "encoder.layer.10.attention.self.key.bias",
194
+ "encoderLayer.10.multiHeadAttention.v.weight": "encoder.layer.10.attention.self.value.weight",
195
+ "encoderLayer.10.multiHeadAttention.v.bias": "encoder.layer.10.attention.self.value.bias",
196
+ "encoderLayer.10.multiHeadAttention.o.weight": "encoder.layer.10.attention.output.dense.weight",
197
+ "encoderLayer.10.multiHeadAttention.o.bias": "encoder.layer.10.attention.output.dense.bias",
198
+ "encoderLayer.10.attnLayerNorm.weight": "encoder.layer.10.attention.output.LayerNorm.weight",
199
+ "encoderLayer.10.attnLayerNorm.bias": "encoder.layer.10.attention.output.LayerNorm.bias",
200
+ "encoderLayer.10.feedForward.intermediateDense.weight": "encoder.layer.10.intermediate.dense.weight",
201
+ "encoderLayer.10.feedForward.intermediateDense.bias": "encoder.layer.10.intermediate.dense.bias",
202
+ "encoderLayer.10.feedForward.outputDense.weight": "encoder.layer.10.output.dense.weight",
203
+ "encoderLayer.10.feedForward.outputDense.bias": "encoder.layer.10.output.dense.bias",
204
+ "encoderLayer.10.ffnLayerNorm.weight": "encoder.layer.10.output.LayerNorm.weight",
205
+ "encoderLayer.10.ffnLayerNorm.bias": "encoder.layer.10.output.LayerNorm.bias",
206
+ "encoderLayer.11.multiHeadAttention.q.weight": "encoder.layer.11.attention.self.query.weight",
207
+ "encoderLayer.11.multiHeadAttention.q.bias": "encoder.layer.11.attention.self.query.bias",
208
+ "encoderLayer.11.multiHeadAttention.k.weight": "encoder.layer.11.attention.self.key.weight",
209
+ "encoderLayer.11.multiHeadAttention.k.bias": "encoder.layer.11.attention.self.key.bias",
210
+ "encoderLayer.11.multiHeadAttention.v.weight": "encoder.layer.11.attention.self.value.weight",
211
+ "encoderLayer.11.multiHeadAttention.v.bias": "encoder.layer.11.attention.self.value.bias",
212
+ "encoderLayer.11.multiHeadAttention.o.weight": "encoder.layer.11.attention.output.dense.weight",
213
+ "encoderLayer.11.multiHeadAttention.o.bias": "encoder.layer.11.attention.output.dense.bias",
214
+ "encoderLayer.11.attnLayerNorm.weight": "encoder.layer.11.attention.output.LayerNorm.weight",
215
+ "encoderLayer.11.attnLayerNorm.bias": "encoder.layer.11.attention.output.LayerNorm.bias",
216
+ "encoderLayer.11.feedForward.intermediateDense.weight": "encoder.layer.11.intermediate.dense.weight",
217
+ "encoderLayer.11.feedForward.intermediateDense.bias": "encoder.layer.11.intermediate.dense.bias",
218
+ "encoderLayer.11.feedForward.outputDense.weight": "encoder.layer.11.output.dense.weight",
219
+ "encoderLayer.11.feedForward.outputDense.bias": "encoder.layer.11.output.dense.bias",
220
+ "encoderLayer.11.ffnLayerNorm.weight": "encoder.layer.11.output.LayerNorm.weight",
221
+ "encoderLayer.11.ffnLayerNorm.bias": "encoder.layer.11.output.LayerNorm.bias"
222
+ }
223
+ }
BelleGroup/BELLE-LLaMA-7B-2M-enc/bert4torch_config.json CHANGED
@@ -2,7 +2,7 @@
2
  "model": "llama",
3
  "template": "belle",
4
  "hidden_size": 4096,
5
- "intermediate_size": 11008,
6
  "num_attention_heads": 32,
7
  "num_hidden_layers": 32,
8
  "layer_norm_eps": 1e-06,
@@ -12,5 +12,11 @@
12
  "skip_init": true,
13
  "torch_dtype": "float16",
14
  "rope_rank": "updown",
15
- "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "max_length": 2048, "eos_token_id": 2}
 
 
 
 
 
 
16
  }
 
2
  "model": "llama",
3
  "template": "belle",
4
  "hidden_size": 4096,
5
+ "intermediate_size": 11008,
6
  "num_attention_heads": 32,
7
  "num_hidden_layers": 32,
8
  "layer_norm_eps": 1e-06,
 
12
  "skip_init": true,
13
  "torch_dtype": "float16",
14
  "rope_rank": "updown",
15
+ "generation_config": {
16
+ "tokenizer_config": {
17
+ "skip_special_tokens": true
18
+ },
19
+ "max_length": 2048,
20
+ "eos_token_id": 2
21
+ }
22
  }
ClueAI/ChatYuan-large-v1/bert4torch_config.json CHANGED
@@ -1,17 +1,17 @@
1
  {
2
- "model": "mt5.1.1",
3
- "hidden_act": "gelu",
4
- "hidden_dropout_prob": 0.1,
5
- "hidden_size": 1024,
6
- "intermediate_size": 2816,
7
- "num_attention_heads": 16,
8
- "attention_head_size": 64,
9
- "num_hidden_layers": 24,
10
- "vocab_size": 32128,
11
- "relative_attention_num_buckets": 32,
12
- "attention_scale": false,
13
- "is_dropout": true,
14
- "max_position_embeddings": 768,
15
- "segment_vocab_size": 0,
16
- "logit_scale": false
17
- }
 
1
  {
2
+ "model": "mt5.1.1",
3
+ "hidden_act": "gelu",
4
+ "hidden_dropout_prob": 0.1,
5
+ "hidden_size": 1024,
6
+ "intermediate_size": 2816,
7
+ "num_attention_heads": 16,
8
+ "attention_head_size": 64,
9
+ "num_hidden_layers": 24,
10
+ "vocab_size": 32128,
11
+ "relative_attention_num_buckets": 32,
12
+ "attention_scale": false,
13
+ "is_dropout": true,
14
+ "max_position_embeddings": 768,
15
+ "segment_vocab_size": 0,
16
+ "logit_scale": false
17
+ }
ClueAI/ChatYuan-large-v2/bert4torch_config.json CHANGED
@@ -1,17 +1,17 @@
1
- {
2
- "model": "mt5.1.1",
3
- "hidden_act": "gelu",
4
- "hidden_dropout_prob": 0.1,
5
- "hidden_size": 1024,
6
- "intermediate_size": 2816,
7
- "num_attention_heads": 16,
8
- "attention_head_size": 64,
9
- "num_hidden_layers": 24,
10
- "vocab_size": 32128,
11
- "relative_attention_num_buckets": 32,
12
- "attention_scale": false,
13
- "is_dropout": true,
14
- "max_position_embeddings": 768,
15
- "segment_vocab_size": 0,
16
- "logit_scale": false
17
- }
 
1
+ {
2
+ "model": "mt5.1.1",
3
+ "hidden_act": "gelu",
4
+ "hidden_dropout_prob": 0.1,
5
+ "hidden_size": 1024,
6
+ "intermediate_size": 2816,
7
+ "num_attention_heads": 16,
8
+ "attention_head_size": 64,
9
+ "num_hidden_layers": 24,
10
+ "vocab_size": 32128,
11
+ "relative_attention_num_buckets": 32,
12
+ "attention_scale": false,
13
+ "is_dropout": true,
14
+ "max_position_embeddings": 768,
15
+ "segment_vocab_size": 0,
16
+ "logit_scale": false
17
+ }
ClueAI/PromptCLUE-base-v1-5/bert4torch_config.json CHANGED
@@ -1,17 +1,17 @@
1
  {
2
- "model": "mt5.1.1",
3
- "hidden_act": "gelu",
4
- "hidden_dropout_prob": 0.1,
5
- "hidden_size": 768,
6
- "intermediate_size": 2048,
7
- "num_attention_heads": 12,
8
- "attention_head_size": 64,
9
- "num_hidden_layers": 12,
10
- "vocab_size": 32128,
11
- "relative_attention_num_buckets": 32,
12
- "attention_scale": false,
13
- "is_dropout": true,
14
- "max_position_embeddings": 768,
15
- "segment_vocab_size": 0,
16
- "logit_scale": false
17
  }
 
1
  {
2
+ "model": "mt5.1.1",
3
+ "hidden_act": "gelu",
4
+ "hidden_dropout_prob": 0.1,
5
+ "hidden_size": 768,
6
+ "intermediate_size": 2048,
7
+ "num_attention_heads": 12,
8
+ "attention_head_size": 64,
9
+ "num_hidden_layers": 12,
10
+ "vocab_size": 32128,
11
+ "relative_attention_num_buckets": 32,
12
+ "attention_scale": false,
13
+ "is_dropout": true,
14
+ "max_position_embeddings": 768,
15
+ "segment_vocab_size": 0,
16
+ "logit_scale": false
17
  }
ClueAI/PromptCLUE-base/bert4torch_config.json CHANGED
@@ -1,17 +1,17 @@
1
- {
2
- "model": "mt5.1.1",
3
- "hidden_act": "gelu",
4
- "hidden_dropout_prob": 0.1,
5
- "hidden_size": 768,
6
- "intermediate_size": 2048,
7
- "num_attention_heads": 12,
8
- "attention_head_size": 64,
9
- "num_hidden_layers": 12,
10
- "vocab_size": 32128,
11
- "relative_attention_num_buckets": 32,
12
- "attention_scale": false,
13
- "is_dropout": true,
14
- "max_position_embeddings": 768,
15
- "segment_vocab_size": 0,
16
- "logit_scale": false
17
- }
 
1
+ {
2
+ "model": "mt5.1.1",
3
+ "hidden_act": "gelu",
4
+ "hidden_dropout_prob": 0.1,
5
+ "hidden_size": 768,
6
+ "intermediate_size": 2048,
7
+ "num_attention_heads": 12,
8
+ "attention_head_size": 64,
9
+ "num_hidden_layers": 12,
10
+ "vocab_size": 32128,
11
+ "relative_attention_num_buckets": 32,
12
+ "attention_scale": false,
13
+ "is_dropout": true,
14
+ "max_position_embeddings": 768,
15
+ "segment_vocab_size": 0,
16
+ "logit_scale": false
17
+ }
FacebookAI/roberta-base/bert4torch_config.json CHANGED
@@ -1,226 +1,225 @@
1
- {
2
- "attention_probs_dropout_prob": 0.1,
3
- "bos_token_id": 0,
4
- "eos_token_id": 2,
5
- "hidden_act": "gelu",
6
- "hidden_dropout_prob": 0.1,
7
- "hidden_size": 768,
8
- "initializer_range": 0.02,
9
- "intermediate_size": 3072,
10
- "layer_norm_eps": 1e-05,
11
- "max_position_embeddings": 514,
12
- "model_type": "roberta",
13
- "num_attention_heads": 12,
14
- "num_hidden_layers": 12,
15
- "pad_token_id": 1,
16
- "type_vocab_size": 1,
17
- "vocab_size": 50265,
18
- "custom_position_ids": "start_at_padding",
19
- "mapping": {
20
- "embeddings.word_embeddings.weight": "roberta.embeddings.word_embeddings.weight",
21
- "embeddings.position_embeddings.weight": "roberta.embeddings.position_embeddings.weight",
22
- "embeddings.segment_embeddings.weight": "roberta.embeddings.token_type_embeddings.weight",
23
- "embeddings.layerNorm.weight": "roberta.embeddings.LayerNorm.weight",
24
- "embeddings.layerNorm.bias": "roberta.embeddings.LayerNorm.bias",
25
- "mlmDense.weight": "lm_head.dense.weight",
26
- "mlmDense.bias": "lm_head.dense.bias",
27
- "mlmLayerNorm.weight": "lm_head.layer_norm.weight",
28
- "mlmLayerNorm.bias": "lm_head.layer_norm.bias",
29
- "mlmBias": "lm_head.bias",
30
- "mlmDecoder.weight": "lm_head.decoder.weight",
31
- "mlmDecoder.bias": "lm_head.bias",
32
- "encoderLayer.0.multiHeadAttention.q.weight": "roberta.encoder.layer.0.attention.self.query.weight",
33
- "encoderLayer.0.multiHeadAttention.q.bias": "roberta.encoder.layer.0.attention.self.query.bias",
34
- "encoderLayer.0.multiHeadAttention.k.weight": "roberta.encoder.layer.0.attention.self.key.weight",
35
- "encoderLayer.0.multiHeadAttention.k.bias": "roberta.encoder.layer.0.attention.self.key.bias",
36
- "encoderLayer.0.multiHeadAttention.v.weight": "roberta.encoder.layer.0.attention.self.value.weight",
37
- "encoderLayer.0.multiHeadAttention.v.bias": "roberta.encoder.layer.0.attention.self.value.bias",
38
- "encoderLayer.0.multiHeadAttention.o.weight": "roberta.encoder.layer.0.attention.output.dense.weight",
39
- "encoderLayer.0.multiHeadAttention.o.bias": "roberta.encoder.layer.0.attention.output.dense.bias",
40
- "encoderLayer.0.attnLayerNorm.weight": "roberta.encoder.layer.0.attention.output.LayerNorm.weight",
41
- "encoderLayer.0.attnLayerNorm.bias": "roberta.encoder.layer.0.attention.output.LayerNorm.bias",
42
- "encoderLayer.0.feedForward.intermediateDense.weight": "roberta.encoder.layer.0.intermediate.dense.weight",
43
- "encoderLayer.0.feedForward.intermediateDense.bias": "roberta.encoder.layer.0.intermediate.dense.bias",
44
- "encoderLayer.0.feedForward.outputDense.weight": "roberta.encoder.layer.0.output.dense.weight",
45
- "encoderLayer.0.feedForward.outputDense.bias": "roberta.encoder.layer.0.output.dense.bias",
46
- "encoderLayer.0.ffnLayerNorm.weight": "roberta.encoder.layer.0.output.LayerNorm.weight",
47
- "encoderLayer.0.ffnLayerNorm.bias": "roberta.encoder.layer.0.output.LayerNorm.bias",
48
- "encoderLayer.1.multiHeadAttention.q.weight": "roberta.encoder.layer.1.attention.self.query.weight",
49
- "encoderLayer.1.multiHeadAttention.q.bias": "roberta.encoder.layer.1.attention.self.query.bias",
50
- "encoderLayer.1.multiHeadAttention.k.weight": "roberta.encoder.layer.1.attention.self.key.weight",
51
- "encoderLayer.1.multiHeadAttention.k.bias": "roberta.encoder.layer.1.attention.self.key.bias",
52
- "encoderLayer.1.multiHeadAttention.v.weight": "roberta.encoder.layer.1.attention.self.value.weight",
53
- "encoderLayer.1.multiHeadAttention.v.bias": "roberta.encoder.layer.1.attention.self.value.bias",
54
- "encoderLayer.1.multiHeadAttention.o.weight": "roberta.encoder.layer.1.attention.output.dense.weight",
55
- "encoderLayer.1.multiHeadAttention.o.bias": "roberta.encoder.layer.1.attention.output.dense.bias",
56
- "encoderLayer.1.attnLayerNorm.weight": "roberta.encoder.layer.1.attention.output.LayerNorm.weight",
57
- "encoderLayer.1.attnLayerNorm.bias": "roberta.encoder.layer.1.attention.output.LayerNorm.bias",
58
- "encoderLayer.1.feedForward.intermediateDense.weight": "roberta.encoder.layer.1.intermediate.dense.weight",
59
- "encoderLayer.1.feedForward.intermediateDense.bias": "roberta.encoder.layer.1.intermediate.dense.bias",
60
- "encoderLayer.1.feedForward.outputDense.weight": "roberta.encoder.layer.1.output.dense.weight",
61
- "encoderLayer.1.feedForward.outputDense.bias": "roberta.encoder.layer.1.output.dense.bias",
62
- "encoderLayer.1.ffnLayerNorm.weight": "roberta.encoder.layer.1.output.LayerNorm.weight",
63
- "encoderLayer.1.ffnLayerNorm.bias": "roberta.encoder.layer.1.output.LayerNorm.bias",
64
- "encoderLayer.2.multiHeadAttention.q.weight": "roberta.encoder.layer.2.attention.self.query.weight",
65
- "encoderLayer.2.multiHeadAttention.q.bias": "roberta.encoder.layer.2.attention.self.query.bias",
66
- "encoderLayer.2.multiHeadAttention.k.weight": "roberta.encoder.layer.2.attention.self.key.weight",
67
- "encoderLayer.2.multiHeadAttention.k.bias": "roberta.encoder.layer.2.attention.self.key.bias",
68
- "encoderLayer.2.multiHeadAttention.v.weight": "roberta.encoder.layer.2.attention.self.value.weight",
69
- "encoderLayer.2.multiHeadAttention.v.bias": "roberta.encoder.layer.2.attention.self.value.bias",
70
- "encoderLayer.2.multiHeadAttention.o.weight": "roberta.encoder.layer.2.attention.output.dense.weight",
71
- "encoderLayer.2.multiHeadAttention.o.bias": "roberta.encoder.layer.2.attention.output.dense.bias",
72
- "encoderLayer.2.attnLayerNorm.weight": "roberta.encoder.layer.2.attention.output.LayerNorm.weight",
73
- "encoderLayer.2.attnLayerNorm.bias": "roberta.encoder.layer.2.attention.output.LayerNorm.bias",
74
- "encoderLayer.2.feedForward.intermediateDense.weight": "roberta.encoder.layer.2.intermediate.dense.weight",
75
- "encoderLayer.2.feedForward.intermediateDense.bias": "roberta.encoder.layer.2.intermediate.dense.bias",
76
- "encoderLayer.2.feedForward.outputDense.weight": "roberta.encoder.layer.2.output.dense.weight",
77
- "encoderLayer.2.feedForward.outputDense.bias": "roberta.encoder.layer.2.output.dense.bias",
78
- "encoderLayer.2.ffnLayerNorm.weight": "roberta.encoder.layer.2.output.LayerNorm.weight",
79
- "encoderLayer.2.ffnLayerNorm.bias": "roberta.encoder.layer.2.output.LayerNorm.bias",
80
- "encoderLayer.3.multiHeadAttention.q.weight": "roberta.encoder.layer.3.attention.self.query.weight",
81
- "encoderLayer.3.multiHeadAttention.q.bias": "roberta.encoder.layer.3.attention.self.query.bias",
82
- "encoderLayer.3.multiHeadAttention.k.weight": "roberta.encoder.layer.3.attention.self.key.weight",
83
- "encoderLayer.3.multiHeadAttention.k.bias": "roberta.encoder.layer.3.attention.self.key.bias",
84
- "encoderLayer.3.multiHeadAttention.v.weight": "roberta.encoder.layer.3.attention.self.value.weight",
85
- "encoderLayer.3.multiHeadAttention.v.bias": "roberta.encoder.layer.3.attention.self.value.bias",
86
- "encoderLayer.3.multiHeadAttention.o.weight": "roberta.encoder.layer.3.attention.output.dense.weight",
87
- "encoderLayer.3.multiHeadAttention.o.bias": "roberta.encoder.layer.3.attention.output.dense.bias",
88
- "encoderLayer.3.attnLayerNorm.weight": "roberta.encoder.layer.3.attention.output.LayerNorm.weight",
89
- "encoderLayer.3.attnLayerNorm.bias": "roberta.encoder.layer.3.attention.output.LayerNorm.bias",
90
- "encoderLayer.3.feedForward.intermediateDense.weight": "roberta.encoder.layer.3.intermediate.dense.weight",
91
- "encoderLayer.3.feedForward.intermediateDense.bias": "roberta.encoder.layer.3.intermediate.dense.bias",
92
- "encoderLayer.3.feedForward.outputDense.weight": "roberta.encoder.layer.3.output.dense.weight",
93
- "encoderLayer.3.feedForward.outputDense.bias": "roberta.encoder.layer.3.output.dense.bias",
94
- "encoderLayer.3.ffnLayerNorm.weight": "roberta.encoder.layer.3.output.LayerNorm.weight",
95
- "encoderLayer.3.ffnLayerNorm.bias": "roberta.encoder.layer.3.output.LayerNorm.bias",
96
- "encoderLayer.4.multiHeadAttention.q.weight": "roberta.encoder.layer.4.attention.self.query.weight",
97
- "encoderLayer.4.multiHeadAttention.q.bias": "roberta.encoder.layer.4.attention.self.query.bias",
98
- "encoderLayer.4.multiHeadAttention.k.weight": "roberta.encoder.layer.4.attention.self.key.weight",
99
- "encoderLayer.4.multiHeadAttention.k.bias": "roberta.encoder.layer.4.attention.self.key.bias",
100
- "encoderLayer.4.multiHeadAttention.v.weight": "roberta.encoder.layer.4.attention.self.value.weight",
101
- "encoderLayer.4.multiHeadAttention.v.bias": "roberta.encoder.layer.4.attention.self.value.bias",
102
- "encoderLayer.4.multiHeadAttention.o.weight": "roberta.encoder.layer.4.attention.output.dense.weight",
103
- "encoderLayer.4.multiHeadAttention.o.bias": "roberta.encoder.layer.4.attention.output.dense.bias",
104
- "encoderLayer.4.attnLayerNorm.weight": "roberta.encoder.layer.4.attention.output.LayerNorm.weight",
105
- "encoderLayer.4.attnLayerNorm.bias": "roberta.encoder.layer.4.attention.output.LayerNorm.bias",
106
- "encoderLayer.4.feedForward.intermediateDense.weight": "roberta.encoder.layer.4.intermediate.dense.weight",
107
- "encoderLayer.4.feedForward.intermediateDense.bias": "roberta.encoder.layer.4.intermediate.dense.bias",
108
- "encoderLayer.4.feedForward.outputDense.weight": "roberta.encoder.layer.4.output.dense.weight",
109
- "encoderLayer.4.feedForward.outputDense.bias": "roberta.encoder.layer.4.output.dense.bias",
110
- "encoderLayer.4.ffnLayerNorm.weight": "roberta.encoder.layer.4.output.LayerNorm.weight",
111
- "encoderLayer.4.ffnLayerNorm.bias": "roberta.encoder.layer.4.output.LayerNorm.bias",
112
- "encoderLayer.5.multiHeadAttention.q.weight": "roberta.encoder.layer.5.attention.self.query.weight",
113
- "encoderLayer.5.multiHeadAttention.q.bias": "roberta.encoder.layer.5.attention.self.query.bias",
114
- "encoderLayer.5.multiHeadAttention.k.weight": "roberta.encoder.layer.5.attention.self.key.weight",
115
- "encoderLayer.5.multiHeadAttention.k.bias": "roberta.encoder.layer.5.attention.self.key.bias",
116
- "encoderLayer.5.multiHeadAttention.v.weight": "roberta.encoder.layer.5.attention.self.value.weight",
117
- "encoderLayer.5.multiHeadAttention.v.bias": "roberta.encoder.layer.5.attention.self.value.bias",
118
- "encoderLayer.5.multiHeadAttention.o.weight": "roberta.encoder.layer.5.attention.output.dense.weight",
119
- "encoderLayer.5.multiHeadAttention.o.bias": "roberta.encoder.layer.5.attention.output.dense.bias",
120
- "encoderLayer.5.attnLayerNorm.weight": "roberta.encoder.layer.5.attention.output.LayerNorm.weight",
121
- "encoderLayer.5.attnLayerNorm.bias": "roberta.encoder.layer.5.attention.output.LayerNorm.bias",
122
- "encoderLayer.5.feedForward.intermediateDense.weight": "roberta.encoder.layer.5.intermediate.dense.weight",
123
- "encoderLayer.5.feedForward.intermediateDense.bias": "roberta.encoder.layer.5.intermediate.dense.bias",
124
- "encoderLayer.5.feedForward.outputDense.weight": "roberta.encoder.layer.5.output.dense.weight",
125
- "encoderLayer.5.feedForward.outputDense.bias": "roberta.encoder.layer.5.output.dense.bias",
126
- "encoderLayer.5.ffnLayerNorm.weight": "roberta.encoder.layer.5.output.LayerNorm.weight",
127
- "encoderLayer.5.ffnLayerNorm.bias": "roberta.encoder.layer.5.output.LayerNorm.bias",
128
- "encoderLayer.6.multiHeadAttention.q.weight": "roberta.encoder.layer.6.attention.self.query.weight",
129
- "encoderLayer.6.multiHeadAttention.q.bias": "roberta.encoder.layer.6.attention.self.query.bias",
130
- "encoderLayer.6.multiHeadAttention.k.weight": "roberta.encoder.layer.6.attention.self.key.weight",
131
- "encoderLayer.6.multiHeadAttention.k.bias": "roberta.encoder.layer.6.attention.self.key.bias",
132
- "encoderLayer.6.multiHeadAttention.v.weight": "roberta.encoder.layer.6.attention.self.value.weight",
133
- "encoderLayer.6.multiHeadAttention.v.bias": "roberta.encoder.layer.6.attention.self.value.bias",
134
- "encoderLayer.6.multiHeadAttention.o.weight": "roberta.encoder.layer.6.attention.output.dense.weight",
135
- "encoderLayer.6.multiHeadAttention.o.bias": "roberta.encoder.layer.6.attention.output.dense.bias",
136
- "encoderLayer.6.attnLayerNorm.weight": "roberta.encoder.layer.6.attention.output.LayerNorm.weight",
137
- "encoderLayer.6.attnLayerNorm.bias": "roberta.encoder.layer.6.attention.output.LayerNorm.bias",
138
- "encoderLayer.6.feedForward.intermediateDense.weight": "roberta.encoder.layer.6.intermediate.dense.weight",
139
- "encoderLayer.6.feedForward.intermediateDense.bias": "roberta.encoder.layer.6.intermediate.dense.bias",
140
- "encoderLayer.6.feedForward.outputDense.weight": "roberta.encoder.layer.6.output.dense.weight",
141
- "encoderLayer.6.feedForward.outputDense.bias": "roberta.encoder.layer.6.output.dense.bias",
142
- "encoderLayer.6.ffnLayerNorm.weight": "roberta.encoder.layer.6.output.LayerNorm.weight",
143
- "encoderLayer.6.ffnLayerNorm.bias": "roberta.encoder.layer.6.output.LayerNorm.bias",
144
- "encoderLayer.7.multiHeadAttention.q.weight": "roberta.encoder.layer.7.attention.self.query.weight",
145
- "encoderLayer.7.multiHeadAttention.q.bias": "roberta.encoder.layer.7.attention.self.query.bias",
146
- "encoderLayer.7.multiHeadAttention.k.weight": "roberta.encoder.layer.7.attention.self.key.weight",
147
- "encoderLayer.7.multiHeadAttention.k.bias": "roberta.encoder.layer.7.attention.self.key.bias",
148
- "encoderLayer.7.multiHeadAttention.v.weight": "roberta.encoder.layer.7.attention.self.value.weight",
149
- "encoderLayer.7.multiHeadAttention.v.bias": "roberta.encoder.layer.7.attention.self.value.bias",
150
- "encoderLayer.7.multiHeadAttention.o.weight": "roberta.encoder.layer.7.attention.output.dense.weight",
151
- "encoderLayer.7.multiHeadAttention.o.bias": "roberta.encoder.layer.7.attention.output.dense.bias",
152
- "encoderLayer.7.attnLayerNorm.weight": "roberta.encoder.layer.7.attention.output.LayerNorm.weight",
153
- "encoderLayer.7.attnLayerNorm.bias": "roberta.encoder.layer.7.attention.output.LayerNorm.bias",
154
- "encoderLayer.7.feedForward.intermediateDense.weight": "roberta.encoder.layer.7.intermediate.dense.weight",
155
- "encoderLayer.7.feedForward.intermediateDense.bias": "roberta.encoder.layer.7.intermediate.dense.bias",
156
- "encoderLayer.7.feedForward.outputDense.weight": "roberta.encoder.layer.7.output.dense.weight",
157
- "encoderLayer.7.feedForward.outputDense.bias": "roberta.encoder.layer.7.output.dense.bias",
158
- "encoderLayer.7.ffnLayerNorm.weight": "roberta.encoder.layer.7.output.LayerNorm.weight",
159
- "encoderLayer.7.ffnLayerNorm.bias": "roberta.encoder.layer.7.output.LayerNorm.bias",
160
- "encoderLayer.8.multiHeadAttention.q.weight": "roberta.encoder.layer.8.attention.self.query.weight",
161
- "encoderLayer.8.multiHeadAttention.q.bias": "roberta.encoder.layer.8.attention.self.query.bias",
162
- "encoderLayer.8.multiHeadAttention.k.weight": "roberta.encoder.layer.8.attention.self.key.weight",
163
- "encoderLayer.8.multiHeadAttention.k.bias": "roberta.encoder.layer.8.attention.self.key.bias",
164
- "encoderLayer.8.multiHeadAttention.v.weight": "roberta.encoder.layer.8.attention.self.value.weight",
165
- "encoderLayer.8.multiHeadAttention.v.bias": "roberta.encoder.layer.8.attention.self.value.bias",
166
- "encoderLayer.8.multiHeadAttention.o.weight": "roberta.encoder.layer.8.attention.output.dense.weight",
167
- "encoderLayer.8.multiHeadAttention.o.bias": "roberta.encoder.layer.8.attention.output.dense.bias",
168
- "encoderLayer.8.attnLayerNorm.weight": "roberta.encoder.layer.8.attention.output.LayerNorm.weight",
169
- "encoderLayer.8.attnLayerNorm.bias": "roberta.encoder.layer.8.attention.output.LayerNorm.bias",
170
- "encoderLayer.8.feedForward.intermediateDense.weight": "roberta.encoder.layer.8.intermediate.dense.weight",
171
- "encoderLayer.8.feedForward.intermediateDense.bias": "roberta.encoder.layer.8.intermediate.dense.bias",
172
- "encoderLayer.8.feedForward.outputDense.weight": "roberta.encoder.layer.8.output.dense.weight",
173
- "encoderLayer.8.feedForward.outputDense.bias": "roberta.encoder.layer.8.output.dense.bias",
174
- "encoderLayer.8.ffnLayerNorm.weight": "roberta.encoder.layer.8.output.LayerNorm.weight",
175
- "encoderLayer.8.ffnLayerNorm.bias": "roberta.encoder.layer.8.output.LayerNorm.bias",
176
- "encoderLayer.9.multiHeadAttention.q.weight": "roberta.encoder.layer.9.attention.self.query.weight",
177
- "encoderLayer.9.multiHeadAttention.q.bias": "roberta.encoder.layer.9.attention.self.query.bias",
178
- "encoderLayer.9.multiHeadAttention.k.weight": "roberta.encoder.layer.9.attention.self.key.weight",
179
- "encoderLayer.9.multiHeadAttention.k.bias": "roberta.encoder.layer.9.attention.self.key.bias",
180
- "encoderLayer.9.multiHeadAttention.v.weight": "roberta.encoder.layer.9.attention.self.value.weight",
181
- "encoderLayer.9.multiHeadAttention.v.bias": "roberta.encoder.layer.9.attention.self.value.bias",
182
- "encoderLayer.9.multiHeadAttention.o.weight": "roberta.encoder.layer.9.attention.output.dense.weight",
183
- "encoderLayer.9.multiHeadAttention.o.bias": "roberta.encoder.layer.9.attention.output.dense.bias",
184
- "encoderLayer.9.attnLayerNorm.weight": "roberta.encoder.layer.9.attention.output.LayerNorm.weight",
185
- "encoderLayer.9.attnLayerNorm.bias": "roberta.encoder.layer.9.attention.output.LayerNorm.bias",
186
- "encoderLayer.9.feedForward.intermediateDense.weight": "roberta.encoder.layer.9.intermediate.dense.weight",
187
- "encoderLayer.9.feedForward.intermediateDense.bias": "roberta.encoder.layer.9.intermediate.dense.bias",
188
- "encoderLayer.9.feedForward.outputDense.weight": "roberta.encoder.layer.9.output.dense.weight",
189
- "encoderLayer.9.feedForward.outputDense.bias": "roberta.encoder.layer.9.output.dense.bias",
190
- "encoderLayer.9.ffnLayerNorm.weight": "roberta.encoder.layer.9.output.LayerNorm.weight",
191
- "encoderLayer.9.ffnLayerNorm.bias": "roberta.encoder.layer.9.output.LayerNorm.bias",
192
- "encoderLayer.10.multiHeadAttention.q.weight": "roberta.encoder.layer.10.attention.self.query.weight",
193
- "encoderLayer.10.multiHeadAttention.q.bias": "roberta.encoder.layer.10.attention.self.query.bias",
194
- "encoderLayer.10.multiHeadAttention.k.weight": "roberta.encoder.layer.10.attention.self.key.weight",
195
- "encoderLayer.10.multiHeadAttention.k.bias": "roberta.encoder.layer.10.attention.self.key.bias",
196
- "encoderLayer.10.multiHeadAttention.v.weight": "roberta.encoder.layer.10.attention.self.value.weight",
197
- "encoderLayer.10.multiHeadAttention.v.bias": "roberta.encoder.layer.10.attention.self.value.bias",
198
- "encoderLayer.10.multiHeadAttention.o.weight": "roberta.encoder.layer.10.attention.output.dense.weight",
199
- "encoderLayer.10.multiHeadAttention.o.bias": "roberta.encoder.layer.10.attention.output.dense.bias",
200
- "encoderLayer.10.attnLayerNorm.weight": "roberta.encoder.layer.10.attention.output.LayerNorm.weight",
201
- "encoderLayer.10.attnLayerNorm.bias": "roberta.encoder.layer.10.attention.output.LayerNorm.bias",
202
- "encoderLayer.10.feedForward.intermediateDense.weight": "roberta.encoder.layer.10.intermediate.dense.weight",
203
- "encoderLayer.10.feedForward.intermediateDense.bias": "roberta.encoder.layer.10.intermediate.dense.bias",
204
- "encoderLayer.10.feedForward.outputDense.weight": "roberta.encoder.layer.10.output.dense.weight",
205
- "encoderLayer.10.feedForward.outputDense.bias": "roberta.encoder.layer.10.output.dense.bias",
206
- "encoderLayer.10.ffnLayerNorm.weight": "roberta.encoder.layer.10.output.LayerNorm.weight",
207
- "encoderLayer.10.ffnLayerNorm.bias": "roberta.encoder.layer.10.output.LayerNorm.bias",
208
- "encoderLayer.11.multiHeadAttention.q.weight": "roberta.encoder.layer.11.attention.self.query.weight",
209
- "encoderLayer.11.multiHeadAttention.q.bias": "roberta.encoder.layer.11.attention.self.query.bias",
210
- "encoderLayer.11.multiHeadAttention.k.weight": "roberta.encoder.layer.11.attention.self.key.weight",
211
- "encoderLayer.11.multiHeadAttention.k.bias": "roberta.encoder.layer.11.attention.self.key.bias",
212
- "encoderLayer.11.multiHeadAttention.v.weight": "roberta.encoder.layer.11.attention.self.value.weight",
213
- "encoderLayer.11.multiHeadAttention.v.bias": "roberta.encoder.layer.11.attention.self.value.bias",
214
- "encoderLayer.11.multiHeadAttention.o.weight": "roberta.encoder.layer.11.attention.output.dense.weight",
215
- "encoderLayer.11.multiHeadAttention.o.bias": "roberta.encoder.layer.11.attention.output.dense.bias",
216
- "encoderLayer.11.attnLayerNorm.weight": "roberta.encoder.layer.11.attention.output.LayerNorm.weight",
217
- "encoderLayer.11.attnLayerNorm.bias": "roberta.encoder.layer.11.attention.output.LayerNorm.bias",
218
- "encoderLayer.11.feedForward.intermediateDense.weight": "roberta.encoder.layer.11.intermediate.dense.weight",
219
- "encoderLayer.11.feedForward.intermediateDense.bias": "roberta.encoder.layer.11.intermediate.dense.bias",
220
- "encoderLayer.11.feedForward.outputDense.weight": "roberta.encoder.layer.11.output.dense.weight",
221
- "encoderLayer.11.feedForward.outputDense.bias": "roberta.encoder.layer.11.output.dense.bias",
222
- "encoderLayer.11.ffnLayerNorm.weight": "roberta.encoder.layer.11.output.LayerNorm.weight",
223
- "encoderLayer.11.ffnLayerNorm.bias": "roberta.encoder.layer.11.output.LayerNorm.bias"
224
- }
225
- }
226
-
 
1
+ {
2
+ "attention_probs_dropout_prob": 0.1,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 2,
5
+ "hidden_act": "gelu",
6
+ "hidden_dropout_prob": 0.1,
7
+ "hidden_size": 768,
8
+ "initializer_range": 0.02,
9
+ "intermediate_size": 3072,
10
+ "layer_norm_eps": 1e-05,
11
+ "max_position_embeddings": 514,
12
+ "model_type": "roberta",
13
+ "num_attention_heads": 12,
14
+ "num_hidden_layers": 12,
15
+ "pad_token_id": 1,
16
+ "type_vocab_size": 1,
17
+ "vocab_size": 50265,
18
+ "custom_position_ids": "start_at_padding",
19
+ "mapping": {
20
+ "embeddings.word_embeddings.weight": "roberta.embeddings.word_embeddings.weight",
21
+ "embeddings.position_embeddings.weight": "roberta.embeddings.position_embeddings.weight",
22
+ "embeddings.segment_embeddings.weight": "roberta.embeddings.token_type_embeddings.weight",
23
+ "embeddings.layerNorm.weight": "roberta.embeddings.LayerNorm.weight",
24
+ "embeddings.layerNorm.bias": "roberta.embeddings.LayerNorm.bias",
25
+ "mlmDense.weight": "lm_head.dense.weight",
26
+ "mlmDense.bias": "lm_head.dense.bias",
27
+ "mlmLayerNorm.weight": "lm_head.layer_norm.weight",
28
+ "mlmLayerNorm.bias": "lm_head.layer_norm.bias",
29
+ "mlmBias": "lm_head.bias",
30
+ "mlmDecoder.weight": "lm_head.decoder.weight",
31
+ "mlmDecoder.bias": "lm_head.bias",
32
+ "encoderLayer.0.multiHeadAttention.q.weight": "roberta.encoder.layer.0.attention.self.query.weight",
33
+ "encoderLayer.0.multiHeadAttention.q.bias": "roberta.encoder.layer.0.attention.self.query.bias",
34
+ "encoderLayer.0.multiHeadAttention.k.weight": "roberta.encoder.layer.0.attention.self.key.weight",
35
+ "encoderLayer.0.multiHeadAttention.k.bias": "roberta.encoder.layer.0.attention.self.key.bias",
36
+ "encoderLayer.0.multiHeadAttention.v.weight": "roberta.encoder.layer.0.attention.self.value.weight",
37
+ "encoderLayer.0.multiHeadAttention.v.bias": "roberta.encoder.layer.0.attention.self.value.bias",
38
+ "encoderLayer.0.multiHeadAttention.o.weight": "roberta.encoder.layer.0.attention.output.dense.weight",
39
+ "encoderLayer.0.multiHeadAttention.o.bias": "roberta.encoder.layer.0.attention.output.dense.bias",
40
+ "encoderLayer.0.attnLayerNorm.weight": "roberta.encoder.layer.0.attention.output.LayerNorm.weight",
41
+ "encoderLayer.0.attnLayerNorm.bias": "roberta.encoder.layer.0.attention.output.LayerNorm.bias",
42
+ "encoderLayer.0.feedForward.intermediateDense.weight": "roberta.encoder.layer.0.intermediate.dense.weight",
43
+ "encoderLayer.0.feedForward.intermediateDense.bias": "roberta.encoder.layer.0.intermediate.dense.bias",
44
+ "encoderLayer.0.feedForward.outputDense.weight": "roberta.encoder.layer.0.output.dense.weight",
45
+ "encoderLayer.0.feedForward.outputDense.bias": "roberta.encoder.layer.0.output.dense.bias",
46
+ "encoderLayer.0.ffnLayerNorm.weight": "roberta.encoder.layer.0.output.LayerNorm.weight",
47
+ "encoderLayer.0.ffnLayerNorm.bias": "roberta.encoder.layer.0.output.LayerNorm.bias",
48
+ "encoderLayer.1.multiHeadAttention.q.weight": "roberta.encoder.layer.1.attention.self.query.weight",
49
+ "encoderLayer.1.multiHeadAttention.q.bias": "roberta.encoder.layer.1.attention.self.query.bias",
50
+ "encoderLayer.1.multiHeadAttention.k.weight": "roberta.encoder.layer.1.attention.self.key.weight",
51
+ "encoderLayer.1.multiHeadAttention.k.bias": "roberta.encoder.layer.1.attention.self.key.bias",
52
+ "encoderLayer.1.multiHeadAttention.v.weight": "roberta.encoder.layer.1.attention.self.value.weight",
53
+ "encoderLayer.1.multiHeadAttention.v.bias": "roberta.encoder.layer.1.attention.self.value.bias",
54
+ "encoderLayer.1.multiHeadAttention.o.weight": "roberta.encoder.layer.1.attention.output.dense.weight",
55
+ "encoderLayer.1.multiHeadAttention.o.bias": "roberta.encoder.layer.1.attention.output.dense.bias",
56
+ "encoderLayer.1.attnLayerNorm.weight": "roberta.encoder.layer.1.attention.output.LayerNorm.weight",
57
+ "encoderLayer.1.attnLayerNorm.bias": "roberta.encoder.layer.1.attention.output.LayerNorm.bias",
58
+ "encoderLayer.1.feedForward.intermediateDense.weight": "roberta.encoder.layer.1.intermediate.dense.weight",
59
+ "encoderLayer.1.feedForward.intermediateDense.bias": "roberta.encoder.layer.1.intermediate.dense.bias",
60
+ "encoderLayer.1.feedForward.outputDense.weight": "roberta.encoder.layer.1.output.dense.weight",
61
+ "encoderLayer.1.feedForward.outputDense.bias": "roberta.encoder.layer.1.output.dense.bias",
62
+ "encoderLayer.1.ffnLayerNorm.weight": "roberta.encoder.layer.1.output.LayerNorm.weight",
63
+ "encoderLayer.1.ffnLayerNorm.bias": "roberta.encoder.layer.1.output.LayerNorm.bias",
64
+ "encoderLayer.2.multiHeadAttention.q.weight": "roberta.encoder.layer.2.attention.self.query.weight",
65
+ "encoderLayer.2.multiHeadAttention.q.bias": "roberta.encoder.layer.2.attention.self.query.bias",
66
+ "encoderLayer.2.multiHeadAttention.k.weight": "roberta.encoder.layer.2.attention.self.key.weight",
67
+ "encoderLayer.2.multiHeadAttention.k.bias": "roberta.encoder.layer.2.attention.self.key.bias",
68
+ "encoderLayer.2.multiHeadAttention.v.weight": "roberta.encoder.layer.2.attention.self.value.weight",
69
+ "encoderLayer.2.multiHeadAttention.v.bias": "roberta.encoder.layer.2.attention.self.value.bias",
70
+ "encoderLayer.2.multiHeadAttention.o.weight": "roberta.encoder.layer.2.attention.output.dense.weight",
71
+ "encoderLayer.2.multiHeadAttention.o.bias": "roberta.encoder.layer.2.attention.output.dense.bias",
72
+ "encoderLayer.2.attnLayerNorm.weight": "roberta.encoder.layer.2.attention.output.LayerNorm.weight",
73
+ "encoderLayer.2.attnLayerNorm.bias": "roberta.encoder.layer.2.attention.output.LayerNorm.bias",
74
+ "encoderLayer.2.feedForward.intermediateDense.weight": "roberta.encoder.layer.2.intermediate.dense.weight",
75
+ "encoderLayer.2.feedForward.intermediateDense.bias": "roberta.encoder.layer.2.intermediate.dense.bias",
76
+ "encoderLayer.2.feedForward.outputDense.weight": "roberta.encoder.layer.2.output.dense.weight",
77
+ "encoderLayer.2.feedForward.outputDense.bias": "roberta.encoder.layer.2.output.dense.bias",
78
+ "encoderLayer.2.ffnLayerNorm.weight": "roberta.encoder.layer.2.output.LayerNorm.weight",
79
+ "encoderLayer.2.ffnLayerNorm.bias": "roberta.encoder.layer.2.output.LayerNorm.bias",
80
+ "encoderLayer.3.multiHeadAttention.q.weight": "roberta.encoder.layer.3.attention.self.query.weight",
81
+ "encoderLayer.3.multiHeadAttention.q.bias": "roberta.encoder.layer.3.attention.self.query.bias",
82
+ "encoderLayer.3.multiHeadAttention.k.weight": "roberta.encoder.layer.3.attention.self.key.weight",
83
+ "encoderLayer.3.multiHeadAttention.k.bias": "roberta.encoder.layer.3.attention.self.key.bias",
84
+ "encoderLayer.3.multiHeadAttention.v.weight": "roberta.encoder.layer.3.attention.self.value.weight",
85
+ "encoderLayer.3.multiHeadAttention.v.bias": "roberta.encoder.layer.3.attention.self.value.bias",
86
+ "encoderLayer.3.multiHeadAttention.o.weight": "roberta.encoder.layer.3.attention.output.dense.weight",
87
+ "encoderLayer.3.multiHeadAttention.o.bias": "roberta.encoder.layer.3.attention.output.dense.bias",
88
+ "encoderLayer.3.attnLayerNorm.weight": "roberta.encoder.layer.3.attention.output.LayerNorm.weight",
89
+ "encoderLayer.3.attnLayerNorm.bias": "roberta.encoder.layer.3.attention.output.LayerNorm.bias",
90
+ "encoderLayer.3.feedForward.intermediateDense.weight": "roberta.encoder.layer.3.intermediate.dense.weight",
91
+ "encoderLayer.3.feedForward.intermediateDense.bias": "roberta.encoder.layer.3.intermediate.dense.bias",
92
+ "encoderLayer.3.feedForward.outputDense.weight": "roberta.encoder.layer.3.output.dense.weight",
93
+ "encoderLayer.3.feedForward.outputDense.bias": "roberta.encoder.layer.3.output.dense.bias",
94
+ "encoderLayer.3.ffnLayerNorm.weight": "roberta.encoder.layer.3.output.LayerNorm.weight",
95
+ "encoderLayer.3.ffnLayerNorm.bias": "roberta.encoder.layer.3.output.LayerNorm.bias",
96
+ "encoderLayer.4.multiHeadAttention.q.weight": "roberta.encoder.layer.4.attention.self.query.weight",
97
+ "encoderLayer.4.multiHeadAttention.q.bias": "roberta.encoder.layer.4.attention.self.query.bias",
98
+ "encoderLayer.4.multiHeadAttention.k.weight": "roberta.encoder.layer.4.attention.self.key.weight",
99
+ "encoderLayer.4.multiHeadAttention.k.bias": "roberta.encoder.layer.4.attention.self.key.bias",
100
+ "encoderLayer.4.multiHeadAttention.v.weight": "roberta.encoder.layer.4.attention.self.value.weight",
101
+ "encoderLayer.4.multiHeadAttention.v.bias": "roberta.encoder.layer.4.attention.self.value.bias",
102
+ "encoderLayer.4.multiHeadAttention.o.weight": "roberta.encoder.layer.4.attention.output.dense.weight",
103
+ "encoderLayer.4.multiHeadAttention.o.bias": "roberta.encoder.layer.4.attention.output.dense.bias",
104
+ "encoderLayer.4.attnLayerNorm.weight": "roberta.encoder.layer.4.attention.output.LayerNorm.weight",
105
+ "encoderLayer.4.attnLayerNorm.bias": "roberta.encoder.layer.4.attention.output.LayerNorm.bias",
106
+ "encoderLayer.4.feedForward.intermediateDense.weight": "roberta.encoder.layer.4.intermediate.dense.weight",
107
+ "encoderLayer.4.feedForward.intermediateDense.bias": "roberta.encoder.layer.4.intermediate.dense.bias",
108
+ "encoderLayer.4.feedForward.outputDense.weight": "roberta.encoder.layer.4.output.dense.weight",
109
+ "encoderLayer.4.feedForward.outputDense.bias": "roberta.encoder.layer.4.output.dense.bias",
110
+ "encoderLayer.4.ffnLayerNorm.weight": "roberta.encoder.layer.4.output.LayerNorm.weight",
111
+ "encoderLayer.4.ffnLayerNorm.bias": "roberta.encoder.layer.4.output.LayerNorm.bias",
112
+ "encoderLayer.5.multiHeadAttention.q.weight": "roberta.encoder.layer.5.attention.self.query.weight",
113
+ "encoderLayer.5.multiHeadAttention.q.bias": "roberta.encoder.layer.5.attention.self.query.bias",
114
+ "encoderLayer.5.multiHeadAttention.k.weight": "roberta.encoder.layer.5.attention.self.key.weight",
115
+ "encoderLayer.5.multiHeadAttention.k.bias": "roberta.encoder.layer.5.attention.self.key.bias",
116
+ "encoderLayer.5.multiHeadAttention.v.weight": "roberta.encoder.layer.5.attention.self.value.weight",
117
+ "encoderLayer.5.multiHeadAttention.v.bias": "roberta.encoder.layer.5.attention.self.value.bias",
118
+ "encoderLayer.5.multiHeadAttention.o.weight": "roberta.encoder.layer.5.attention.output.dense.weight",
119
+ "encoderLayer.5.multiHeadAttention.o.bias": "roberta.encoder.layer.5.attention.output.dense.bias",
120
+ "encoderLayer.5.attnLayerNorm.weight": "roberta.encoder.layer.5.attention.output.LayerNorm.weight",
121
+ "encoderLayer.5.attnLayerNorm.bias": "roberta.encoder.layer.5.attention.output.LayerNorm.bias",
122
+ "encoderLayer.5.feedForward.intermediateDense.weight": "roberta.encoder.layer.5.intermediate.dense.weight",
123
+ "encoderLayer.5.feedForward.intermediateDense.bias": "roberta.encoder.layer.5.intermediate.dense.bias",
124
+ "encoderLayer.5.feedForward.outputDense.weight": "roberta.encoder.layer.5.output.dense.weight",
125
+ "encoderLayer.5.feedForward.outputDense.bias": "roberta.encoder.layer.5.output.dense.bias",
126
+ "encoderLayer.5.ffnLayerNorm.weight": "roberta.encoder.layer.5.output.LayerNorm.weight",
127
+ "encoderLayer.5.ffnLayerNorm.bias": "roberta.encoder.layer.5.output.LayerNorm.bias",
128
+ "encoderLayer.6.multiHeadAttention.q.weight": "roberta.encoder.layer.6.attention.self.query.weight",
129
+ "encoderLayer.6.multiHeadAttention.q.bias": "roberta.encoder.layer.6.attention.self.query.bias",
130
+ "encoderLayer.6.multiHeadAttention.k.weight": "roberta.encoder.layer.6.attention.self.key.weight",
131
+ "encoderLayer.6.multiHeadAttention.k.bias": "roberta.encoder.layer.6.attention.self.key.bias",
132
+ "encoderLayer.6.multiHeadAttention.v.weight": "roberta.encoder.layer.6.attention.self.value.weight",
133
+ "encoderLayer.6.multiHeadAttention.v.bias": "roberta.encoder.layer.6.attention.self.value.bias",
134
+ "encoderLayer.6.multiHeadAttention.o.weight": "roberta.encoder.layer.6.attention.output.dense.weight",
135
+ "encoderLayer.6.multiHeadAttention.o.bias": "roberta.encoder.layer.6.attention.output.dense.bias",
136
+ "encoderLayer.6.attnLayerNorm.weight": "roberta.encoder.layer.6.attention.output.LayerNorm.weight",
137
+ "encoderLayer.6.attnLayerNorm.bias": "roberta.encoder.layer.6.attention.output.LayerNorm.bias",
138
+ "encoderLayer.6.feedForward.intermediateDense.weight": "roberta.encoder.layer.6.intermediate.dense.weight",
139
+ "encoderLayer.6.feedForward.intermediateDense.bias": "roberta.encoder.layer.6.intermediate.dense.bias",
140
+ "encoderLayer.6.feedForward.outputDense.weight": "roberta.encoder.layer.6.output.dense.weight",
141
+ "encoderLayer.6.feedForward.outputDense.bias": "roberta.encoder.layer.6.output.dense.bias",
142
+ "encoderLayer.6.ffnLayerNorm.weight": "roberta.encoder.layer.6.output.LayerNorm.weight",
143
+ "encoderLayer.6.ffnLayerNorm.bias": "roberta.encoder.layer.6.output.LayerNorm.bias",
144
+ "encoderLayer.7.multiHeadAttention.q.weight": "roberta.encoder.layer.7.attention.self.query.weight",
145
+ "encoderLayer.7.multiHeadAttention.q.bias": "roberta.encoder.layer.7.attention.self.query.bias",
146
+ "encoderLayer.7.multiHeadAttention.k.weight": "roberta.encoder.layer.7.attention.self.key.weight",
147
+ "encoderLayer.7.multiHeadAttention.k.bias": "roberta.encoder.layer.7.attention.self.key.bias",
148
+ "encoderLayer.7.multiHeadAttention.v.weight": "roberta.encoder.layer.7.attention.self.value.weight",
149
+ "encoderLayer.7.multiHeadAttention.v.bias": "roberta.encoder.layer.7.attention.self.value.bias",
150
+ "encoderLayer.7.multiHeadAttention.o.weight": "roberta.encoder.layer.7.attention.output.dense.weight",
151
+ "encoderLayer.7.multiHeadAttention.o.bias": "roberta.encoder.layer.7.attention.output.dense.bias",
152
+ "encoderLayer.7.attnLayerNorm.weight": "roberta.encoder.layer.7.attention.output.LayerNorm.weight",
153
+ "encoderLayer.7.attnLayerNorm.bias": "roberta.encoder.layer.7.attention.output.LayerNorm.bias",
154
+ "encoderLayer.7.feedForward.intermediateDense.weight": "roberta.encoder.layer.7.intermediate.dense.weight",
155
+ "encoderLayer.7.feedForward.intermediateDense.bias": "roberta.encoder.layer.7.intermediate.dense.bias",
156
+ "encoderLayer.7.feedForward.outputDense.weight": "roberta.encoder.layer.7.output.dense.weight",
157
+ "encoderLayer.7.feedForward.outputDense.bias": "roberta.encoder.layer.7.output.dense.bias",
158
+ "encoderLayer.7.ffnLayerNorm.weight": "roberta.encoder.layer.7.output.LayerNorm.weight",
159
+ "encoderLayer.7.ffnLayerNorm.bias": "roberta.encoder.layer.7.output.LayerNorm.bias",
160
+ "encoderLayer.8.multiHeadAttention.q.weight": "roberta.encoder.layer.8.attention.self.query.weight",
161
+ "encoderLayer.8.multiHeadAttention.q.bias": "roberta.encoder.layer.8.attention.self.query.bias",
162
+ "encoderLayer.8.multiHeadAttention.k.weight": "roberta.encoder.layer.8.attention.self.key.weight",
163
+ "encoderLayer.8.multiHeadAttention.k.bias": "roberta.encoder.layer.8.attention.self.key.bias",
164
+ "encoderLayer.8.multiHeadAttention.v.weight": "roberta.encoder.layer.8.attention.self.value.weight",
165
+ "encoderLayer.8.multiHeadAttention.v.bias": "roberta.encoder.layer.8.attention.self.value.bias",
166
+ "encoderLayer.8.multiHeadAttention.o.weight": "roberta.encoder.layer.8.attention.output.dense.weight",
167
+ "encoderLayer.8.multiHeadAttention.o.bias": "roberta.encoder.layer.8.attention.output.dense.bias",
168
+ "encoderLayer.8.attnLayerNorm.weight": "roberta.encoder.layer.8.attention.output.LayerNorm.weight",
169
+ "encoderLayer.8.attnLayerNorm.bias": "roberta.encoder.layer.8.attention.output.LayerNorm.bias",
170
+ "encoderLayer.8.feedForward.intermediateDense.weight": "roberta.encoder.layer.8.intermediate.dense.weight",
171
+ "encoderLayer.8.feedForward.intermediateDense.bias": "roberta.encoder.layer.8.intermediate.dense.bias",
172
+ "encoderLayer.8.feedForward.outputDense.weight": "roberta.encoder.layer.8.output.dense.weight",
173
+ "encoderLayer.8.feedForward.outputDense.bias": "roberta.encoder.layer.8.output.dense.bias",
174
+ "encoderLayer.8.ffnLayerNorm.weight": "roberta.encoder.layer.8.output.LayerNorm.weight",
175
+ "encoderLayer.8.ffnLayerNorm.bias": "roberta.encoder.layer.8.output.LayerNorm.bias",
176
+ "encoderLayer.9.multiHeadAttention.q.weight": "roberta.encoder.layer.9.attention.self.query.weight",
177
+ "encoderLayer.9.multiHeadAttention.q.bias": "roberta.encoder.layer.9.attention.self.query.bias",
178
+ "encoderLayer.9.multiHeadAttention.k.weight": "roberta.encoder.layer.9.attention.self.key.weight",
179
+ "encoderLayer.9.multiHeadAttention.k.bias": "roberta.encoder.layer.9.attention.self.key.bias",
180
+ "encoderLayer.9.multiHeadAttention.v.weight": "roberta.encoder.layer.9.attention.self.value.weight",
181
+ "encoderLayer.9.multiHeadAttention.v.bias": "roberta.encoder.layer.9.attention.self.value.bias",
182
+ "encoderLayer.9.multiHeadAttention.o.weight": "roberta.encoder.layer.9.attention.output.dense.weight",
183
+ "encoderLayer.9.multiHeadAttention.o.bias": "roberta.encoder.layer.9.attention.output.dense.bias",
184
+ "encoderLayer.9.attnLayerNorm.weight": "roberta.encoder.layer.9.attention.output.LayerNorm.weight",
185
+ "encoderLayer.9.attnLayerNorm.bias": "roberta.encoder.layer.9.attention.output.LayerNorm.bias",
186
+ "encoderLayer.9.feedForward.intermediateDense.weight": "roberta.encoder.layer.9.intermediate.dense.weight",
187
+ "encoderLayer.9.feedForward.intermediateDense.bias": "roberta.encoder.layer.9.intermediate.dense.bias",
188
+ "encoderLayer.9.feedForward.outputDense.weight": "roberta.encoder.layer.9.output.dense.weight",
189
+ "encoderLayer.9.feedForward.outputDense.bias": "roberta.encoder.layer.9.output.dense.bias",
190
+ "encoderLayer.9.ffnLayerNorm.weight": "roberta.encoder.layer.9.output.LayerNorm.weight",
191
+ "encoderLayer.9.ffnLayerNorm.bias": "roberta.encoder.layer.9.output.LayerNorm.bias",
192
+ "encoderLayer.10.multiHeadAttention.q.weight": "roberta.encoder.layer.10.attention.self.query.weight",
193
+ "encoderLayer.10.multiHeadAttention.q.bias": "roberta.encoder.layer.10.attention.self.query.bias",
194
+ "encoderLayer.10.multiHeadAttention.k.weight": "roberta.encoder.layer.10.attention.self.key.weight",
195
+ "encoderLayer.10.multiHeadAttention.k.bias": "roberta.encoder.layer.10.attention.self.key.bias",
196
+ "encoderLayer.10.multiHeadAttention.v.weight": "roberta.encoder.layer.10.attention.self.value.weight",
197
+ "encoderLayer.10.multiHeadAttention.v.bias": "roberta.encoder.layer.10.attention.self.value.bias",
198
+ "encoderLayer.10.multiHeadAttention.o.weight": "roberta.encoder.layer.10.attention.output.dense.weight",
199
+ "encoderLayer.10.multiHeadAttention.o.bias": "roberta.encoder.layer.10.attention.output.dense.bias",
200
+ "encoderLayer.10.attnLayerNorm.weight": "roberta.encoder.layer.10.attention.output.LayerNorm.weight",
201
+ "encoderLayer.10.attnLayerNorm.bias": "roberta.encoder.layer.10.attention.output.LayerNorm.bias",
202
+ "encoderLayer.10.feedForward.intermediateDense.weight": "roberta.encoder.layer.10.intermediate.dense.weight",
203
+ "encoderLayer.10.feedForward.intermediateDense.bias": "roberta.encoder.layer.10.intermediate.dense.bias",
204
+ "encoderLayer.10.feedForward.outputDense.weight": "roberta.encoder.layer.10.output.dense.weight",
205
+ "encoderLayer.10.feedForward.outputDense.bias": "roberta.encoder.layer.10.output.dense.bias",
206
+ "encoderLayer.10.ffnLayerNorm.weight": "roberta.encoder.layer.10.output.LayerNorm.weight",
207
+ "encoderLayer.10.ffnLayerNorm.bias": "roberta.encoder.layer.10.output.LayerNorm.bias",
208
+ "encoderLayer.11.multiHeadAttention.q.weight": "roberta.encoder.layer.11.attention.self.query.weight",
209
+ "encoderLayer.11.multiHeadAttention.q.bias": "roberta.encoder.layer.11.attention.self.query.bias",
210
+ "encoderLayer.11.multiHeadAttention.k.weight": "roberta.encoder.layer.11.attention.self.key.weight",
211
+ "encoderLayer.11.multiHeadAttention.k.bias": "roberta.encoder.layer.11.attention.self.key.bias",
212
+ "encoderLayer.11.multiHeadAttention.v.weight": "roberta.encoder.layer.11.attention.self.value.weight",
213
+ "encoderLayer.11.multiHeadAttention.v.bias": "roberta.encoder.layer.11.attention.self.value.bias",
214
+ "encoderLayer.11.multiHeadAttention.o.weight": "roberta.encoder.layer.11.attention.output.dense.weight",
215
+ "encoderLayer.11.multiHeadAttention.o.bias": "roberta.encoder.layer.11.attention.output.dense.bias",
216
+ "encoderLayer.11.attnLayerNorm.weight": "roberta.encoder.layer.11.attention.output.LayerNorm.weight",
217
+ "encoderLayer.11.attnLayerNorm.bias": "roberta.encoder.layer.11.attention.output.LayerNorm.bias",
218
+ "encoderLayer.11.feedForward.intermediateDense.weight": "roberta.encoder.layer.11.intermediate.dense.weight",
219
+ "encoderLayer.11.feedForward.intermediateDense.bias": "roberta.encoder.layer.11.intermediate.dense.bias",
220
+ "encoderLayer.11.feedForward.outputDense.weight": "roberta.encoder.layer.11.output.dense.weight",
221
+ "encoderLayer.11.feedForward.outputDense.bias": "roberta.encoder.layer.11.output.dense.bias",
222
+ "encoderLayer.11.ffnLayerNorm.weight": "roberta.encoder.layer.11.output.LayerNorm.weight",
223
+ "encoderLayer.11.ffnLayerNorm.bias": "roberta.encoder.layer.11.output.LayerNorm.bias"
224
+ }
225
+ }
 
IDEA-CCNL/Erlangshen-DeBERTa-v2-320M-Chinese/bert4torch_config.json CHANGED
@@ -1,25 +1,25 @@
1
- {
2
- "model": "deberta_v2",
3
- "attention_probs_dropout_prob": 0.1,
4
- "hidden_act": "gelu",
5
- "hidden_dropout_prob": 0.1,
6
- "hidden_size": 1024,
7
- "initializer_range": 0.02,
8
- "intermediate_size": 4096,
9
- "max_position_embeddings": 512,
10
- "relative_attention": true,
11
- "position_buckets": 256,
12
- "norm_rel_ebd": "layer_norm",
13
- "share_att_key": true,
14
- "pos_att_type": "c2p|p2c",
15
- "conv_kernel_size": 3,
16
- "conv_act": "gelu",
17
- "layer_norm_eps": 1e-7,
18
- "max_relative_positions": -1,
19
- "position_biased_input": false,
20
- "num_attention_heads": 16,
21
- "num_hidden_layers": 24,
22
- "type_vocab_size": 0,
23
- "vocab_size": 12800,
24
- "num_labels": 119
25
  }
 
1
+ {
2
+ "model": "deberta_v2",
3
+ "attention_probs_dropout_prob": 0.1,
4
+ "hidden_act": "gelu",
5
+ "hidden_dropout_prob": 0.1,
6
+ "hidden_size": 1024,
7
+ "initializer_range": 0.02,
8
+ "intermediate_size": 4096,
9
+ "max_position_embeddings": 512,
10
+ "relative_attention": true,
11
+ "position_buckets": 256,
12
+ "norm_rel_ebd": "layer_norm",
13
+ "share_att_key": true,
14
+ "pos_att_type": "c2p|p2c",
15
+ "conv_kernel_size": 3,
16
+ "conv_act": "gelu",
17
+ "layer_norm_eps": 1e-07,
18
+ "max_relative_positions": -1,
19
+ "position_biased_input": false,
20
+ "num_attention_heads": 16,
21
+ "num_hidden_layers": 24,
22
+ "type_vocab_size": 0,
23
+ "vocab_size": 12800,
24
+ "num_labels": 119
25
  }
IDEA-CCNL/Erlangshen-DeBERTa-v2-710M-Chinese/bert4torch_config.json CHANGED
@@ -1,32 +1,32 @@
1
- {
2
- "model": "deberta_v2",
3
- "attention_probs_dropout_prob": 0.1,
4
- "attention_head_size": 64,
5
- "hidden_act": "gelu",
6
- "hidden_dropout_prob": 0.1,
7
- "hidden_size": 1536,
8
- "initializer_range": 0.02,
9
- "intermediate_size": 6144,
10
- "max_position_embeddings": 512,
11
- "relative_attention": true,
12
- "position_buckets": 256,
13
- "norm_rel_ebd": "layer_norm",
14
- "share_att_key": true,
15
- "pos_att_type": [
16
- "p2c",
17
- "c2p"
18
- ],
19
- "conv_kernel_size": 3,
20
- "pooler_dropout": 0,
21
- "pooler_hidden_act": "gelu",
22
- "pooler_hidden_size": 1536,
23
- "conv_act": "gelu",
24
- "layer_norm_eps": 1e-7,
25
- "max_relative_positions": -1,
26
- "position_biased_input": false,
27
- "num_attention_heads": 24,
28
- "num_hidden_layers": 24,
29
- "type_vocab_size": 0,
30
- "num_labels": 119,
31
- "vocab_size": 12800
32
  }
 
1
+ {
2
+ "model": "deberta_v2",
3
+ "attention_probs_dropout_prob": 0.1,
4
+ "attention_head_size": 64,
5
+ "hidden_act": "gelu",
6
+ "hidden_dropout_prob": 0.1,
7
+ "hidden_size": 1536,
8
+ "initializer_range": 0.02,
9
+ "intermediate_size": 6144,
10
+ "max_position_embeddings": 512,
11
+ "relative_attention": true,
12
+ "position_buckets": 256,
13
+ "norm_rel_ebd": "layer_norm",
14
+ "share_att_key": true,
15
+ "pos_att_type": [
16
+ "p2c",
17
+ "c2p"
18
+ ],
19
+ "conv_kernel_size": 3,
20
+ "pooler_dropout": 0,
21
+ "pooler_hidden_act": "gelu",
22
+ "pooler_hidden_size": 1536,
23
+ "conv_act": "gelu",
24
+ "layer_norm_eps": 1e-07,
25
+ "max_relative_positions": -1,
26
+ "position_biased_input": false,
27
+ "num_attention_heads": 24,
28
+ "num_hidden_layers": 24,
29
+ "type_vocab_size": 0,
30
+ "num_labels": 119,
31
+ "vocab_size": 12800
32
  }
IDEA-CCNL/Erlangshen-DeBERTa-v2-97M-Chinese/bert4torch_config.json CHANGED
@@ -1,24 +1,24 @@
1
- {
2
- "model": "deberta_v2",
3
- "attention_probs_dropout_prob": 0.1,
4
- "hidden_act": "gelu",
5
- "hidden_dropout_prob": 0.1,
6
- "hidden_size": 768,
7
- "initializer_range": 0.02,
8
- "intermediate_size": 3072,
9
- "max_position_embeddings": 512,
10
- "relative_attention": true,
11
- "position_buckets": 256,
12
- "norm_rel_ebd": "layer_norm",
13
- "share_att_key": true,
14
- "pos_att_type": "c2p|p2c",
15
- "conv_kernel_size": 3,
16
- "conv_act": "gelu",
17
- "layer_norm_eps": 1e-7,
18
- "max_relative_positions": -1,
19
- "position_biased_input": false,
20
- "num_attention_heads": 12,
21
- "num_hidden_layers": 12,
22
- "type_vocab_size": 0,
23
- "vocab_size": 12800
24
  }
 
1
+ {
2
+ "model": "deberta_v2",
3
+ "attention_probs_dropout_prob": 0.1,
4
+ "hidden_act": "gelu",
5
+ "hidden_dropout_prob": 0.1,
6
+ "hidden_size": 768,
7
+ "initializer_range": 0.02,
8
+ "intermediate_size": 3072,
9
+ "max_position_embeddings": 512,
10
+ "relative_attention": true,
11
+ "position_buckets": 256,
12
+ "norm_rel_ebd": "layer_norm",
13
+ "share_att_key": true,
14
+ "pos_att_type": "c2p|p2c",
15
+ "conv_kernel_size": 3,
16
+ "conv_act": "gelu",
17
+ "layer_norm_eps": 1e-07,
18
+ "max_relative_positions": -1,
19
+ "position_biased_input": false,
20
+ "num_attention_heads": 12,
21
+ "num_hidden_layers": 12,
22
+ "type_vocab_size": 0,
23
+ "vocab_size": 12800
24
  }
IDEA-CCNL/Ziya-LLaMA-13B-v1.1/bert4torch_config.json CHANGED
@@ -1,22 +1,28 @@
1
- {
2
- "model": "llama",
3
- "template": "ziya",
4
- "bos_token_id": 1,
5
- "eos_token_id": 2,
6
- "hidden_act": "silu",
7
- "hidden_size": 5120,
8
- "initializer_range": 0.02,
9
- "intermediate_size": 13824,
10
- "max_position_embeddings": 2048,
11
- "num_attention_heads": 40,
12
- "num_hidden_layers": 40,
13
- "pad_token_id": 0,
14
- "layer_norm_eps": 1e-06,
15
- "tie_word_embeddings": false,
16
- "use_cache": true,
17
- "vocab_size": 39424,
18
- "segment_vocab_size": 0,
19
- "skip_init": true,
20
- "rope_rank": "updown",
21
- "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "max_length": 2048, "eos_token_id": 2}
 
 
 
 
 
 
22
  }
 
1
+ {
2
+ "model": "llama",
3
+ "template": "ziya",
4
+ "bos_token_id": 1,
5
+ "eos_token_id": 2,
6
+ "hidden_act": "silu",
7
+ "hidden_size": 5120,
8
+ "initializer_range": 0.02,
9
+ "intermediate_size": 13824,
10
+ "max_position_embeddings": 2048,
11
+ "num_attention_heads": 40,
12
+ "num_hidden_layers": 40,
13
+ "pad_token_id": 0,
14
+ "layer_norm_eps": 1e-06,
15
+ "tie_word_embeddings": false,
16
+ "use_cache": true,
17
+ "vocab_size": 39424,
18
+ "segment_vocab_size": 0,
19
+ "skip_init": true,
20
+ "rope_rank": "updown",
21
+ "generation_config": {
22
+ "tokenizer_config": {
23
+ "skip_special_tokens": true
24
+ },
25
+ "max_length": 2048,
26
+ "eos_token_id": 2
27
+ }
28
  }
IDEA-CCNL/Ziya-LLaMA-13B-v1/bert4torch_config.json CHANGED
@@ -1,22 +1,28 @@
1
- {
2
- "model": "llama",
3
- "template": "ziya",
4
- "bos_token_id": 1,
5
- "eos_token_id": 2,
6
- "hidden_act": "silu",
7
- "hidden_size": 5120,
8
- "initializer_range": 0.02,
9
- "intermediate_size": 13824,
10
- "max_position_embeddings": 2048,
11
- "num_attention_heads": 40,
12
- "num_hidden_layers": 40,
13
- "pad_token_id": 0,
14
- "layer_norm_eps": 1e-06,
15
- "tie_word_embeddings": false,
16
- "use_cache": true,
17
- "vocab_size": 39424,
18
- "segment_vocab_size": 0,
19
- "skip_init": true,
20
- "rope_rank": "updown",
21
- "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "max_length": 2048, "eos_token_id": 2}
 
 
 
 
 
 
22
  }
 
1
+ {
2
+ "model": "llama",
3
+ "template": "ziya",
4
+ "bos_token_id": 1,
5
+ "eos_token_id": 2,
6
+ "hidden_act": "silu",
7
+ "hidden_size": 5120,
8
+ "initializer_range": 0.02,
9
+ "intermediate_size": 13824,
10
+ "max_position_embeddings": 2048,
11
+ "num_attention_heads": 40,
12
+ "num_hidden_layers": 40,
13
+ "pad_token_id": 0,
14
+ "layer_norm_eps": 1e-06,
15
+ "tie_word_embeddings": false,
16
+ "use_cache": true,
17
+ "vocab_size": 39424,
18
+ "segment_vocab_size": 0,
19
+ "skip_init": true,
20
+ "rope_rank": "updown",
21
+ "generation_config": {
22
+ "tokenizer_config": {
23
+ "skip_special_tokens": true
24
+ },
25
+ "max_length": 2048,
26
+ "eos_token_id": 2
27
+ }
28
  }
OpenGVLab/InternVL2_5-1B/bert4torch_config.json CHANGED
@@ -1,83 +1,90 @@
1
  {
2
- "downsample_ratio": 0.5,
3
- "dynamic_image_size": true,
4
- "force_image_size": 448,
5
- "model": "internvl",
6
- "template": "internvl2_5",
7
- "template_config": {
8
- "name": "internvl2_5",
9
- "system_template": "<|im_start|>system\n{system_message}",
10
- "system_message": "你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。",
11
- "role_user": "<|im_start|>user\n",
12
- "role_assistant": "<|im_start|>assistant\n",
13
- "sep_style": "SeparatorStyle.MPT",
14
- "sep": "<|im_end|>\n"
15
- },
16
- "max_dynamic_patch": 12,
17
- "min_dynamic_patch": 1,
18
- "ps_version": "v2",
19
- "select_layer": -1,
20
- "skip_init": true,
21
- "torch_dtype": "bfloat16",
22
- "model_llm": "qwen2",
23
- "attention_dropout": 0.0,
24
- "bos_token_id": 151643,
25
- "eos_token_id": 151645,
26
- "pad_token_id": null,
27
- "img_context_token_id": 151667,
28
- "hidden_act": "silu",
29
- "hidden_size": 896,
30
- "initializer_range": 0.02,
31
- "intermediate_size": 4864,
32
- "length_penalty": 1.0,
33
- "max_position_embeddings": 32768,
34
- "max_window_layers": 21,
35
- "num_hidden_layers": 24,
36
- "num_attention_heads": 14,
37
- "num_key_value_heads": 2,
38
- "layer_norm_eps": 1e-06,
39
- "rope_theta": 1000000.0,
40
- "sliding_window": 32768,
41
- "tie_word_embeddings": false,
42
- "use_sliding_window": false,
43
- "vocab_size": 151674,
44
- "segment_vocab_size": 0,
45
- "rope_rank": "updown",
46
- "max_position": 32768,
47
- "generation_config": {
48
- "tokenizer_config": {
49
- "allowed_special": ["<|im_end|>", "<|im_start|>", "<|endoftext|>"],
50
- "skip_special_tokens": true
51
  },
52
- "eos_token_id": [151643, 151644, 151645],
53
- "max_length": 8192
54
- }
55
- ,
56
- "vision_config": {
 
 
57
  "attention_dropout": 0.0,
58
- "drop_path_rate": 0.0,
59
- "dropout": 0.0,
60
- "hidden_act": "gelu",
61
- "hidden_size": 1024,
62
- "image_size": 448,
63
- "initializer_factor": 1.0,
64
  "initializer_range": 0.02,
65
- "intermediate_size": 4096,
66
- "layer_norm_eps": 1e-06,
67
- "model_type": "intern_vit_6b",
68
- "norm_type": "layer_norm",
69
- "num_attention_heads": 16,
70
- "num_channels": 3,
71
  "num_hidden_layers": 24,
72
- "output_attentions": false,
73
- "output_hidden_states": false,
74
- "patch_size": 14,
75
- "qk_normalization": false,
76
- "qkv_bias": true,
77
- "return_dict": true,
78
- "torch_dtype": "bfloat16",
79
- "transformers_version": "4.37.2",
80
- "use_bfloat16": true,
81
- "use_flash_attn": true
82
- }
83
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  {
2
+ "downsample_ratio": 0.5,
3
+ "dynamic_image_size": true,
4
+ "force_image_size": 448,
5
+ "model": "internvl",
6
+ "template": "internvl2_5",
7
+ "template_config": {
8
+ "name": "internvl2_5",
9
+ "system_template": "<|im_start|>system\n{system_message}",
10
+ "system_message": "你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。",
11
+ "role_user": "<|im_start|>user\n",
12
+ "role_assistant": "<|im_start|>assistant\n",
13
+ "sep_style": "SeparatorStyle.MPT",
14
+ "sep": "<|im_end|>\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  },
16
+ "max_dynamic_patch": 12,
17
+ "min_dynamic_patch": 1,
18
+ "ps_version": "v2",
19
+ "select_layer": -1,
20
+ "skip_init": true,
21
+ "torch_dtype": "bfloat16",
22
+ "model_llm": "qwen2",
23
  "attention_dropout": 0.0,
24
+ "bos_token_id": 151643,
25
+ "eos_token_id": 151645,
26
+ "pad_token_id": null,
27
+ "img_context_token_id": 151667,
28
+ "hidden_act": "silu",
29
+ "hidden_size": 896,
30
  "initializer_range": 0.02,
31
+ "intermediate_size": 4864,
32
+ "length_penalty": 1.0,
33
+ "max_position_embeddings": 32768,
34
+ "max_window_layers": 21,
 
 
35
  "num_hidden_layers": 24,
36
+ "num_attention_heads": 14,
37
+ "num_key_value_heads": 2,
38
+ "layer_norm_eps": 1e-06,
39
+ "rope_theta": 1000000.0,
40
+ "sliding_window": 32768,
41
+ "tie_word_embeddings": false,
42
+ "use_sliding_window": false,
43
+ "vocab_size": 151674,
44
+ "segment_vocab_size": 0,
45
+ "rope_rank": "updown",
46
+ "max_position": 32768,
47
+ "generation_config": {
48
+ "tokenizer_config": {
49
+ "allowed_special": [
50
+ "<|im_end|>",
51
+ "<|im_start|>",
52
+ "<|endoftext|>"
53
+ ],
54
+ "skip_special_tokens": true
55
+ },
56
+ "eos_token_id": [
57
+ 151643,
58
+ 151644,
59
+ 151645
60
+ ],
61
+ "max_length": 8192
62
+ },
63
+ "vision_config": {
64
+ "attention_dropout": 0.0,
65
+ "drop_path_rate": 0.0,
66
+ "dropout": 0.0,
67
+ "hidden_act": "gelu",
68
+ "hidden_size": 1024,
69
+ "image_size": 448,
70
+ "initializer_factor": 1.0,
71
+ "initializer_range": 0.02,
72
+ "intermediate_size": 4096,
73
+ "layer_norm_eps": 1e-06,
74
+ "model_type": "intern_vit_6b",
75
+ "norm_type": "layer_norm",
76
+ "num_attention_heads": 16,
77
+ "num_channels": 3,
78
+ "num_hidden_layers": 24,
79
+ "output_attentions": false,
80
+ "output_hidden_states": false,
81
+ "patch_size": 14,
82
+ "qk_normalization": false,
83
+ "qkv_bias": true,
84
+ "return_dict": true,
85
+ "torch_dtype": "bfloat16",
86
+ "transformers_version": "4.37.2",
87
+ "use_bfloat16": true,
88
+ "use_flash_attn": true
89
+ }
90
+ }
Qwen/Qwen-14B-Chat/bert4torch_config.json CHANGED
@@ -1,25 +1,40 @@
1
- {
2
- "model": "qwen",
3
- "hidden_act": "silu",
4
- "intermediate_size": 13696,
5
- "initializer_range": 0.02,
6
- "layer_norm_eps": 1e-06,
7
- "hidden_size": 5120,
8
- "num_attention_heads": 40,
9
- "num_hidden_layers": 40,
10
- "rope_theta": 10000,
11
- "scale_attn_weights": true,
12
- "seq_length": 2048,
13
- "tie_word_embeddings": false,
14
- "_attn_implementation": "flash_attn_2",
15
- "vocab_size": 152064,
16
- "rope_scaling": {"type": "dynamic_qwen"},
17
- "use_logn_attn": true,
18
- "segment_vocab_size": 0,
19
- "skip_init": true,
20
- "rope_rank": "updown",
21
- "max_position_embeddings": 8192,
22
- "generation_config": {"tokenizer_config": {"allowed_special": ["<|im_end|>", "<|im_start|>", "<|endoftext|>"],
23
- "skip_special_tokens": true}, "eos_token_id": [151643, 151644, 151645],
24
- "max_length": 8192}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  }
 
1
+ {
2
+ "model": "qwen",
3
+ "hidden_act": "silu",
4
+ "intermediate_size": 13696,
5
+ "initializer_range": 0.02,
6
+ "layer_norm_eps": 1e-06,
7
+ "hidden_size": 5120,
8
+ "num_attention_heads": 40,
9
+ "num_hidden_layers": 40,
10
+ "rope_theta": 10000,
11
+ "scale_attn_weights": true,
12
+ "seq_length": 2048,
13
+ "tie_word_embeddings": false,
14
+ "_attn_implementation": "flash_attn_2",
15
+ "vocab_size": 152064,
16
+ "rope_scaling": {
17
+ "type": "dynamic_qwen"
18
+ },
19
+ "use_logn_attn": true,
20
+ "segment_vocab_size": 0,
21
+ "skip_init": true,
22
+ "rope_rank": "updown",
23
+ "max_position_embeddings": 8192,
24
+ "generation_config": {
25
+ "tokenizer_config": {
26
+ "allowed_special": [
27
+ "<|im_end|>",
28
+ "<|im_start|>",
29
+ "<|endoftext|>"
30
+ ],
31
+ "skip_special_tokens": true
32
+ },
33
+ "eos_token_id": [
34
+ 151643,
35
+ 151644,
36
+ 151645
37
+ ],
38
+ "max_length": 8192
39
+ }
40
  }
Qwen/Qwen-14B/bert4torch_config.json CHANGED
@@ -1,26 +1,39 @@
1
- {
2
- "model": "qwen",
3
- "template": "pretrained_text_continuation",
4
- "hidden_act": "silu",
5
- "intermediate_size": 13696,
6
- "initializer_range": 0.02,
7
- "layer_norm_eps": 1e-06,
8
- "hidden_size": 5120,
9
- "num_attention_heads": 40,
10
- "num_hidden_layers": 40,
11
- "rope_theta": 10000,
12
- "scale_attn_weights": true,
13
- "seq_length": 2048,
14
- "tie_word_embeddings": false,
15
- "_attn_implementation": "flash_attn_2",
16
- "vocab_size": 152064,
17
- "rope_scaling": {"type": "dynamic_qwen"},
18
- "use_logn_attn": true,
19
- "segment_vocab_size": 0,
20
- "skip_init": true,
21
- "rope_rank": "updown",
22
- "max_position_embeddings": 8192,
23
- "generation_config": {"tokenizer_config": {"allowed_special": ["<|im_end|>", "<|im_start|>", "<|endoftext|>"],
24
- "skip_special_tokens": true}, "eos_token_id": [151643],
25
- "max_length": 8192}
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  }
 
1
+ {
2
+ "model": "qwen",
3
+ "template": "pretrained_text_continuation",
4
+ "hidden_act": "silu",
5
+ "intermediate_size": 13696,
6
+ "initializer_range": 0.02,
7
+ "layer_norm_eps": 1e-06,
8
+ "hidden_size": 5120,
9
+ "num_attention_heads": 40,
10
+ "num_hidden_layers": 40,
11
+ "rope_theta": 10000,
12
+ "scale_attn_weights": true,
13
+ "seq_length": 2048,
14
+ "tie_word_embeddings": false,
15
+ "_attn_implementation": "flash_attn_2",
16
+ "vocab_size": 152064,
17
+ "rope_scaling": {
18
+ "type": "dynamic_qwen"
19
+ },
20
+ "use_logn_attn": true,
21
+ "segment_vocab_size": 0,
22
+ "skip_init": true,
23
+ "rope_rank": "updown",
24
+ "max_position_embeddings": 8192,
25
+ "generation_config": {
26
+ "tokenizer_config": {
27
+ "allowed_special": [
28
+ "<|im_end|>",
29
+ "<|im_start|>",
30
+ "<|endoftext|>"
31
+ ],
32
+ "skip_special_tokens": true
33
+ },
34
+ "eos_token_id": [
35
+ 151643
36
+ ],
37
+ "max_length": 8192
38
+ }
39
  }
Qwen/Qwen-1_8B-Chat/bert4torch_config.json CHANGED
@@ -1,26 +1,41 @@
1
- {
2
- "model": "qwen",
3
- "hidden_act": "silu",
4
- "pad_token_id": 151643,
5
- "eos_token_id": 151643,
6
- "intermediate_size": 5504,
7
- "initializer_range": 0.02,
8
- "layer_norm_eps": 1e-06,
9
- "hidden_size": 2048,
10
- "num_attention_heads": 16,
11
- "num_hidden_layers": 24,
12
- "rope_theta": 10000,
13
- "scale_attn_weights": true,
14
- "tie_word_embeddings": false,
15
- "_attn_implementation": "sdpa",
16
- "vocab_size": 151936,
17
- "rope_scaling": {"type": "dynamic_qwen"},
18
- "use_logn_attn": true,
19
- "segment_vocab_size": 0,
20
- "skip_init": true,
21
- "rope_rank": "updown",
22
- "max_position": 8192,
23
- "generation_config": {"tokenizer_config": {"allowed_special": ["<|im_end|>", "<|im_start|>", "<|endoftext|>"],
24
- "skip_special_tokens": true}, "eos_token_id": [151643, 151644, 151645],
25
- "max_length": 8192}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  }
 
1
+ {
2
+ "model": "qwen",
3
+ "hidden_act": "silu",
4
+ "pad_token_id": 151643,
5
+ "eos_token_id": 151643,
6
+ "intermediate_size": 5504,
7
+ "initializer_range": 0.02,
8
+ "layer_norm_eps": 1e-06,
9
+ "hidden_size": 2048,
10
+ "num_attention_heads": 16,
11
+ "num_hidden_layers": 24,
12
+ "rope_theta": 10000,
13
+ "scale_attn_weights": true,
14
+ "tie_word_embeddings": false,
15
+ "_attn_implementation": "sdpa",
16
+ "vocab_size": 151936,
17
+ "rope_scaling": {
18
+ "type": "dynamic_qwen"
19
+ },
20
+ "use_logn_attn": true,
21
+ "segment_vocab_size": 0,
22
+ "skip_init": true,
23
+ "rope_rank": "updown",
24
+ "max_position": 8192,
25
+ "generation_config": {
26
+ "tokenizer_config": {
27
+ "allowed_special": [
28
+ "<|im_end|>",
29
+ "<|im_start|>",
30
+ "<|endoftext|>"
31
+ ],
32
+ "skip_special_tokens": true
33
+ },
34
+ "eos_token_id": [
35
+ 151643,
36
+ 151644,
37
+ 151645
38
+ ],
39
+ "max_length": 8192
40
+ }
41
  }
Qwen/Qwen-1_8B/bert4torch_config.json CHANGED
@@ -1,27 +1,40 @@
1
- {
2
- "model": "qwen",
3
- "template": "pretrained_text_continuation",
4
- "hidden_act": "silu",
5
- "pad_token_id": 151643,
6
- "eos_token_id": 151643,
7
- "intermediate_size": 5504,
8
- "initializer_range": 0.02,
9
- "layer_norm_eps": 1e-06,
10
- "hidden_size": 2048,
11
- "num_attention_heads": 16,
12
- "num_hidden_layers": 24,
13
- "rope_theta": 10000,
14
- "scale_attn_weights": true,
15
- "tie_word_embeddings": false,
16
- "_attn_implementation": "sdpa",
17
- "vocab_size": 151936,
18
- "rope_scaling": {"type": "dynamic_qwen"},
19
- "use_logn_attn": true,
20
- "segment_vocab_size": 0,
21
- "skip_init": true,
22
- "rope_rank": "updown",
23
- "max_position": 8192,
24
- "generation_config": {"tokenizer_config": {"allowed_special": ["<|im_end|>", "<|im_start|>", "<|endoftext|>"],
25
- "skip_special_tokens": true}, "eos_token_id": [151643],
26
- "max_length": 8192}
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  }
 
1
+ {
2
+ "model": "qwen",
3
+ "template": "pretrained_text_continuation",
4
+ "hidden_act": "silu",
5
+ "pad_token_id": 151643,
6
+ "eos_token_id": 151643,
7
+ "intermediate_size": 5504,
8
+ "initializer_range": 0.02,
9
+ "layer_norm_eps": 1e-06,
10
+ "hidden_size": 2048,
11
+ "num_attention_heads": 16,
12
+ "num_hidden_layers": 24,
13
+ "rope_theta": 10000,
14
+ "scale_attn_weights": true,
15
+ "tie_word_embeddings": false,
16
+ "_attn_implementation": "sdpa",
17
+ "vocab_size": 151936,
18
+ "rope_scaling": {
19
+ "type": "dynamic_qwen"
20
+ },
21
+ "use_logn_attn": true,
22
+ "segment_vocab_size": 0,
23
+ "skip_init": true,
24
+ "rope_rank": "updown",
25
+ "max_position": 8192,
26
+ "generation_config": {
27
+ "tokenizer_config": {
28
+ "allowed_special": [
29
+ "<|im_end|>",
30
+ "<|im_start|>",
31
+ "<|endoftext|>"
32
+ ],
33
+ "skip_special_tokens": true
34
+ },
35
+ "eos_token_id": [
36
+ 151643
37
+ ],
38
+ "max_length": 8192
39
+ }
40
  }
Qwen/Qwen-7B-Chat/bert4torch_config.json CHANGED
@@ -1,26 +1,41 @@
1
- {
2
- "model": "qwen",
3
- "hidden_act": "silu",
4
- "pad_token_id": 151643,
5
- "eos_token_id": 151643,
6
- "intermediate_size": 11008,
7
- "initializer_range": 0.02,
8
- "layer_norm_eps": 1e-05,
9
- "hidden_size": 4096,
10
- "num_attention_heads": 32,
11
- "num_hidden_layers": 32,
12
- "rope_theta": 10000,
13
- "scale_attn_weights": true,
14
- "tie_word_embeddings": false,
15
- "_attn_implementation": "flash_attn_2",
16
- "vocab_size": 151936,
17
- "rope_scaling": {"type": "dynamic_qwen"},
18
- "use_logn_attn": true,
19
- "segment_vocab_size": 0,
20
- "skip_init": true,
21
- "rope_rank": "updown",
22
- "max_position": 8192,
23
- "generation_config": {"tokenizer_config": {"allowed_special": ["<|im_end|>", "<|im_start|>", "<|endoftext|>"],
24
- "skip_special_tokens": true}, "eos_token_id": [151643, 151644, 151645],
25
- "max_length": 8192}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  }
 
1
+ {
2
+ "model": "qwen",
3
+ "hidden_act": "silu",
4
+ "pad_token_id": 151643,
5
+ "eos_token_id": 151643,
6
+ "intermediate_size": 11008,
7
+ "initializer_range": 0.02,
8
+ "layer_norm_eps": 1e-05,
9
+ "hidden_size": 4096,
10
+ "num_attention_heads": 32,
11
+ "num_hidden_layers": 32,
12
+ "rope_theta": 10000,
13
+ "scale_attn_weights": true,
14
+ "tie_word_embeddings": false,
15
+ "_attn_implementation": "flash_attn_2",
16
+ "vocab_size": 151936,
17
+ "rope_scaling": {
18
+ "type": "dynamic_qwen"
19
+ },
20
+ "use_logn_attn": true,
21
+ "segment_vocab_size": 0,
22
+ "skip_init": true,
23
+ "rope_rank": "updown",
24
+ "max_position": 8192,
25
+ "generation_config": {
26
+ "tokenizer_config": {
27
+ "allowed_special": [
28
+ "<|im_end|>",
29
+ "<|im_start|>",
30
+ "<|endoftext|>"
31
+ ],
32
+ "skip_special_tokens": true
33
+ },
34
+ "eos_token_id": [
35
+ 151643,
36
+ 151644,
37
+ 151645
38
+ ],
39
+ "max_length": 8192
40
+ }
41
  }
Qwen/Qwen-7B/bert4torch_config.json CHANGED
@@ -1,27 +1,40 @@
1
- {
2
- "model": "qwen",
3
- "template": "pretrained_text_continuation",
4
- "hidden_act": "silu",
5
- "pad_token_id": 151643,
6
- "eos_token_id": 151643,
7
- "intermediate_size": 11008,
8
- "initializer_range": 0.02,
9
- "layer_norm_eps": 1e-05,
10
- "hidden_size": 4096,
11
- "num_attention_heads": 32,
12
- "num_hidden_layers": 32,
13
- "rope_theta": 10000,
14
- "scale_attn_weights": true,
15
- "tie_word_embeddings": false,
16
- "_attn_implementation": "flash_attn_2",
17
- "vocab_size": 151936,
18
- "rope_scaling": {"type": "dynamic_qwen"},
19
- "use_logn_attn": true,
20
- "segment_vocab_size": 0,
21
- "skip_init": true,
22
- "rope_rank": "updown",
23
- "max_position": 8192,
24
- "generation_config": {"tokenizer_config": {"allowed_special": ["<|im_end|>", "<|im_start|>", "<|endoftext|>"],
25
- "skip_special_tokens": true}, "eos_token_id": [151643],
26
- "max_length": 8192}
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  }
 
1
+ {
2
+ "model": "qwen",
3
+ "template": "pretrained_text_continuation",
4
+ "hidden_act": "silu",
5
+ "pad_token_id": 151643,
6
+ "eos_token_id": 151643,
7
+ "intermediate_size": 11008,
8
+ "initializer_range": 0.02,
9
+ "layer_norm_eps": 1e-05,
10
+ "hidden_size": 4096,
11
+ "num_attention_heads": 32,
12
+ "num_hidden_layers": 32,
13
+ "rope_theta": 10000,
14
+ "scale_attn_weights": true,
15
+ "tie_word_embeddings": false,
16
+ "_attn_implementation": "flash_attn_2",
17
+ "vocab_size": 151936,
18
+ "rope_scaling": {
19
+ "type": "dynamic_qwen"
20
+ },
21
+ "use_logn_attn": true,
22
+ "segment_vocab_size": 0,
23
+ "skip_init": true,
24
+ "rope_rank": "updown",
25
+ "max_position": 8192,
26
+ "generation_config": {
27
+ "tokenizer_config": {
28
+ "allowed_special": [
29
+ "<|im_end|>",
30
+ "<|im_start|>",
31
+ "<|endoftext|>"
32
+ ],
33
+ "skip_special_tokens": true
34
+ },
35
+ "eos_token_id": [
36
+ 151643
37
+ ],
38
+ "max_length": 8192
39
+ }
40
  }
Qwen/Qwen1.5-0.5B-Chat/bert4torch_config.json CHANGED
@@ -1,29 +1,37 @@
1
- {
2
- "model": "qwen2",
3
- "hidden_act": "silu",
4
- "attention_probs_dropout_prob": 0.0,
5
- "bos_token_id": 151643,
6
- "eos_token_id": 151645,
7
- "intermediate_size": 2816,
8
- "initializer_range": 0.02,
9
- "layer_norm_eps": 1e-06,
10
- "hidden_size": 1024,
11
- "num_attention_heads": 16,
12
- "num_hidden_layers": 24,
13
- "num_key_value_heads": 16,
14
- "rope_theta": 1000000.0,
15
- "tie_word_embeddings": true,
16
- "torch_dtype": "bfloat16",
17
- "_attn_implementation": "sdpa",
18
- "is_causal": true,
19
- "vocab_size": 151936,
20
- "segment_vocab_size": 0,
21
- "skip_init": true,
22
- "rope_rank": "updown",
23
- "max_position_embeddings": 32768,
24
- "sliding_window": 32768,
25
- "max_window_layers": 21,
26
- "convert_logits_dtype": "float32",
27
- "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643, 151645],
28
- "max_length": 32768}
 
 
 
 
 
 
 
 
29
  }
 
1
+ {
2
+ "model": "qwen2",
3
+ "hidden_act": "silu",
4
+ "attention_probs_dropout_prob": 0.0,
5
+ "bos_token_id": 151643,
6
+ "eos_token_id": 151645,
7
+ "intermediate_size": 2816,
8
+ "initializer_range": 0.02,
9
+ "layer_norm_eps": 1e-06,
10
+ "hidden_size": 1024,
11
+ "num_attention_heads": 16,
12
+ "num_hidden_layers": 24,
13
+ "num_key_value_heads": 16,
14
+ "rope_theta": 1000000.0,
15
+ "tie_word_embeddings": true,
16
+ "torch_dtype": "bfloat16",
17
+ "_attn_implementation": "sdpa",
18
+ "is_causal": true,
19
+ "vocab_size": 151936,
20
+ "segment_vocab_size": 0,
21
+ "skip_init": true,
22
+ "rope_rank": "updown",
23
+ "max_position_embeddings": 32768,
24
+ "sliding_window": 32768,
25
+ "max_window_layers": 21,
26
+ "convert_logits_dtype": "float32",
27
+ "generation_config": {
28
+ "tokenizer_config": {
29
+ "skip_special_tokens": true
30
+ },
31
+ "eos_token_id": [
32
+ 151643,
33
+ 151645
34
+ ],
35
+ "max_length": 32768
36
+ }
37
  }
Qwen/Qwen1.5-0.5B/bert4torch_config.json CHANGED
@@ -1,30 +1,37 @@
1
- {
2
- "model": "qwen2",
3
- "template": "pretrained_text_continuation",
4
- "hidden_act": "silu",
5
- "attention_probs_dropout_prob": 0.0,
6
- "bos_token_id": 151643,
7
- "eos_token_id": 151645,
8
- "intermediate_size": 2816,
9
- "initializer_range": 0.02,
10
- "layer_norm_eps": 1e-06,
11
- "hidden_size": 1024,
12
- "num_attention_heads": 16,
13
- "num_hidden_layers": 24,
14
- "num_key_value_heads": 16,
15
- "rope_theta": 1000000.0,
16
- "tie_word_embeddings": true,
17
- "torch_dtype": "bfloat16",
18
- "_attn_implementation": "sdpa",
19
- "is_causal": true,
20
- "vocab_size": 151936,
21
- "segment_vocab_size": 0,
22
- "skip_init": true,
23
- "rope_rank": "updown",
24
- "max_position_embeddings": 32768,
25
- "sliding_window": 32768,
26
- "max_window_layers": 21,
27
- "convert_logits_dtype": "float32",
28
- "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643],
29
- "max_length": 32768}
 
 
 
 
 
 
 
30
  }
 
1
+ {
2
+ "model": "qwen2",
3
+ "template": "pretrained_text_continuation",
4
+ "hidden_act": "silu",
5
+ "attention_probs_dropout_prob": 0.0,
6
+ "bos_token_id": 151643,
7
+ "eos_token_id": 151645,
8
+ "intermediate_size": 2816,
9
+ "initializer_range": 0.02,
10
+ "layer_norm_eps": 1e-06,
11
+ "hidden_size": 1024,
12
+ "num_attention_heads": 16,
13
+ "num_hidden_layers": 24,
14
+ "num_key_value_heads": 16,
15
+ "rope_theta": 1000000.0,
16
+ "tie_word_embeddings": true,
17
+ "torch_dtype": "bfloat16",
18
+ "_attn_implementation": "sdpa",
19
+ "is_causal": true,
20
+ "vocab_size": 151936,
21
+ "segment_vocab_size": 0,
22
+ "skip_init": true,
23
+ "rope_rank": "updown",
24
+ "max_position_embeddings": 32768,
25
+ "sliding_window": 32768,
26
+ "max_window_layers": 21,
27
+ "convert_logits_dtype": "float32",
28
+ "generation_config": {
29
+ "tokenizer_config": {
30
+ "skip_special_tokens": true
31
+ },
32
+ "eos_token_id": [
33
+ 151643
34
+ ],
35
+ "max_length": 32768
36
+ }
37
  }
Qwen/Qwen1.5-1.8B-Chat/bert4torch_config.json CHANGED
@@ -1,29 +1,37 @@
1
- {
2
- "model": "qwen2",
3
- "hidden_act": "silu",
4
- "attention_probs_dropout_prob": 0.0,
5
- "bos_token_id": 151643,
6
- "eos_token_id": 151645,
7
- "intermediate_size": 5504,
8
- "initializer_range": 0.02,
9
- "layer_norm_eps": 1e-06,
10
- "hidden_size": 2048,
11
- "num_attention_heads": 16,
12
- "num_hidden_layers": 24,
13
- "num_key_value_heads": 16,
14
- "rope_theta": 1000000.0,
15
- "tie_word_embeddings": false,
16
- "torch_dtype": "bfloat16",
17
- "_attn_implementation": "sdpa",
18
- "is_causal": true,
19
- "vocab_size": 151936,
20
- "segment_vocab_size": 0,
21
- "skip_init": true,
22
- "rope_rank": "updown",
23
- "max_position_embeddings": 32768,
24
- "sliding_window": 32768,
25
- "max_window_layers": 21,
26
- "convert_logits_dtype": "float32",
27
- "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643, 151645],
28
- "max_length": 32768}
 
 
 
 
 
 
 
 
29
  }
 
1
+ {
2
+ "model": "qwen2",
3
+ "hidden_act": "silu",
4
+ "attention_probs_dropout_prob": 0.0,
5
+ "bos_token_id": 151643,
6
+ "eos_token_id": 151645,
7
+ "intermediate_size": 5504,
8
+ "initializer_range": 0.02,
9
+ "layer_norm_eps": 1e-06,
10
+ "hidden_size": 2048,
11
+ "num_attention_heads": 16,
12
+ "num_hidden_layers": 24,
13
+ "num_key_value_heads": 16,
14
+ "rope_theta": 1000000.0,
15
+ "tie_word_embeddings": false,
16
+ "torch_dtype": "bfloat16",
17
+ "_attn_implementation": "sdpa",
18
+ "is_causal": true,
19
+ "vocab_size": 151936,
20
+ "segment_vocab_size": 0,
21
+ "skip_init": true,
22
+ "rope_rank": "updown",
23
+ "max_position_embeddings": 32768,
24
+ "sliding_window": 32768,
25
+ "max_window_layers": 21,
26
+ "convert_logits_dtype": "float32",
27
+ "generation_config": {
28
+ "tokenizer_config": {
29
+ "skip_special_tokens": true
30
+ },
31
+ "eos_token_id": [
32
+ 151643,
33
+ 151645
34
+ ],
35
+ "max_length": 32768
36
+ }
37
  }
Qwen/Qwen1.5-1.8B/bert4torch_config.json CHANGED
@@ -1,30 +1,37 @@
1
- {
2
- "model": "qwen2",
3
- "template": "pretrained_text_continuation",
4
- "hidden_act": "silu",
5
- "attention_probs_dropout_prob": 0.0,
6
- "bos_token_id": 151643,
7
- "eos_token_id": 151645,
8
- "intermediate_size": 5504,
9
- "initializer_range": 0.02,
10
- "layer_norm_eps": 1e-06,
11
- "hidden_size": 2048,
12
- "num_attention_heads": 16,
13
- "num_hidden_layers": 24,
14
- "num_key_value_heads": 16,
15
- "rope_theta": 1000000.0,
16
- "tie_word_embeddings": false,
17
- "torch_dtype": "bfloat16",
18
- "_attn_implementation": "sdpa",
19
- "is_causal": true,
20
- "vocab_size": 151936,
21
- "segment_vocab_size": 0,
22
- "skip_init": true,
23
- "rope_rank": "updown",
24
- "max_position_embeddings": 32768,
25
- "sliding_window": 32768,
26
- "max_window_layers": 21,
27
- "convert_logits_dtype": "float32",
28
- "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643],
29
- "max_length": 32768}
 
 
 
 
 
 
 
30
  }
 
1
+ {
2
+ "model": "qwen2",
3
+ "template": "pretrained_text_continuation",
4
+ "hidden_act": "silu",
5
+ "attention_probs_dropout_prob": 0.0,
6
+ "bos_token_id": 151643,
7
+ "eos_token_id": 151645,
8
+ "intermediate_size": 5504,
9
+ "initializer_range": 0.02,
10
+ "layer_norm_eps": 1e-06,
11
+ "hidden_size": 2048,
12
+ "num_attention_heads": 16,
13
+ "num_hidden_layers": 24,
14
+ "num_key_value_heads": 16,
15
+ "rope_theta": 1000000.0,
16
+ "tie_word_embeddings": false,
17
+ "torch_dtype": "bfloat16",
18
+ "_attn_implementation": "sdpa",
19
+ "is_causal": true,
20
+ "vocab_size": 151936,
21
+ "segment_vocab_size": 0,
22
+ "skip_init": true,
23
+ "rope_rank": "updown",
24
+ "max_position_embeddings": 32768,
25
+ "sliding_window": 32768,
26
+ "max_window_layers": 21,
27
+ "convert_logits_dtype": "float32",
28
+ "generation_config": {
29
+ "tokenizer_config": {
30
+ "skip_special_tokens": true
31
+ },
32
+ "eos_token_id": [
33
+ 151643
34
+ ],
35
+ "max_length": 32768
36
+ }
37
  }
Qwen/Qwen1.5-14B-Chat/bert4torch_config.json CHANGED
@@ -1,29 +1,37 @@
1
- {
2
- "model": "qwen2",
3
- "hidden_act": "silu",
4
- "attention_probs_dropout_prob": 0.0,
5
- "bos_token_id": 151643,
6
- "eos_token_id": 151645,
7
- "intermediate_size": 13696,
8
- "initializer_range": 0.02,
9
- "layer_norm_eps": 1e-06,
10
- "hidden_size": 5120,
11
- "num_attention_heads": 40,
12
- "num_hidden_layers": 40,
13
- "num_key_value_heads": 40,
14
- "rope_theta": 1000000.0,
15
- "tie_word_embeddings": false,
16
- "torch_dtype": "bfloat16",
17
- "_attn_implementation": "sdpa",
18
- "is_causal": true,
19
- "vocab_size": 152064,
20
- "segment_vocab_size": 0,
21
- "skip_init": true,
22
- "rope_rank": "updown",
23
- "max_position_embeddings": 32768,
24
- "sliding_window": 32768,
25
- "max_window_layers": 35,
26
- "convert_logits_dtype": "float32",
27
- "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643, 151645],
28
- "max_length": 32768}
 
 
 
 
 
 
 
 
29
  }
 
1
+ {
2
+ "model": "qwen2",
3
+ "hidden_act": "silu",
4
+ "attention_probs_dropout_prob": 0.0,
5
+ "bos_token_id": 151643,
6
+ "eos_token_id": 151645,
7
+ "intermediate_size": 13696,
8
+ "initializer_range": 0.02,
9
+ "layer_norm_eps": 1e-06,
10
+ "hidden_size": 5120,
11
+ "num_attention_heads": 40,
12
+ "num_hidden_layers": 40,
13
+ "num_key_value_heads": 40,
14
+ "rope_theta": 1000000.0,
15
+ "tie_word_embeddings": false,
16
+ "torch_dtype": "bfloat16",
17
+ "_attn_implementation": "sdpa",
18
+ "is_causal": true,
19
+ "vocab_size": 152064,
20
+ "segment_vocab_size": 0,
21
+ "skip_init": true,
22
+ "rope_rank": "updown",
23
+ "max_position_embeddings": 32768,
24
+ "sliding_window": 32768,
25
+ "max_window_layers": 35,
26
+ "convert_logits_dtype": "float32",
27
+ "generation_config": {
28
+ "tokenizer_config": {
29
+ "skip_special_tokens": true
30
+ },
31
+ "eos_token_id": [
32
+ 151643,
33
+ 151645
34
+ ],
35
+ "max_length": 32768
36
+ }
37
  }
Qwen/Qwen1.5-14B/bert4torch_config.json CHANGED
@@ -1,30 +1,37 @@
1
- {
2
- "model": "qwen2",
3
- "template": "pretrained_text_continuation",
4
- "hidden_act": "silu",
5
- "attention_probs_dropout_prob": 0.0,
6
- "bos_token_id": 151643,
7
- "eos_token_id": 151645,
8
- "intermediate_size": 13696,
9
- "initializer_range": 0.02,
10
- "layer_norm_eps": 1e-06,
11
- "hidden_size": 5120,
12
- "num_attention_heads": 40,
13
- "num_hidden_layers": 40,
14
- "num_key_value_heads": 40,
15
- "rope_theta": 1000000.0,
16
- "tie_word_embeddings": false,
17
- "torch_dtype": "bfloat16",
18
- "_attn_implementation": "sdpa",
19
- "is_causal": true,
20
- "vocab_size": 152064,
21
- "segment_vocab_size": 0,
22
- "skip_init": true,
23
- "rope_rank": "updown",
24
- "max_position_embeddings": 32768,
25
- "sliding_window": 32768,
26
- "max_window_layers": 35,
27
- "convert_logits_dtype": "float32",
28
- "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643],
29
- "max_length": 32768}
 
 
 
 
 
 
 
30
  }
 
1
+ {
2
+ "model": "qwen2",
3
+ "template": "pretrained_text_continuation",
4
+ "hidden_act": "silu",
5
+ "attention_probs_dropout_prob": 0.0,
6
+ "bos_token_id": 151643,
7
+ "eos_token_id": 151645,
8
+ "intermediate_size": 13696,
9
+ "initializer_range": 0.02,
10
+ "layer_norm_eps": 1e-06,
11
+ "hidden_size": 5120,
12
+ "num_attention_heads": 40,
13
+ "num_hidden_layers": 40,
14
+ "num_key_value_heads": 40,
15
+ "rope_theta": 1000000.0,
16
+ "tie_word_embeddings": false,
17
+ "torch_dtype": "bfloat16",
18
+ "_attn_implementation": "sdpa",
19
+ "is_causal": true,
20
+ "vocab_size": 152064,
21
+ "segment_vocab_size": 0,
22
+ "skip_init": true,
23
+ "rope_rank": "updown",
24
+ "max_position_embeddings": 32768,
25
+ "sliding_window": 32768,
26
+ "max_window_layers": 35,
27
+ "convert_logits_dtype": "float32",
28
+ "generation_config": {
29
+ "tokenizer_config": {
30
+ "skip_special_tokens": true
31
+ },
32
+ "eos_token_id": [
33
+ 151643
34
+ ],
35
+ "max_length": 32768
36
+ }
37
  }
Qwen/Qwen1.5-7B-Chat/bert4torch_config.json CHANGED
@@ -1,29 +1,37 @@
1
- {
2
- "model": "qwen2",
3
- "hidden_act": "silu",
4
- "attention_probs_dropout_prob": 0.0,
5
- "bos_token_id": 151643,
6
- "eos_token_id": 151645,
7
- "intermediate_size": 11008,
8
- "initializer_range": 0.02,
9
- "layer_norm_eps": 1e-06,
10
- "hidden_size": 4096,
11
- "num_attention_heads": 32,
12
- "num_hidden_layers": 32,
13
- "num_key_value_heads": 32,
14
- "rope_theta": 1000000.0,
15
- "tie_word_embeddings": false,
16
- "torch_dtype": "bfloat16",
17
- "_attn_implementation": "sdpa",
18
- "is_causal": true,
19
- "vocab_size": 151936,
20
- "segment_vocab_size": 0,
21
- "skip_init": true,
22
- "rope_rank": "updown",
23
- "max_position_embeddings": 32768,
24
- "sliding_window": 32768,
25
- "max_window_layers": 28,
26
- "convert_logits_dtype": "float32",
27
- "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643, 151645],
28
- "max_length": 32768}
 
 
 
 
 
 
 
 
29
  }
 
1
+ {
2
+ "model": "qwen2",
3
+ "hidden_act": "silu",
4
+ "attention_probs_dropout_prob": 0.0,
5
+ "bos_token_id": 151643,
6
+ "eos_token_id": 151645,
7
+ "intermediate_size": 11008,
8
+ "initializer_range": 0.02,
9
+ "layer_norm_eps": 1e-06,
10
+ "hidden_size": 4096,
11
+ "num_attention_heads": 32,
12
+ "num_hidden_layers": 32,
13
+ "num_key_value_heads": 32,
14
+ "rope_theta": 1000000.0,
15
+ "tie_word_embeddings": false,
16
+ "torch_dtype": "bfloat16",
17
+ "_attn_implementation": "sdpa",
18
+ "is_causal": true,
19
+ "vocab_size": 151936,
20
+ "segment_vocab_size": 0,
21
+ "skip_init": true,
22
+ "rope_rank": "updown",
23
+ "max_position_embeddings": 32768,
24
+ "sliding_window": 32768,
25
+ "max_window_layers": 28,
26
+ "convert_logits_dtype": "float32",
27
+ "generation_config": {
28
+ "tokenizer_config": {
29
+ "skip_special_tokens": true
30
+ },
31
+ "eos_token_id": [
32
+ 151643,
33
+ 151645
34
+ ],
35
+ "max_length": 32768
36
+ }
37
  }
Qwen/Qwen1.5-7B/bert4torch_config.json CHANGED
@@ -1,30 +1,37 @@
1
- {
2
- "model": "qwen2",
3
- "template": "pretrained_text_continuation",
4
- "hidden_act": "silu",
5
- "attention_probs_dropout_prob": 0.0,
6
- "bos_token_id": 151643,
7
- "eos_token_id": 151645,
8
- "intermediate_size": 11008,
9
- "initializer_range": 0.02,
10
- "layer_norm_eps": 1e-06,
11
- "hidden_size": 4096,
12
- "num_attention_heads": 32,
13
- "num_hidden_layers": 32,
14
- "num_key_value_heads": 32,
15
- "rope_theta": 1000000.0,
16
- "tie_word_embeddings": false,
17
- "torch_dtype": "bfloat16",
18
- "_attn_implementation": "sdpa",
19
- "is_causal": true,
20
- "vocab_size": 151936,
21
- "segment_vocab_size": 0,
22
- "skip_init": true,
23
- "rope_rank": "updown",
24
- "max_position_embeddings": 32768,
25
- "sliding_window": 32768,
26
- "max_window_layers": 28,
27
- "convert_logits_dtype": "float32",
28
- "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643],
29
- "max_length": 32768}
 
 
 
 
 
 
 
30
  }
 
1
+ {
2
+ "model": "qwen2",
3
+ "template": "pretrained_text_continuation",
4
+ "hidden_act": "silu",
5
+ "attention_probs_dropout_prob": 0.0,
6
+ "bos_token_id": 151643,
7
+ "eos_token_id": 151645,
8
+ "intermediate_size": 11008,
9
+ "initializer_range": 0.02,
10
+ "layer_norm_eps": 1e-06,
11
+ "hidden_size": 4096,
12
+ "num_attention_heads": 32,
13
+ "num_hidden_layers": 32,
14
+ "num_key_value_heads": 32,
15
+ "rope_theta": 1000000.0,
16
+ "tie_word_embeddings": false,
17
+ "torch_dtype": "bfloat16",
18
+ "_attn_implementation": "sdpa",
19
+ "is_causal": true,
20
+ "vocab_size": 151936,
21
+ "segment_vocab_size": 0,
22
+ "skip_init": true,
23
+ "rope_rank": "updown",
24
+ "max_position_embeddings": 32768,
25
+ "sliding_window": 32768,
26
+ "max_window_layers": 28,
27
+ "convert_logits_dtype": "float32",
28
+ "generation_config": {
29
+ "tokenizer_config": {
30
+ "skip_special_tokens": true
31
+ },
32
+ "eos_token_id": [
33
+ 151643
34
+ ],
35
+ "max_length": 32768
36
+ }
37
  }
Qwen/Qwen2-0.5B-Instruct/bert4torch_config.json CHANGED
@@ -1,29 +1,37 @@
1
- {
2
- "model": "qwen2",
3
- "hidden_act": "silu",
4
- "attention_probs_dropout_prob": 0.0,
5
- "bos_token_id": 151643,
6
- "eos_token_id": 151645,
7
- "intermediate_size": 4864,
8
- "initializer_range": 0.02,
9
- "layer_norm_eps": 1e-06,
10
- "hidden_size": 896,
11
- "num_attention_heads": 14,
12
- "num_hidden_layers": 24,
13
- "num_key_value_heads": 2,
14
- "rope_theta": 1000000.0,
15
- "tie_word_embeddings": true,
16
- "torch_dtype": "bfloat16",
17
- "_attn_implementation": "sdpa",
18
- "is_causal": true,
19
- "vocab_size": 151936,
20
- "segment_vocab_size": 0,
21
- "skip_init": true,
22
- "rope_rank": "updown",
23
- "max_position_embeddings": 32768,
24
- "sliding_window": 32768,
25
- "max_window_layers": 24,
26
- "convert_logits_dtype": "float32",
27
- "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643, 151645],
28
- "max_length": 32768}
 
 
 
 
 
 
 
 
29
  }
 
1
+ {
2
+ "model": "qwen2",
3
+ "hidden_act": "silu",
4
+ "attention_probs_dropout_prob": 0.0,
5
+ "bos_token_id": 151643,
6
+ "eos_token_id": 151645,
7
+ "intermediate_size": 4864,
8
+ "initializer_range": 0.02,
9
+ "layer_norm_eps": 1e-06,
10
+ "hidden_size": 896,
11
+ "num_attention_heads": 14,
12
+ "num_hidden_layers": 24,
13
+ "num_key_value_heads": 2,
14
+ "rope_theta": 1000000.0,
15
+ "tie_word_embeddings": true,
16
+ "torch_dtype": "bfloat16",
17
+ "_attn_implementation": "sdpa",
18
+ "is_causal": true,
19
+ "vocab_size": 151936,
20
+ "segment_vocab_size": 0,
21
+ "skip_init": true,
22
+ "rope_rank": "updown",
23
+ "max_position_embeddings": 32768,
24
+ "sliding_window": 32768,
25
+ "max_window_layers": 24,
26
+ "convert_logits_dtype": "float32",
27
+ "generation_config": {
28
+ "tokenizer_config": {
29
+ "skip_special_tokens": true
30
+ },
31
+ "eos_token_id": [
32
+ 151643,
33
+ 151645
34
+ ],
35
+ "max_length": 32768
36
+ }
37
  }
Qwen/Qwen2-0.5B/bert4torch_config.json CHANGED
@@ -1,30 +1,37 @@
1
- {
2
- "model": "qwen2",
3
- "template": "pretrained_text_continuation",
4
- "hidden_act": "silu",
5
- "attention_probs_dropout_prob": 0.0,
6
- "bos_token_id": 151643,
7
- "eos_token_id": 151645,
8
- "intermediate_size": 4864,
9
- "initializer_range": 0.02,
10
- "layer_norm_eps": 1e-06,
11
- "hidden_size": 896,
12
- "num_attention_heads": 14,
13
- "num_hidden_layers": 24,
14
- "num_key_value_heads": 2,
15
- "rope_theta": 1000000.0,
16
- "tie_word_embeddings": true,
17
- "torch_dtype": "bfloat16",
18
- "_attn_implementation": "sdpa",
19
- "is_causal": true,
20
- "vocab_size": 151936,
21
- "segment_vocab_size": 0,
22
- "skip_init": true,
23
- "rope_rank": "updown",
24
- "max_position_embeddings": 32768,
25
- "sliding_window": 32768,
26
- "max_window_layers": 24,
27
- "convert_logits_dtype": "float32",
28
- "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643],
29
- "max_length": 32768}
 
 
 
 
 
 
 
30
  }
 
1
+ {
2
+ "model": "qwen2",
3
+ "template": "pretrained_text_continuation",
4
+ "hidden_act": "silu",
5
+ "attention_probs_dropout_prob": 0.0,
6
+ "bos_token_id": 151643,
7
+ "eos_token_id": 151645,
8
+ "intermediate_size": 4864,
9
+ "initializer_range": 0.02,
10
+ "layer_norm_eps": 1e-06,
11
+ "hidden_size": 896,
12
+ "num_attention_heads": 14,
13
+ "num_hidden_layers": 24,
14
+ "num_key_value_heads": 2,
15
+ "rope_theta": 1000000.0,
16
+ "tie_word_embeddings": true,
17
+ "torch_dtype": "bfloat16",
18
+ "_attn_implementation": "sdpa",
19
+ "is_causal": true,
20
+ "vocab_size": 151936,
21
+ "segment_vocab_size": 0,
22
+ "skip_init": true,
23
+ "rope_rank": "updown",
24
+ "max_position_embeddings": 32768,
25
+ "sliding_window": 32768,
26
+ "max_window_layers": 24,
27
+ "convert_logits_dtype": "float32",
28
+ "generation_config": {
29
+ "tokenizer_config": {
30
+ "skip_special_tokens": true
31
+ },
32
+ "eos_token_id": [
33
+ 151643
34
+ ],
35
+ "max_length": 32768
36
+ }
37
  }
Qwen/Qwen2-1.5B-Instruct/bert4torch_config.json CHANGED
@@ -1,29 +1,37 @@
1
- {
2
- "model": "qwen2",
3
- "hidden_act": "silu",
4
- "attention_probs_dropout_prob": 0.0,
5
- "bos_token_id": 151643,
6
- "eos_token_id": 151645,
7
- "intermediate_size": 8960,
8
- "initializer_range": 0.02,
9
- "layer_norm_eps": 1e-06,
10
- "hidden_size": 1536,
11
- "num_attention_heads": 12,
12
- "num_hidden_layers": 28,
13
- "num_key_value_heads": 2,
14
- "rope_theta": 1000000.0,
15
- "tie_word_embeddings": true,
16
- "torch_dtype": "bfloat16",
17
- "_attn_implementation": "sdpa",
18
- "is_causal": true,
19
- "vocab_size": 151936,
20
- "segment_vocab_size": 0,
21
- "skip_init": true,
22
- "rope_rank": "updown",
23
- "max_position_embeddings": 32768,
24
- "sliding_window": 32768,
25
- "max_window_layers": 28,
26
- "convert_logits_dtype": "float32",
27
- "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643, 151645],
28
- "max_length": 32768}
 
 
 
 
 
 
 
 
29
  }
 
1
+ {
2
+ "model": "qwen2",
3
+ "hidden_act": "silu",
4
+ "attention_probs_dropout_prob": 0.0,
5
+ "bos_token_id": 151643,
6
+ "eos_token_id": 151645,
7
+ "intermediate_size": 8960,
8
+ "initializer_range": 0.02,
9
+ "layer_norm_eps": 1e-06,
10
+ "hidden_size": 1536,
11
+ "num_attention_heads": 12,
12
+ "num_hidden_layers": 28,
13
+ "num_key_value_heads": 2,
14
+ "rope_theta": 1000000.0,
15
+ "tie_word_embeddings": true,
16
+ "torch_dtype": "bfloat16",
17
+ "_attn_implementation": "sdpa",
18
+ "is_causal": true,
19
+ "vocab_size": 151936,
20
+ "segment_vocab_size": 0,
21
+ "skip_init": true,
22
+ "rope_rank": "updown",
23
+ "max_position_embeddings": 32768,
24
+ "sliding_window": 32768,
25
+ "max_window_layers": 28,
26
+ "convert_logits_dtype": "float32",
27
+ "generation_config": {
28
+ "tokenizer_config": {
29
+ "skip_special_tokens": true
30
+ },
31
+ "eos_token_id": [
32
+ 151643,
33
+ 151645
34
+ ],
35
+ "max_length": 32768
36
+ }
37
  }
Qwen/Qwen2-1.5B/bert4torch_config.json CHANGED
@@ -1,30 +1,37 @@
1
- {
2
- "model": "qwen2",
3
- "template": "pretrained_text_continuation",
4
- "hidden_act": "silu",
5
- "attention_probs_dropout_prob": 0.0,
6
- "bos_token_id": 151643,
7
- "eos_token_id": 151645,
8
- "intermediate_size": 8960,
9
- "initializer_range": 0.02,
10
- "layer_norm_eps": 1e-06,
11
- "hidden_size": 1536,
12
- "num_attention_heads": 12,
13
- "num_hidden_layers": 28,
14
- "num_key_value_heads": 2,
15
- "rope_theta": 1000000.0,
16
- "tie_word_embeddings": true,
17
- "torch_dtype": "bfloat16",
18
- "_attn_implementation": "sdpa",
19
- "is_causal": true,
20
- "vocab_size": 151936,
21
- "segment_vocab_size": 0,
22
- "skip_init": true,
23
- "rope_rank": "updown",
24
- "max_position_embeddings": 32768,
25
- "sliding_window": 32768,
26
- "max_window_layers": 28,
27
- "convert_logits_dtype": "float32",
28
- "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643],
29
- "max_length": 32768}
 
 
 
 
 
 
 
30
  }
 
1
+ {
2
+ "model": "qwen2",
3
+ "template": "pretrained_text_continuation",
4
+ "hidden_act": "silu",
5
+ "attention_probs_dropout_prob": 0.0,
6
+ "bos_token_id": 151643,
7
+ "eos_token_id": 151645,
8
+ "intermediate_size": 8960,
9
+ "initializer_range": 0.02,
10
+ "layer_norm_eps": 1e-06,
11
+ "hidden_size": 1536,
12
+ "num_attention_heads": 12,
13
+ "num_hidden_layers": 28,
14
+ "num_key_value_heads": 2,
15
+ "rope_theta": 1000000.0,
16
+ "tie_word_embeddings": true,
17
+ "torch_dtype": "bfloat16",
18
+ "_attn_implementation": "sdpa",
19
+ "is_causal": true,
20
+ "vocab_size": 151936,
21
+ "segment_vocab_size": 0,
22
+ "skip_init": true,
23
+ "rope_rank": "updown",
24
+ "max_position_embeddings": 32768,
25
+ "sliding_window": 32768,
26
+ "max_window_layers": 28,
27
+ "convert_logits_dtype": "float32",
28
+ "generation_config": {
29
+ "tokenizer_config": {
30
+ "skip_special_tokens": true
31
+ },
32
+ "eos_token_id": [
33
+ 151643
34
+ ],
35
+ "max_length": 32768
36
+ }
37
  }
Qwen/Qwen2-7B-Instruct/bert4torch_config.json CHANGED
@@ -1,29 +1,37 @@
1
- {
2
- "model": "qwen2",
3
- "hidden_act": "silu",
4
- "attention_probs_dropout_prob": 0.0,
5
- "bos_token_id": 151643,
6
- "eos_token_id": 151645,
7
- "intermediate_size": 18944,
8
- "initializer_range": 0.02,
9
- "layer_norm_eps": 1e-06,
10
- "hidden_size": 3584,
11
- "num_attention_heads": 28,
12
- "num_hidden_layers": 28,
13
- "num_key_value_heads": 4,
14
- "rope_theta": 1000000.0,
15
- "tie_word_embeddings": false,
16
- "torch_dtype": "bfloat16",
17
- "_attn_implementation": "sdpa",
18
- "is_causal": true,
19
- "vocab_size": 152064,
20
- "segment_vocab_size": 0,
21
- "skip_init": true,
22
- "rope_rank": "updown",
23
- "max_position_embeddings": 32768,
24
- "sliding_window": 131072,
25
- "max_window_layers": 28,
26
- "convert_logits_dtype": "float32",
27
- "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643, 151645],
28
- "max_length": 32768}
 
 
 
 
 
 
 
 
29
  }
 
1
+ {
2
+ "model": "qwen2",
3
+ "hidden_act": "silu",
4
+ "attention_probs_dropout_prob": 0.0,
5
+ "bos_token_id": 151643,
6
+ "eos_token_id": 151645,
7
+ "intermediate_size": 18944,
8
+ "initializer_range": 0.02,
9
+ "layer_norm_eps": 1e-06,
10
+ "hidden_size": 3584,
11
+ "num_attention_heads": 28,
12
+ "num_hidden_layers": 28,
13
+ "num_key_value_heads": 4,
14
+ "rope_theta": 1000000.0,
15
+ "tie_word_embeddings": false,
16
+ "torch_dtype": "bfloat16",
17
+ "_attn_implementation": "sdpa",
18
+ "is_causal": true,
19
+ "vocab_size": 152064,
20
+ "segment_vocab_size": 0,
21
+ "skip_init": true,
22
+ "rope_rank": "updown",
23
+ "max_position_embeddings": 32768,
24
+ "sliding_window": 131072,
25
+ "max_window_layers": 28,
26
+ "convert_logits_dtype": "float32",
27
+ "generation_config": {
28
+ "tokenizer_config": {
29
+ "skip_special_tokens": true
30
+ },
31
+ "eos_token_id": [
32
+ 151643,
33
+ 151645
34
+ ],
35
+ "max_length": 32768
36
+ }
37
  }
Qwen/Qwen2-7B/bert4torch_config.json CHANGED
@@ -1,30 +1,37 @@
1
- {
2
- "model": "qwen2",
3
- "template": "pretrained_text_continuation",
4
- "hidden_act": "silu",
5
- "attention_probs_dropout_prob": 0.0,
6
- "bos_token_id": 151643,
7
- "eos_token_id": 151645,
8
- "intermediate_size": 18944,
9
- "initializer_range": 0.02,
10
- "layer_norm_eps": 1e-06,
11
- "hidden_size": 3584,
12
- "num_attention_heads": 28,
13
- "num_hidden_layers": 28,
14
- "num_key_value_heads": 4,
15
- "rope_theta": 1000000.0,
16
- "tie_word_embeddings": false,
17
- "torch_dtype": "bfloat16",
18
- "_attn_implementation": "sdpa",
19
- "is_causal": true,
20
- "vocab_size": 152064,
21
- "segment_vocab_size": 0,
22
- "skip_init": true,
23
- "rope_rank": "updown",
24
- "max_position_embeddings": 32768,
25
- "sliding_window": 131072,
26
- "max_window_layers": 28,
27
- "convert_logits_dtype": "float32",
28
- "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643],
29
- "max_length": 32768}
 
 
 
 
 
 
 
30
  }
 
1
+ {
2
+ "model": "qwen2",
3
+ "template": "pretrained_text_continuation",
4
+ "hidden_act": "silu",
5
+ "attention_probs_dropout_prob": 0.0,
6
+ "bos_token_id": 151643,
7
+ "eos_token_id": 151645,
8
+ "intermediate_size": 18944,
9
+ "initializer_range": 0.02,
10
+ "layer_norm_eps": 1e-06,
11
+ "hidden_size": 3584,
12
+ "num_attention_heads": 28,
13
+ "num_hidden_layers": 28,
14
+ "num_key_value_heads": 4,
15
+ "rope_theta": 1000000.0,
16
+ "tie_word_embeddings": false,
17
+ "torch_dtype": "bfloat16",
18
+ "_attn_implementation": "sdpa",
19
+ "is_causal": true,
20
+ "vocab_size": 152064,
21
+ "segment_vocab_size": 0,
22
+ "skip_init": true,
23
+ "rope_rank": "updown",
24
+ "max_position_embeddings": 32768,
25
+ "sliding_window": 131072,
26
+ "max_window_layers": 28,
27
+ "convert_logits_dtype": "float32",
28
+ "generation_config": {
29
+ "tokenizer_config": {
30
+ "skip_special_tokens": true
31
+ },
32
+ "eos_token_id": [
33
+ 151643
34
+ ],
35
+ "max_length": 32768
36
+ }
37
  }
Qwen/Qwen2-VL-2B-Instruct/bert4torch_config.json CHANGED
@@ -27,38 +27,42 @@
27
  "rope_rank": "updown",
28
  "convert_logits_dtype": "float32",
29
  "generation_config": {
30
- "tokenizer_config": {"skip_special_tokens": true},
31
- "eos_token_id": [151643, 151645],
32
- "max_length": 32768,
33
- "repetition_penalty": 1.0,
34
- "temperature": 0.01,
35
- "top_p": 0.001,
36
- "top_k": 1
 
 
 
 
 
37
  },
38
-
39
  "vision_start_token_id": 151652,
40
  "vision_end_token_id": 151653,
41
  "vision_token_id": 151654,
42
  "vision_config": {
43
- "depth": 32,
44
- "embed_dim": 1280,
45
- "mlp_ratio": 4,
46
- "num_heads": 16,
47
- "in_chans": 3,
48
- "hidden_size": 1536,
49
- "patch_size": 14,
50
- "spatial_merge_size": 2,
51
- "spatial_patch_size": 14,
52
- "temporal_patch_size": 2,
53
- "_attn_implementation_internal": null
54
  },
55
  "rope_scaling": {
56
- "type": "mrope",
57
- "mrope_section": [
58
- 16,
59
- 24,
60
- 24
61
- ]
62
  },
63
  "vocab_size": 151936
64
- }
 
27
  "rope_rank": "updown",
28
  "convert_logits_dtype": "float32",
29
  "generation_config": {
30
+ "tokenizer_config": {
31
+ "skip_special_tokens": true
32
+ },
33
+ "eos_token_id": [
34
+ 151643,
35
+ 151645
36
+ ],
37
+ "max_length": 32768,
38
+ "repetition_penalty": 1.0,
39
+ "temperature": 0.01,
40
+ "top_p": 0.001,
41
+ "top_k": 1
42
  },
 
43
  "vision_start_token_id": 151652,
44
  "vision_end_token_id": 151653,
45
  "vision_token_id": 151654,
46
  "vision_config": {
47
+ "depth": 32,
48
+ "embed_dim": 1280,
49
+ "mlp_ratio": 4,
50
+ "num_heads": 16,
51
+ "in_chans": 3,
52
+ "hidden_size": 1536,
53
+ "patch_size": 14,
54
+ "spatial_merge_size": 2,
55
+ "spatial_patch_size": 14,
56
+ "temporal_patch_size": 2,
57
+ "_attn_implementation_internal": null
58
  },
59
  "rope_scaling": {
60
+ "type": "mrope",
61
+ "mrope_section": [
62
+ 16,
63
+ 24,
64
+ 24
65
+ ]
66
  },
67
  "vocab_size": 151936
68
+ }
Qwen/Qwen2-VL-7B-Instruct/bert4torch_config.json CHANGED
@@ -1,63 +1,67 @@
1
- {
2
- "model": "qwen2_vl",
3
- "attention_dropout": 0.0,
4
- "bos_token_id": 151643,
5
- "eos_token_id": 151645,
6
- "image_token_id": 151655,
7
- "video_token_id": 151656,
8
- "hidden_act": "silu",
9
- "hidden_size": 3584,
10
- "initializer_range": 0.02,
11
- "intermediate_size": 18944,
12
- "max_position_embeddings": 32768,
13
- "max_window_layers": 28,
14
- "num_attention_heads": 28,
15
- "num_hidden_layers": 28,
16
- "num_key_value_heads": 4,
17
- "layer_norm_eps": 1e-06,
18
- "rope_theta": 1000000.0,
19
- "sliding_window": 131072,
20
- "tie_word_embeddings": false,
21
- "torch_dtype": "bfloat16",
22
- "_attn_implementation": "sdpa",
23
- "use_sliding_window": false,
24
- "skip_init": true,
25
- "segment_vocab_size": 0,
26
- "rope_rank": "updown",
27
- "convert_logits_dtype": "float32",
28
- "generation_config": {
29
- "tokenizer_config": {"skip_special_tokens": true},
30
- "eos_token_id": [151643, 151645],
31
- "max_length": 32768,
32
- "repetition_penalty": 1.0,
33
- "temperature": 0.01,
34
- "top_p": 0.001,
35
- "top_k": 1
36
- },
37
-
38
- "vision_start_token_id": 151652,
39
- "vision_end_token_id": 151653,
40
- "vision_token_id": 151654,
41
- "vision_config": {
42
- "depth": 32,
43
- "embed_dim": 1280,
44
- "mlp_ratio": 4,
45
- "num_heads": 16,
46
- "in_chans": 3,
47
- "hidden_size": 3584,
48
- "patch_size": 14,
49
- "spatial_merge_size": 2,
50
- "spatial_patch_size": 14,
51
- "temporal_patch_size": 2,
52
- "_attn_implementation_internal": null
53
- },
54
- "rope_scaling": {
55
- "type": "mrope",
56
- "mrope_section": [
57
- 16,
58
- 24,
59
- 24
60
- ]
61
- },
62
- "vocab_size": 152064
63
- }
 
 
 
 
 
1
+ {
2
+ "model": "qwen2_vl",
3
+ "attention_dropout": 0.0,
4
+ "bos_token_id": 151643,
5
+ "eos_token_id": 151645,
6
+ "image_token_id": 151655,
7
+ "video_token_id": 151656,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 3584,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 18944,
12
+ "max_position_embeddings": 32768,
13
+ "max_window_layers": 28,
14
+ "num_attention_heads": 28,
15
+ "num_hidden_layers": 28,
16
+ "num_key_value_heads": 4,
17
+ "layer_norm_eps": 1e-06,
18
+ "rope_theta": 1000000.0,
19
+ "sliding_window": 131072,
20
+ "tie_word_embeddings": false,
21
+ "torch_dtype": "bfloat16",
22
+ "_attn_implementation": "sdpa",
23
+ "use_sliding_window": false,
24
+ "skip_init": true,
25
+ "segment_vocab_size": 0,
26
+ "rope_rank": "updown",
27
+ "convert_logits_dtype": "float32",
28
+ "generation_config": {
29
+ "tokenizer_config": {
30
+ "skip_special_tokens": true
31
+ },
32
+ "eos_token_id": [
33
+ 151643,
34
+ 151645
35
+ ],
36
+ "max_length": 32768,
37
+ "repetition_penalty": 1.0,
38
+ "temperature": 0.01,
39
+ "top_p": 0.001,
40
+ "top_k": 1
41
+ },
42
+ "vision_start_token_id": 151652,
43
+ "vision_end_token_id": 151653,
44
+ "vision_token_id": 151654,
45
+ "vision_config": {
46
+ "depth": 32,
47
+ "embed_dim": 1280,
48
+ "mlp_ratio": 4,
49
+ "num_heads": 16,
50
+ "in_chans": 3,
51
+ "hidden_size": 3584,
52
+ "patch_size": 14,
53
+ "spatial_merge_size": 2,
54
+ "spatial_patch_size": 14,
55
+ "temporal_patch_size": 2,
56
+ "_attn_implementation_internal": null
57
+ },
58
+ "rope_scaling": {
59
+ "type": "mrope",
60
+ "mrope_section": [
61
+ 16,
62
+ 24,
63
+ 24
64
+ ]
65
+ },
66
+ "vocab_size": 152064
67
+ }