KublaiKhan1 committed on
Commit
bba7691
·
verified ·
1 Parent(s): 744d283

Delete limo_filtered_correct

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. limo_filtered_correct/README.md +0 -59
  2. limo_filtered_correct/added_tokens.json +0 -24
  3. limo_filtered_correct/all_results.json +0 -8
  4. limo_filtered_correct/chat_template.jinja +0 -54
  5. limo_filtered_correct/checkpoint-1141/added_tokens.json +0 -24
  6. limo_filtered_correct/checkpoint-1141/chat_template.jinja +0 -54
  7. limo_filtered_correct/checkpoint-1141/config.json +0 -58
  8. limo_filtered_correct/checkpoint-1141/generation_config.json +0 -9
  9. limo_filtered_correct/checkpoint-1141/merges.txt +0 -0
  10. limo_filtered_correct/checkpoint-1141/model.safetensors.index.json +0 -347
  11. limo_filtered_correct/checkpoint-1141/special_tokens_map.json +0 -31
  12. limo_filtered_correct/checkpoint-1141/tokenizer_config.json +0 -208
  13. limo_filtered_correct/checkpoint-1141/trainer_state.json +0 -0
  14. limo_filtered_correct/checkpoint-1141/vocab.json +0 -0
  15. limo_filtered_correct/checkpoint-1304/added_tokens.json +0 -24
  16. limo_filtered_correct/checkpoint-1304/chat_template.jinja +0 -54
  17. limo_filtered_correct/checkpoint-1304/config.json +0 -58
  18. limo_filtered_correct/checkpoint-1304/generation_config.json +0 -9
  19. limo_filtered_correct/checkpoint-1304/merges.txt +0 -0
  20. limo_filtered_correct/checkpoint-1304/model.safetensors.index.json +0 -347
  21. limo_filtered_correct/checkpoint-1304/special_tokens_map.json +0 -31
  22. limo_filtered_correct/checkpoint-1304/tokenizer_config.json +0 -208
  23. limo_filtered_correct/checkpoint-1304/trainer_state.json +0 -0
  24. limo_filtered_correct/checkpoint-1304/vocab.json +0 -0
  25. limo_filtered_correct/checkpoint-1467/added_tokens.json +0 -24
  26. limo_filtered_correct/checkpoint-1467/chat_template.jinja +0 -54
  27. limo_filtered_correct/checkpoint-1467/config.json +0 -58
  28. limo_filtered_correct/checkpoint-1467/generation_config.json +0 -9
  29. limo_filtered_correct/checkpoint-1467/merges.txt +0 -0
  30. limo_filtered_correct/checkpoint-1467/model.safetensors.index.json +0 -347
  31. limo_filtered_correct/checkpoint-1467/special_tokens_map.json +0 -31
  32. limo_filtered_correct/checkpoint-1467/tokenizer_config.json +0 -208
  33. limo_filtered_correct/checkpoint-1467/trainer_state.json +0 -0
  34. limo_filtered_correct/checkpoint-1467/vocab.json +0 -0
  35. limo_filtered_correct/checkpoint-163/added_tokens.json +0 -24
  36. limo_filtered_correct/checkpoint-163/chat_template.jinja +0 -54
  37. limo_filtered_correct/checkpoint-163/config.json +0 -58
  38. limo_filtered_correct/checkpoint-163/generation_config.json +0 -9
  39. limo_filtered_correct/checkpoint-163/merges.txt +0 -0
  40. limo_filtered_correct/checkpoint-163/model.safetensors.index.json +0 -347
  41. limo_filtered_correct/checkpoint-163/special_tokens_map.json +0 -31
  42. limo_filtered_correct/checkpoint-163/tokenizer_config.json +0 -208
  43. limo_filtered_correct/checkpoint-163/trainer_state.json +0 -1175
  44. limo_filtered_correct/checkpoint-163/vocab.json +0 -0
  45. limo_filtered_correct/checkpoint-1630/added_tokens.json +0 -24
  46. limo_filtered_correct/checkpoint-1630/chat_template.jinja +0 -54
  47. limo_filtered_correct/checkpoint-1630/config.json +0 -58
  48. limo_filtered_correct/checkpoint-1630/generation_config.json +0 -9
  49. limo_filtered_correct/checkpoint-1630/merges.txt +0 -0
  50. limo_filtered_correct/checkpoint-1630/model.safetensors.index.json +0 -347
limo_filtered_correct/README.md DELETED
@@ -1,59 +0,0 @@
1
- ---
2
- library_name: transformers
3
- license: other
4
- base_model: Qwen/Qwen2.5-Math-7B-Instruct
5
- tags:
6
- - llama-factory
7
- - full
8
- - generated_from_trainer
9
- model-index:
10
- - name: limo_filtered_correct
11
- results: []
12
- ---
13
-
14
- <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
- should probably proofread and complete it, then remove this comment. -->
16
-
17
- # limo_filtered_correct
18
-
19
- This model is a fine-tuned version of [Qwen/Qwen2.5-Math-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct) on the limo_filtered_correct dataset.
20
-
21
- ## Model description
22
-
23
- More information needed
24
-
25
- ## Intended uses & limitations
26
-
27
- More information needed
28
-
29
- ## Training and evaluation data
30
-
31
- More information needed
32
-
33
- ## Training procedure
34
-
35
- ### Training hyperparameters
36
-
37
- The following hyperparameters were used during training:
38
- - learning_rate: 5e-06
39
- - train_batch_size: 1
40
- - eval_batch_size: 8
41
- - seed: 42
42
- - distributed_type: multi-GPU
43
- - num_devices: 4
44
- - total_train_batch_size: 4
45
- - total_eval_batch_size: 32
46
- - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
47
- - lr_scheduler_type: cosine
48
- - num_epochs: 10
49
-
50
- ### Training results
51
-
52
-
53
-
54
- ### Framework versions
55
-
56
- - Transformers 4.55.0
57
- - Pytorch 2.5.1+cu124
58
- - Datasets 3.6.0
59
- - Tokenizers 0.21.1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/added_tokens.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "</tool_call>": 151658,
3
- "<tool_call>": 151657,
4
- "<|box_end|>": 151649,
5
- "<|box_start|>": 151648,
6
- "<|endoftext|>": 151643,
7
- "<|file_sep|>": 151664,
8
- "<|fim_middle|>": 151660,
9
- "<|fim_pad|>": 151662,
10
- "<|fim_prefix|>": 151659,
11
- "<|fim_suffix|>": 151661,
12
- "<|im_end|>": 151645,
13
- "<|im_start|>": 151644,
14
- "<|image_pad|>": 151655,
15
- "<|object_ref_end|>": 151647,
16
- "<|object_ref_start|>": 151646,
17
- "<|quad_end|>": 151651,
18
- "<|quad_start|>": 151650,
19
- "<|repo_name|>": 151663,
20
- "<|video_pad|>": 151656,
21
- "<|vision_end|>": 151653,
22
- "<|vision_pad|>": 151654,
23
- "<|vision_start|>": 151652
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/all_results.json DELETED
@@ -1,8 +0,0 @@
1
- {
2
- "epoch": 10.0,
3
- "total_flos": 4.164325545607168e+17,
4
- "train_loss": 0.32652053014243854,
5
- "train_runtime": 6369.4859,
6
- "train_samples_per_second": 1.022,
7
- "train_steps_per_second": 0.256
8
- }
 
 
 
 
 
 
 
 
 
limo_filtered_correct/chat_template.jinja DELETED
@@ -1,54 +0,0 @@
1
- {%- if tools %}
2
- {{- '<|im_start|>system\n' }}
3
- {%- if messages[0]['role'] == 'system' %}
4
- {{- messages[0]['content'] }}
5
- {%- else %}
6
- {{- 'Please reason step by step, and put your final answer within \\boxed{}.' }}
7
- {%- endif %}
8
- {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
- {%- for tool in tools %}
10
- {{- "\n" }}
11
- {{- tool | tojson }}
12
- {%- endfor %}
13
- {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
- {%- else %}
15
- {%- if messages[0]['role'] == 'system' %}
16
- {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
- {%- else %}
18
- {{- '<|im_start|>system\nPlease reason step by step, and put your final answer within \\boxed{}.<|im_end|>\n' }}
19
- {%- endif %}
20
- {%- endif %}
21
- {%- for message in messages %}
22
- {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
- {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
- {%- elif message.role == "assistant" %}
25
- {{- '<|im_start|>' + message.role }}
26
- {%- if message.content %}
27
- {{- '\n' + message.content }}
28
- {%- endif %}
29
- {%- for tool_call in message.tool_calls %}
30
- {%- if tool_call.function is defined %}
31
- {%- set tool_call = tool_call.function %}
32
- {%- endif %}
33
- {{- '\n<tool_call>\n{"name": "' }}
34
- {{- tool_call.name }}
35
- {{- '", "arguments": ' }}
36
- {{- tool_call.arguments | tojson }}
37
- {{- '}\n</tool_call>' }}
38
- {%- endfor %}
39
- {{- '<|im_end|>\n' }}
40
- {%- elif message.role == "tool" %}
41
- {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
- {{- '<|im_start|>user' }}
43
- {%- endif %}
44
- {{- '\n<tool_response>\n' }}
45
- {{- message.content }}
46
- {{- '\n</tool_response>' }}
47
- {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
- {{- '<|im_end|>\n' }}
49
- {%- endif %}
50
- {%- endif %}
51
- {%- endfor %}
52
- {%- if add_generation_prompt %}
53
- {{- '<|im_start|>assistant\n' }}
54
- {%- endif %}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1141/added_tokens.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "</tool_call>": 151658,
3
- "<tool_call>": 151657,
4
- "<|box_end|>": 151649,
5
- "<|box_start|>": 151648,
6
- "<|endoftext|>": 151643,
7
- "<|file_sep|>": 151664,
8
- "<|fim_middle|>": 151660,
9
- "<|fim_pad|>": 151662,
10
- "<|fim_prefix|>": 151659,
11
- "<|fim_suffix|>": 151661,
12
- "<|im_end|>": 151645,
13
- "<|im_start|>": 151644,
14
- "<|image_pad|>": 151655,
15
- "<|object_ref_end|>": 151647,
16
- "<|object_ref_start|>": 151646,
17
- "<|quad_end|>": 151651,
18
- "<|quad_start|>": 151650,
19
- "<|repo_name|>": 151663,
20
- "<|video_pad|>": 151656,
21
- "<|vision_end|>": 151653,
22
- "<|vision_pad|>": 151654,
23
- "<|vision_start|>": 151652
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1141/chat_template.jinja DELETED
@@ -1,54 +0,0 @@
1
- {%- if tools %}
2
- {{- '<|im_start|>system\n' }}
3
- {%- if messages[0]['role'] == 'system' %}
4
- {{- messages[0]['content'] }}
5
- {%- else %}
6
- {{- 'Please reason step by step, and put your final answer within \\boxed{}.' }}
7
- {%- endif %}
8
- {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
- {%- for tool in tools %}
10
- {{- "\n" }}
11
- {{- tool | tojson }}
12
- {%- endfor %}
13
- {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
- {%- else %}
15
- {%- if messages[0]['role'] == 'system' %}
16
- {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
- {%- else %}
18
- {{- '<|im_start|>system\nPlease reason step by step, and put your final answer within \\boxed{}.<|im_end|>\n' }}
19
- {%- endif %}
20
- {%- endif %}
21
- {%- for message in messages %}
22
- {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
- {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
- {%- elif message.role == "assistant" %}
25
- {{- '<|im_start|>' + message.role }}
26
- {%- if message.content %}
27
- {{- '\n' + message.content }}
28
- {%- endif %}
29
- {%- for tool_call in message.tool_calls %}
30
- {%- if tool_call.function is defined %}
31
- {%- set tool_call = tool_call.function %}
32
- {%- endif %}
33
- {{- '\n<tool_call>\n{"name": "' }}
34
- {{- tool_call.name }}
35
- {{- '", "arguments": ' }}
36
- {{- tool_call.arguments | tojson }}
37
- {{- '}\n</tool_call>' }}
38
- {%- endfor %}
39
- {{- '<|im_end|>\n' }}
40
- {%- elif message.role == "tool" %}
41
- {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
- {{- '<|im_start|>user' }}
43
- {%- endif %}
44
- {{- '\n<tool_response>\n' }}
45
- {{- message.content }}
46
- {{- '\n</tool_response>' }}
47
- {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
- {{- '<|im_end|>\n' }}
49
- {%- endif %}
50
- {%- endif %}
51
- {%- endfor %}
52
- {%- if add_generation_prompt %}
53
- {{- '<|im_start|>assistant\n' }}
54
- {%- endif %}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1141/config.json DELETED
@@ -1,58 +0,0 @@
1
- {
2
- "architectures": [
3
- "Qwen2ForCausalLM"
4
- ],
5
- "attention_dropout": 0.0,
6
- "bos_token_id": 151643,
7
- "eos_token_id": 151645,
8
- "hidden_act": "silu",
9
- "hidden_size": 3584,
10
- "initializer_range": 0.02,
11
- "intermediate_size": 18944,
12
- "layer_types": [
13
- "full_attention",
14
- "full_attention",
15
- "full_attention",
16
- "full_attention",
17
- "full_attention",
18
- "full_attention",
19
- "full_attention",
20
- "full_attention",
21
- "full_attention",
22
- "full_attention",
23
- "full_attention",
24
- "full_attention",
25
- "full_attention",
26
- "full_attention",
27
- "full_attention",
28
- "full_attention",
29
- "full_attention",
30
- "full_attention",
31
- "full_attention",
32
- "full_attention",
33
- "full_attention",
34
- "full_attention",
35
- "full_attention",
36
- "full_attention",
37
- "full_attention",
38
- "full_attention",
39
- "full_attention",
40
- "full_attention"
41
- ],
42
- "max_position_embeddings": 4096,
43
- "max_window_layers": 28,
44
- "model_type": "qwen2",
45
- "num_attention_heads": 28,
46
- "num_hidden_layers": 28,
47
- "num_key_value_heads": 4,
48
- "rms_norm_eps": 1e-06,
49
- "rope_scaling": null,
50
- "rope_theta": 10000.0,
51
- "sliding_window": null,
52
- "tie_word_embeddings": false,
53
- "torch_dtype": "float32",
54
- "transformers_version": "4.55.0",
55
- "use_cache": false,
56
- "use_sliding_window": false,
57
- "vocab_size": 152064
58
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1141/generation_config.json DELETED
@@ -1,9 +0,0 @@
1
- {
2
- "bos_token_id": 151643,
3
- "eos_token_id": [
4
- 151645,
5
- 151643
6
- ],
7
- "pad_token_id": 151643,
8
- "transformers_version": "4.55.0"
9
- }
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1141/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
limo_filtered_correct/checkpoint-1141/model.safetensors.index.json DELETED
@@ -1,347 +0,0 @@
1
- {
2
- "metadata": {
3
- "total_parameters": 1903904128,
4
- "total_size": 30462466048
5
- },
6
- "weight_map": {
7
- "lm_head.weight": "model-00007-of-00007.safetensors",
8
- "model.embed_tokens.weight": "model-00001-of-00007.safetensors",
9
- "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors",
10
- "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
11
- "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
12
- "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
13
- "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
14
- "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
15
- "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
16
- "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
17
- "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
18
- "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
19
- "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
20
- "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
21
- "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors",
22
- "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
23
- "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
24
- "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
25
- "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
26
- "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
27
- "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
28
- "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
29
- "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
30
- "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
31
- "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
32
- "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
33
- "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors",
34
- "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
35
- "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
36
- "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
37
- "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
38
- "model.layers.10.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
39
- "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
40
- "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
41
- "model.layers.10.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
42
- "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
43
- "model.layers.10.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
44
- "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
45
- "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors",
46
- "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
47
- "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
48
- "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
49
- "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
50
- "model.layers.11.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
51
- "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
52
- "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
53
- "model.layers.11.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
54
- "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
55
- "model.layers.11.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
56
- "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
57
- "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors",
58
- "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
59
- "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
60
- "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
61
- "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
62
- "model.layers.12.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
63
- "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
64
- "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
65
- "model.layers.12.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
66
- "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
67
- "model.layers.12.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
68
- "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
69
- "model.layers.13.input_layernorm.weight": "model-00004-of-00007.safetensors",
70
- "model.layers.13.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
71
- "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
72
- "model.layers.13.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
73
- "model.layers.13.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
74
- "model.layers.13.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
75
- "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
76
- "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
77
- "model.layers.13.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
78
- "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
79
- "model.layers.13.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
80
- "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
81
- "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors",
82
- "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
83
- "model.layers.14.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
84
- "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
85
- "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
86
- "model.layers.14.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
87
- "model.layers.14.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
88
- "model.layers.14.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
89
- "model.layers.14.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
90
- "model.layers.14.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
91
- "model.layers.14.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
92
- "model.layers.14.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
93
- "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors",
94
- "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
95
- "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
96
- "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
97
- "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
98
- "model.layers.15.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
99
- "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
100
- "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
101
- "model.layers.15.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
102
- "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
103
- "model.layers.15.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
104
- "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
105
- "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors",
106
- "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
107
- "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
108
- "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
109
- "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
110
- "model.layers.16.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
111
- "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
112
- "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
113
- "model.layers.16.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
114
- "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
115
- "model.layers.16.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
116
- "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
117
- "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors",
118
- "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
119
- "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
120
- "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
121
- "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
122
- "model.layers.17.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
123
- "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
124
- "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
125
- "model.layers.17.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
126
- "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
127
- "model.layers.17.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
128
- "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
129
- "model.layers.18.input_layernorm.weight": "model-00005-of-00007.safetensors",
130
- "model.layers.18.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
131
- "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
132
- "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
133
- "model.layers.18.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
134
- "model.layers.18.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
135
- "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
136
- "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
137
- "model.layers.18.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
138
- "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
139
- "model.layers.18.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
140
- "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
141
- "model.layers.19.input_layernorm.weight": "model-00005-of-00007.safetensors",
142
- "model.layers.19.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
143
- "model.layers.19.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
144
- "model.layers.19.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
145
- "model.layers.19.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
146
- "model.layers.19.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
147
- "model.layers.19.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
148
- "model.layers.19.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
149
- "model.layers.19.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
150
- "model.layers.19.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
151
- "model.layers.19.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
152
- "model.layers.19.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
153
- "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors",
154
- "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
155
- "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
156
- "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
157
- "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
158
- "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
159
- "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
160
- "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
161
- "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
162
- "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
163
- "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
164
- "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
165
- "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors",
166
- "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
167
- "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
168
- "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
169
- "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
170
- "model.layers.20.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
171
- "model.layers.20.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
172
- "model.layers.20.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
173
- "model.layers.20.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
174
- "model.layers.20.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
175
- "model.layers.20.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
176
- "model.layers.20.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
177
- "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors",
178
- "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
179
- "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
180
- "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
181
- "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
182
- "model.layers.21.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
183
- "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
184
- "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
185
- "model.layers.21.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
186
- "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
187
- "model.layers.21.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
188
- "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
189
- "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors",
190
- "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
191
- "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
192
- "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
193
- "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
194
- "model.layers.22.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
195
- "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
196
- "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
197
- "model.layers.22.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
198
- "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
199
- "model.layers.22.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
200
- "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
201
- "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors",
202
- "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
203
- "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
204
- "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
205
- "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
206
- "model.layers.23.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
207
- "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
208
- "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
209
- "model.layers.23.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
210
- "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
211
- "model.layers.23.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
212
- "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
213
- "model.layers.24.input_layernorm.weight": "model-00006-of-00007.safetensors",
214
- "model.layers.24.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
215
- "model.layers.24.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
216
- "model.layers.24.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
217
- "model.layers.24.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
218
- "model.layers.24.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
219
- "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
220
- "model.layers.24.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
221
- "model.layers.24.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
222
- "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
223
- "model.layers.24.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
224
- "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
225
- "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors",
226
- "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
227
- "model.layers.25.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
228
- "model.layers.25.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
229
- "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
230
- "model.layers.25.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
231
- "model.layers.25.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
232
- "model.layers.25.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
233
- "model.layers.25.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
234
- "model.layers.25.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
235
- "model.layers.25.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
236
- "model.layers.25.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
237
- "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors",
238
- "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
239
- "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
240
- "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
241
- "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
242
- "model.layers.26.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
243
- "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
244
- "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
245
- "model.layers.26.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
246
- "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
247
- "model.layers.26.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
248
- "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
249
- "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors",
250
- "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
251
- "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
252
- "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
253
- "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
254
- "model.layers.27.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
255
- "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
256
- "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
257
- "model.layers.27.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
258
- "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
259
- "model.layers.27.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
260
- "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
261
- "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors",
262
- "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
263
- "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
264
- "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
265
- "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
266
- "model.layers.3.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
267
- "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
268
- "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
269
- "model.layers.3.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
270
- "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
271
- "model.layers.3.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
272
- "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
273
- "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors",
274
- "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
275
- "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
276
- "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
277
- "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
278
- "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
279
- "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
280
- "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
281
- "model.layers.4.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
282
- "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
283
- "model.layers.4.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
284
- "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
285
- "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors",
286
- "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
287
- "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
288
- "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
289
- "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
290
- "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
291
- "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
292
- "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
293
- "model.layers.5.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
294
- "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
295
- "model.layers.5.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
296
- "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
297
- "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors",
298
- "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
299
- "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
300
- "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
301
- "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
302
- "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
303
- "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
304
- "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
305
- "model.layers.6.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
306
- "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
307
- "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
308
- "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
309
- "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors",
310
- "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
311
- "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
312
- "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
313
- "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
314
- "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
315
- "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
316
- "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
317
- "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
318
- "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
319
- "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
320
- "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
321
- "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors",
322
- "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
323
- "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
324
- "model.layers.8.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
325
- "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
326
- "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
327
- "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
328
- "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
329
- "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
330
- "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
331
- "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
332
- "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
333
- "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors",
334
- "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
335
- "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
336
- "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
337
- "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
338
- "model.layers.9.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
339
- "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
340
- "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
341
- "model.layers.9.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
342
- "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
343
- "model.layers.9.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
344
- "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
345
- "model.norm.weight": "model-00006-of-00007.safetensors"
346
- }
347
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1141/special_tokens_map.json DELETED
@@ -1,31 +0,0 @@
1
- {
2
- "additional_special_tokens": [
3
- "<|im_start|>",
4
- "<|im_end|>",
5
- "<|object_ref_start|>",
6
- "<|object_ref_end|>",
7
- "<|box_start|>",
8
- "<|box_end|>",
9
- "<|quad_start|>",
10
- "<|quad_end|>",
11
- "<|vision_start|>",
12
- "<|vision_end|>",
13
- "<|vision_pad|>",
14
- "<|image_pad|>",
15
- "<|video_pad|>"
16
- ],
17
- "eos_token": {
18
- "content": "<|im_end|>",
19
- "lstrip": false,
20
- "normalized": false,
21
- "rstrip": false,
22
- "single_word": false
23
- },
24
- "pad_token": {
25
- "content": "<|endoftext|>",
26
- "lstrip": false,
27
- "normalized": false,
28
- "rstrip": false,
29
- "single_word": false
30
- }
31
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1141/tokenizer_config.json DELETED
@@ -1,208 +0,0 @@
1
- {
2
- "add_bos_token": false,
3
- "add_prefix_space": false,
4
- "added_tokens_decoder": {
5
- "151643": {
6
- "content": "<|endoftext|>",
7
- "lstrip": false,
8
- "normalized": false,
9
- "rstrip": false,
10
- "single_word": false,
11
- "special": true
12
- },
13
- "151644": {
14
- "content": "<|im_start|>",
15
- "lstrip": false,
16
- "normalized": false,
17
- "rstrip": false,
18
- "single_word": false,
19
- "special": true
20
- },
21
- "151645": {
22
- "content": "<|im_end|>",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false,
27
- "special": true
28
- },
29
- "151646": {
30
- "content": "<|object_ref_start|>",
31
- "lstrip": false,
32
- "normalized": false,
33
- "rstrip": false,
34
- "single_word": false,
35
- "special": true
36
- },
37
- "151647": {
38
- "content": "<|object_ref_end|>",
39
- "lstrip": false,
40
- "normalized": false,
41
- "rstrip": false,
42
- "single_word": false,
43
- "special": true
44
- },
45
- "151648": {
46
- "content": "<|box_start|>",
47
- "lstrip": false,
48
- "normalized": false,
49
- "rstrip": false,
50
- "single_word": false,
51
- "special": true
52
- },
53
- "151649": {
54
- "content": "<|box_end|>",
55
- "lstrip": false,
56
- "normalized": false,
57
- "rstrip": false,
58
- "single_word": false,
59
- "special": true
60
- },
61
- "151650": {
62
- "content": "<|quad_start|>",
63
- "lstrip": false,
64
- "normalized": false,
65
- "rstrip": false,
66
- "single_word": false,
67
- "special": true
68
- },
69
- "151651": {
70
- "content": "<|quad_end|>",
71
- "lstrip": false,
72
- "normalized": false,
73
- "rstrip": false,
74
- "single_word": false,
75
- "special": true
76
- },
77
- "151652": {
78
- "content": "<|vision_start|>",
79
- "lstrip": false,
80
- "normalized": false,
81
- "rstrip": false,
82
- "single_word": false,
83
- "special": true
84
- },
85
- "151653": {
86
- "content": "<|vision_end|>",
87
- "lstrip": false,
88
- "normalized": false,
89
- "rstrip": false,
90
- "single_word": false,
91
- "special": true
92
- },
93
- "151654": {
94
- "content": "<|vision_pad|>",
95
- "lstrip": false,
96
- "normalized": false,
97
- "rstrip": false,
98
- "single_word": false,
99
- "special": true
100
- },
101
- "151655": {
102
- "content": "<|image_pad|>",
103
- "lstrip": false,
104
- "normalized": false,
105
- "rstrip": false,
106
- "single_word": false,
107
- "special": true
108
- },
109
- "151656": {
110
- "content": "<|video_pad|>",
111
- "lstrip": false,
112
- "normalized": false,
113
- "rstrip": false,
114
- "single_word": false,
115
- "special": true
116
- },
117
- "151657": {
118
- "content": "<tool_call>",
119
- "lstrip": false,
120
- "normalized": false,
121
- "rstrip": false,
122
- "single_word": false,
123
- "special": false
124
- },
125
- "151658": {
126
- "content": "</tool_call>",
127
- "lstrip": false,
128
- "normalized": false,
129
- "rstrip": false,
130
- "single_word": false,
131
- "special": false
132
- },
133
- "151659": {
134
- "content": "<|fim_prefix|>",
135
- "lstrip": false,
136
- "normalized": false,
137
- "rstrip": false,
138
- "single_word": false,
139
- "special": false
140
- },
141
- "151660": {
142
- "content": "<|fim_middle|>",
143
- "lstrip": false,
144
- "normalized": false,
145
- "rstrip": false,
146
- "single_word": false,
147
- "special": false
148
- },
149
- "151661": {
150
- "content": "<|fim_suffix|>",
151
- "lstrip": false,
152
- "normalized": false,
153
- "rstrip": false,
154
- "single_word": false,
155
- "special": false
156
- },
157
- "151662": {
158
- "content": "<|fim_pad|>",
159
- "lstrip": false,
160
- "normalized": false,
161
- "rstrip": false,
162
- "single_word": false,
163
- "special": false
164
- },
165
- "151663": {
166
- "content": "<|repo_name|>",
167
- "lstrip": false,
168
- "normalized": false,
169
- "rstrip": false,
170
- "single_word": false,
171
- "special": false
172
- },
173
- "151664": {
174
- "content": "<|file_sep|>",
175
- "lstrip": false,
176
- "normalized": false,
177
- "rstrip": false,
178
- "single_word": false,
179
- "special": false
180
- }
181
- },
182
- "additional_special_tokens": [
183
- "<|im_start|>",
184
- "<|im_end|>",
185
- "<|object_ref_start|>",
186
- "<|object_ref_end|>",
187
- "<|box_start|>",
188
- "<|box_end|>",
189
- "<|quad_start|>",
190
- "<|quad_end|>",
191
- "<|vision_start|>",
192
- "<|vision_end|>",
193
- "<|vision_pad|>",
194
- "<|image_pad|>",
195
- "<|video_pad|>"
196
- ],
197
- "bos_token": null,
198
- "clean_up_tokenization_spaces": false,
199
- "eos_token": "<|im_end|>",
200
- "errors": "replace",
201
- "extra_special_tokens": {},
202
- "model_max_length": 131072,
203
- "pad_token": "<|endoftext|>",
204
- "padding_side": "right",
205
- "split_special_tokens": false,
206
- "tokenizer_class": "Qwen2Tokenizer",
207
- "unk_token": null
208
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1141/trainer_state.json DELETED
The diff for this file is too large to render. See raw diff
 
limo_filtered_correct/checkpoint-1141/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
limo_filtered_correct/checkpoint-1304/added_tokens.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "</tool_call>": 151658,
3
- "<tool_call>": 151657,
4
- "<|box_end|>": 151649,
5
- "<|box_start|>": 151648,
6
- "<|endoftext|>": 151643,
7
- "<|file_sep|>": 151664,
8
- "<|fim_middle|>": 151660,
9
- "<|fim_pad|>": 151662,
10
- "<|fim_prefix|>": 151659,
11
- "<|fim_suffix|>": 151661,
12
- "<|im_end|>": 151645,
13
- "<|im_start|>": 151644,
14
- "<|image_pad|>": 151655,
15
- "<|object_ref_end|>": 151647,
16
- "<|object_ref_start|>": 151646,
17
- "<|quad_end|>": 151651,
18
- "<|quad_start|>": 151650,
19
- "<|repo_name|>": 151663,
20
- "<|video_pad|>": 151656,
21
- "<|vision_end|>": 151653,
22
- "<|vision_pad|>": 151654,
23
- "<|vision_start|>": 151652
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1304/chat_template.jinja DELETED
@@ -1,54 +0,0 @@
1
- {%- if tools %}
2
- {{- '<|im_start|>system\n' }}
3
- {%- if messages[0]['role'] == 'system' %}
4
- {{- messages[0]['content'] }}
5
- {%- else %}
6
- {{- 'Please reason step by step, and put your final answer within \\boxed{}.' }}
7
- {%- endif %}
8
- {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
- {%- for tool in tools %}
10
- {{- "\n" }}
11
- {{- tool | tojson }}
12
- {%- endfor %}
13
- {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
- {%- else %}
15
- {%- if messages[0]['role'] == 'system' %}
16
- {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
- {%- else %}
18
- {{- '<|im_start|>system\nPlease reason step by step, and put your final answer within \\boxed{}.<|im_end|>\n' }}
19
- {%- endif %}
20
- {%- endif %}
21
- {%- for message in messages %}
22
- {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
- {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
- {%- elif message.role == "assistant" %}
25
- {{- '<|im_start|>' + message.role }}
26
- {%- if message.content %}
27
- {{- '\n' + message.content }}
28
- {%- endif %}
29
- {%- for tool_call in message.tool_calls %}
30
- {%- if tool_call.function is defined %}
31
- {%- set tool_call = tool_call.function %}
32
- {%- endif %}
33
- {{- '\n<tool_call>\n{"name": "' }}
34
- {{- tool_call.name }}
35
- {{- '", "arguments": ' }}
36
- {{- tool_call.arguments | tojson }}
37
- {{- '}\n</tool_call>' }}
38
- {%- endfor %}
39
- {{- '<|im_end|>\n' }}
40
- {%- elif message.role == "tool" %}
41
- {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
- {{- '<|im_start|>user' }}
43
- {%- endif %}
44
- {{- '\n<tool_response>\n' }}
45
- {{- message.content }}
46
- {{- '\n</tool_response>' }}
47
- {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
- {{- '<|im_end|>\n' }}
49
- {%- endif %}
50
- {%- endif %}
51
- {%- endfor %}
52
- {%- if add_generation_prompt %}
53
- {{- '<|im_start|>assistant\n' }}
54
- {%- endif %}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1304/config.json DELETED
@@ -1,58 +0,0 @@
1
- {
2
- "architectures": [
3
- "Qwen2ForCausalLM"
4
- ],
5
- "attention_dropout": 0.0,
6
- "bos_token_id": 151643,
7
- "eos_token_id": 151645,
8
- "hidden_act": "silu",
9
- "hidden_size": 3584,
10
- "initializer_range": 0.02,
11
- "intermediate_size": 18944,
12
- "layer_types": [
13
- "full_attention",
14
- "full_attention",
15
- "full_attention",
16
- "full_attention",
17
- "full_attention",
18
- "full_attention",
19
- "full_attention",
20
- "full_attention",
21
- "full_attention",
22
- "full_attention",
23
- "full_attention",
24
- "full_attention",
25
- "full_attention",
26
- "full_attention",
27
- "full_attention",
28
- "full_attention",
29
- "full_attention",
30
- "full_attention",
31
- "full_attention",
32
- "full_attention",
33
- "full_attention",
34
- "full_attention",
35
- "full_attention",
36
- "full_attention",
37
- "full_attention",
38
- "full_attention",
39
- "full_attention",
40
- "full_attention"
41
- ],
42
- "max_position_embeddings": 4096,
43
- "max_window_layers": 28,
44
- "model_type": "qwen2",
45
- "num_attention_heads": 28,
46
- "num_hidden_layers": 28,
47
- "num_key_value_heads": 4,
48
- "rms_norm_eps": 1e-06,
49
- "rope_scaling": null,
50
- "rope_theta": 10000.0,
51
- "sliding_window": null,
52
- "tie_word_embeddings": false,
53
- "torch_dtype": "float32",
54
- "transformers_version": "4.55.0",
55
- "use_cache": false,
56
- "use_sliding_window": false,
57
- "vocab_size": 152064
58
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1304/generation_config.json DELETED
@@ -1,9 +0,0 @@
1
- {
2
- "bos_token_id": 151643,
3
- "eos_token_id": [
4
- 151645,
5
- 151643
6
- ],
7
- "pad_token_id": 151643,
8
- "transformers_version": "4.55.0"
9
- }
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1304/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
limo_filtered_correct/checkpoint-1304/model.safetensors.index.json DELETED
@@ -1,347 +0,0 @@
1
- {
2
- "metadata": {
3
- "total_parameters": 1903904128,
4
- "total_size": 30462466048
5
- },
6
- "weight_map": {
7
- "lm_head.weight": "model-00007-of-00007.safetensors",
8
- "model.embed_tokens.weight": "model-00001-of-00007.safetensors",
9
- "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors",
10
- "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
11
- "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
12
- "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
13
- "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
14
- "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
15
- "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
16
- "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
17
- "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
18
- "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
19
- "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
20
- "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
21
- "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors",
22
- "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
23
- "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
24
- "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
25
- "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
26
- "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
27
- "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
28
- "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
29
- "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
30
- "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
31
- "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
32
- "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
33
- "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors",
34
- "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
35
- "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
36
- "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
37
- "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
38
- "model.layers.10.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
39
- "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
40
- "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
41
- "model.layers.10.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
42
- "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
43
- "model.layers.10.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
44
- "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
45
- "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors",
46
- "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
47
- "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
48
- "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
49
- "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
50
- "model.layers.11.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
51
- "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
52
- "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
53
- "model.layers.11.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
54
- "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
55
- "model.layers.11.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
56
- "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
57
- "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors",
58
- "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
59
- "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
60
- "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
61
- "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
62
- "model.layers.12.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
63
- "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
64
- "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
65
- "model.layers.12.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
66
- "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
67
- "model.layers.12.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
68
- "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
69
- "model.layers.13.input_layernorm.weight": "model-00004-of-00007.safetensors",
70
- "model.layers.13.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
71
- "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
72
- "model.layers.13.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
73
- "model.layers.13.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
74
- "model.layers.13.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
75
- "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
76
- "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
77
- "model.layers.13.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
78
- "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
79
- "model.layers.13.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
80
- "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
81
- "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors",
82
- "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
83
- "model.layers.14.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
84
- "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
85
- "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
86
- "model.layers.14.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
87
- "model.layers.14.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
88
- "model.layers.14.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
89
- "model.layers.14.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
90
- "model.layers.14.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
91
- "model.layers.14.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
92
- "model.layers.14.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
93
- "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors",
94
- "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
95
- "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
96
- "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
97
- "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
98
- "model.layers.15.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
99
- "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
100
- "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
101
- "model.layers.15.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
102
- "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
103
- "model.layers.15.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
104
- "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
105
- "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors",
106
- "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
107
- "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
108
- "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
109
- "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
110
- "model.layers.16.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
111
- "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
112
- "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
113
- "model.layers.16.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
114
- "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
115
- "model.layers.16.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
116
- "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
117
- "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors",
118
- "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
119
- "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
120
- "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
121
- "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
122
- "model.layers.17.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
123
- "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
124
- "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
125
- "model.layers.17.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
126
- "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
127
- "model.layers.17.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
128
- "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
129
- "model.layers.18.input_layernorm.weight": "model-00005-of-00007.safetensors",
130
- "model.layers.18.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
131
- "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
132
- "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
133
- "model.layers.18.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
134
- "model.layers.18.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
135
- "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
136
- "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
137
- "model.layers.18.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
138
- "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
139
- "model.layers.18.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
140
- "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
141
- "model.layers.19.input_layernorm.weight": "model-00005-of-00007.safetensors",
142
- "model.layers.19.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
143
- "model.layers.19.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
144
- "model.layers.19.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
145
- "model.layers.19.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
146
- "model.layers.19.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
147
- "model.layers.19.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
148
- "model.layers.19.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
149
- "model.layers.19.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
150
- "model.layers.19.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
151
- "model.layers.19.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
152
- "model.layers.19.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
153
- "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors",
154
- "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
155
- "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
156
- "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
157
- "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
158
- "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
159
- "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
160
- "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
161
- "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
162
- "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
163
- "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
164
- "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
165
- "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors",
166
- "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
167
- "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
168
- "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
169
- "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
170
- "model.layers.20.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
171
- "model.layers.20.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
172
- "model.layers.20.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
173
- "model.layers.20.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
174
- "model.layers.20.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
175
- "model.layers.20.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
176
- "model.layers.20.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
177
- "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors",
178
- "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
179
- "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
180
- "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
181
- "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
182
- "model.layers.21.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
183
- "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
184
- "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
185
- "model.layers.21.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
186
- "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
187
- "model.layers.21.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
188
- "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
189
- "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors",
190
- "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
191
- "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
192
- "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
193
- "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
194
- "model.layers.22.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
195
- "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
196
- "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
197
- "model.layers.22.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
198
- "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
199
- "model.layers.22.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
200
- "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
201
- "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors",
202
- "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
203
- "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
204
- "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
205
- "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
206
- "model.layers.23.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
207
- "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
208
- "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
209
- "model.layers.23.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
210
- "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
211
- "model.layers.23.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
212
- "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
213
- "model.layers.24.input_layernorm.weight": "model-00006-of-00007.safetensors",
214
- "model.layers.24.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
215
- "model.layers.24.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
216
- "model.layers.24.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
217
- "model.layers.24.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
218
- "model.layers.24.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
219
- "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
220
- "model.layers.24.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
221
- "model.layers.24.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
222
- "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
223
- "model.layers.24.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
224
- "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
225
- "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors",
226
- "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
227
- "model.layers.25.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
228
- "model.layers.25.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
229
- "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
230
- "model.layers.25.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
231
- "model.layers.25.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
232
- "model.layers.25.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
233
- "model.layers.25.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
234
- "model.layers.25.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
235
- "model.layers.25.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
236
- "model.layers.25.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
237
- "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors",
238
- "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
239
- "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
240
- "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
241
- "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
242
- "model.layers.26.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
243
- "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
244
- "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
245
- "model.layers.26.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
246
- "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
247
- "model.layers.26.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
248
- "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
249
- "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors",
250
- "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
251
- "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
252
- "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
253
- "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
254
- "model.layers.27.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
255
- "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
256
- "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
257
- "model.layers.27.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
258
- "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
259
- "model.layers.27.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
260
- "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
261
- "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors",
262
- "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
263
- "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
264
- "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
265
- "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
266
- "model.layers.3.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
267
- "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
268
- "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
269
- "model.layers.3.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
270
- "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
271
- "model.layers.3.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
272
- "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
273
- "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors",
274
- "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
275
- "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
276
- "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
277
- "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
278
- "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
279
- "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
280
- "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
281
- "model.layers.4.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
282
- "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
283
- "model.layers.4.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
284
- "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
285
- "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors",
286
- "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
287
- "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
288
- "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
289
- "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
290
- "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
291
- "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
292
- "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
293
- "model.layers.5.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
294
- "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
295
- "model.layers.5.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
296
- "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
297
- "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors",
298
- "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
299
- "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
300
- "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
301
- "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
302
- "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
303
- "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
304
- "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
305
- "model.layers.6.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
306
- "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
307
- "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
308
- "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
309
- "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors",
310
- "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
311
- "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
312
- "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
313
- "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
314
- "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
315
- "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
316
- "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
317
- "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
318
- "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
319
- "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
320
- "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
321
- "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors",
322
- "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
323
- "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
324
- "model.layers.8.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
325
- "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
326
- "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
327
- "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
328
- "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
329
- "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
330
- "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
331
- "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
332
- "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
333
- "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors",
334
- "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
335
- "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
336
- "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
337
- "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
338
- "model.layers.9.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
339
- "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
340
- "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
341
- "model.layers.9.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
342
- "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
343
- "model.layers.9.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
344
- "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
345
- "model.norm.weight": "model-00006-of-00007.safetensors"
346
- }
347
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1304/special_tokens_map.json DELETED
@@ -1,31 +0,0 @@
1
- {
2
- "additional_special_tokens": [
3
- "<|im_start|>",
4
- "<|im_end|>",
5
- "<|object_ref_start|>",
6
- "<|object_ref_end|>",
7
- "<|box_start|>",
8
- "<|box_end|>",
9
- "<|quad_start|>",
10
- "<|quad_end|>",
11
- "<|vision_start|>",
12
- "<|vision_end|>",
13
- "<|vision_pad|>",
14
- "<|image_pad|>",
15
- "<|video_pad|>"
16
- ],
17
- "eos_token": {
18
- "content": "<|im_end|>",
19
- "lstrip": false,
20
- "normalized": false,
21
- "rstrip": false,
22
- "single_word": false
23
- },
24
- "pad_token": {
25
- "content": "<|endoftext|>",
26
- "lstrip": false,
27
- "normalized": false,
28
- "rstrip": false,
29
- "single_word": false
30
- }
31
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1304/tokenizer_config.json DELETED
@@ -1,208 +0,0 @@
1
- {
2
- "add_bos_token": false,
3
- "add_prefix_space": false,
4
- "added_tokens_decoder": {
5
- "151643": {
6
- "content": "<|endoftext|>",
7
- "lstrip": false,
8
- "normalized": false,
9
- "rstrip": false,
10
- "single_word": false,
11
- "special": true
12
- },
13
- "151644": {
14
- "content": "<|im_start|>",
15
- "lstrip": false,
16
- "normalized": false,
17
- "rstrip": false,
18
- "single_word": false,
19
- "special": true
20
- },
21
- "151645": {
22
- "content": "<|im_end|>",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false,
27
- "special": true
28
- },
29
- "151646": {
30
- "content": "<|object_ref_start|>",
31
- "lstrip": false,
32
- "normalized": false,
33
- "rstrip": false,
34
- "single_word": false,
35
- "special": true
36
- },
37
- "151647": {
38
- "content": "<|object_ref_end|>",
39
- "lstrip": false,
40
- "normalized": false,
41
- "rstrip": false,
42
- "single_word": false,
43
- "special": true
44
- },
45
- "151648": {
46
- "content": "<|box_start|>",
47
- "lstrip": false,
48
- "normalized": false,
49
- "rstrip": false,
50
- "single_word": false,
51
- "special": true
52
- },
53
- "151649": {
54
- "content": "<|box_end|>",
55
- "lstrip": false,
56
- "normalized": false,
57
- "rstrip": false,
58
- "single_word": false,
59
- "special": true
60
- },
61
- "151650": {
62
- "content": "<|quad_start|>",
63
- "lstrip": false,
64
- "normalized": false,
65
- "rstrip": false,
66
- "single_word": false,
67
- "special": true
68
- },
69
- "151651": {
70
- "content": "<|quad_end|>",
71
- "lstrip": false,
72
- "normalized": false,
73
- "rstrip": false,
74
- "single_word": false,
75
- "special": true
76
- },
77
- "151652": {
78
- "content": "<|vision_start|>",
79
- "lstrip": false,
80
- "normalized": false,
81
- "rstrip": false,
82
- "single_word": false,
83
- "special": true
84
- },
85
- "151653": {
86
- "content": "<|vision_end|>",
87
- "lstrip": false,
88
- "normalized": false,
89
- "rstrip": false,
90
- "single_word": false,
91
- "special": true
92
- },
93
- "151654": {
94
- "content": "<|vision_pad|>",
95
- "lstrip": false,
96
- "normalized": false,
97
- "rstrip": false,
98
- "single_word": false,
99
- "special": true
100
- },
101
- "151655": {
102
- "content": "<|image_pad|>",
103
- "lstrip": false,
104
- "normalized": false,
105
- "rstrip": false,
106
- "single_word": false,
107
- "special": true
108
- },
109
- "151656": {
110
- "content": "<|video_pad|>",
111
- "lstrip": false,
112
- "normalized": false,
113
- "rstrip": false,
114
- "single_word": false,
115
- "special": true
116
- },
117
- "151657": {
118
- "content": "<tool_call>",
119
- "lstrip": false,
120
- "normalized": false,
121
- "rstrip": false,
122
- "single_word": false,
123
- "special": false
124
- },
125
- "151658": {
126
- "content": "</tool_call>",
127
- "lstrip": false,
128
- "normalized": false,
129
- "rstrip": false,
130
- "single_word": false,
131
- "special": false
132
- },
133
- "151659": {
134
- "content": "<|fim_prefix|>",
135
- "lstrip": false,
136
- "normalized": false,
137
- "rstrip": false,
138
- "single_word": false,
139
- "special": false
140
- },
141
- "151660": {
142
- "content": "<|fim_middle|>",
143
- "lstrip": false,
144
- "normalized": false,
145
- "rstrip": false,
146
- "single_word": false,
147
- "special": false
148
- },
149
- "151661": {
150
- "content": "<|fim_suffix|>",
151
- "lstrip": false,
152
- "normalized": false,
153
- "rstrip": false,
154
- "single_word": false,
155
- "special": false
156
- },
157
- "151662": {
158
- "content": "<|fim_pad|>",
159
- "lstrip": false,
160
- "normalized": false,
161
- "rstrip": false,
162
- "single_word": false,
163
- "special": false
164
- },
165
- "151663": {
166
- "content": "<|repo_name|>",
167
- "lstrip": false,
168
- "normalized": false,
169
- "rstrip": false,
170
- "single_word": false,
171
- "special": false
172
- },
173
- "151664": {
174
- "content": "<|file_sep|>",
175
- "lstrip": false,
176
- "normalized": false,
177
- "rstrip": false,
178
- "single_word": false,
179
- "special": false
180
- }
181
- },
182
- "additional_special_tokens": [
183
- "<|im_start|>",
184
- "<|im_end|>",
185
- "<|object_ref_start|>",
186
- "<|object_ref_end|>",
187
- "<|box_start|>",
188
- "<|box_end|>",
189
- "<|quad_start|>",
190
- "<|quad_end|>",
191
- "<|vision_start|>",
192
- "<|vision_end|>",
193
- "<|vision_pad|>",
194
- "<|image_pad|>",
195
- "<|video_pad|>"
196
- ],
197
- "bos_token": null,
198
- "clean_up_tokenization_spaces": false,
199
- "eos_token": "<|im_end|>",
200
- "errors": "replace",
201
- "extra_special_tokens": {},
202
- "model_max_length": 131072,
203
- "pad_token": "<|endoftext|>",
204
- "padding_side": "right",
205
- "split_special_tokens": false,
206
- "tokenizer_class": "Qwen2Tokenizer",
207
- "unk_token": null
208
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1304/trainer_state.json DELETED
The diff for this file is too large to render. See raw diff
 
limo_filtered_correct/checkpoint-1304/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
limo_filtered_correct/checkpoint-1467/added_tokens.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "</tool_call>": 151658,
3
- "<tool_call>": 151657,
4
- "<|box_end|>": 151649,
5
- "<|box_start|>": 151648,
6
- "<|endoftext|>": 151643,
7
- "<|file_sep|>": 151664,
8
- "<|fim_middle|>": 151660,
9
- "<|fim_pad|>": 151662,
10
- "<|fim_prefix|>": 151659,
11
- "<|fim_suffix|>": 151661,
12
- "<|im_end|>": 151645,
13
- "<|im_start|>": 151644,
14
- "<|image_pad|>": 151655,
15
- "<|object_ref_end|>": 151647,
16
- "<|object_ref_start|>": 151646,
17
- "<|quad_end|>": 151651,
18
- "<|quad_start|>": 151650,
19
- "<|repo_name|>": 151663,
20
- "<|video_pad|>": 151656,
21
- "<|vision_end|>": 151653,
22
- "<|vision_pad|>": 151654,
23
- "<|vision_start|>": 151652
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1467/chat_template.jinja DELETED
@@ -1,54 +0,0 @@
1
- {%- if tools %}
2
- {{- '<|im_start|>system\n' }}
3
- {%- if messages[0]['role'] == 'system' %}
4
- {{- messages[0]['content'] }}
5
- {%- else %}
6
- {{- 'Please reason step by step, and put your final answer within \\boxed{}.' }}
7
- {%- endif %}
8
- {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
- {%- for tool in tools %}
10
- {{- "\n" }}
11
- {{- tool | tojson }}
12
- {%- endfor %}
13
- {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
- {%- else %}
15
- {%- if messages[0]['role'] == 'system' %}
16
- {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
- {%- else %}
18
- {{- '<|im_start|>system\nPlease reason step by step, and put your final answer within \\boxed{}.<|im_end|>\n' }}
19
- {%- endif %}
20
- {%- endif %}
21
- {%- for message in messages %}
22
- {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
- {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
- {%- elif message.role == "assistant" %}
25
- {{- '<|im_start|>' + message.role }}
26
- {%- if message.content %}
27
- {{- '\n' + message.content }}
28
- {%- endif %}
29
- {%- for tool_call in message.tool_calls %}
30
- {%- if tool_call.function is defined %}
31
- {%- set tool_call = tool_call.function %}
32
- {%- endif %}
33
- {{- '\n<tool_call>\n{"name": "' }}
34
- {{- tool_call.name }}
35
- {{- '", "arguments": ' }}
36
- {{- tool_call.arguments | tojson }}
37
- {{- '}\n</tool_call>' }}
38
- {%- endfor %}
39
- {{- '<|im_end|>\n' }}
40
- {%- elif message.role == "tool" %}
41
- {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
- {{- '<|im_start|>user' }}
43
- {%- endif %}
44
- {{- '\n<tool_response>\n' }}
45
- {{- message.content }}
46
- {{- '\n</tool_response>' }}
47
- {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
- {{- '<|im_end|>\n' }}
49
- {%- endif %}
50
- {%- endif %}
51
- {%- endfor %}
52
- {%- if add_generation_prompt %}
53
- {{- '<|im_start|>assistant\n' }}
54
- {%- endif %}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1467/config.json DELETED
@@ -1,58 +0,0 @@
1
- {
2
- "architectures": [
3
- "Qwen2ForCausalLM"
4
- ],
5
- "attention_dropout": 0.0,
6
- "bos_token_id": 151643,
7
- "eos_token_id": 151645,
8
- "hidden_act": "silu",
9
- "hidden_size": 3584,
10
- "initializer_range": 0.02,
11
- "intermediate_size": 18944,
12
- "layer_types": [
13
- "full_attention",
14
- "full_attention",
15
- "full_attention",
16
- "full_attention",
17
- "full_attention",
18
- "full_attention",
19
- "full_attention",
20
- "full_attention",
21
- "full_attention",
22
- "full_attention",
23
- "full_attention",
24
- "full_attention",
25
- "full_attention",
26
- "full_attention",
27
- "full_attention",
28
- "full_attention",
29
- "full_attention",
30
- "full_attention",
31
- "full_attention",
32
- "full_attention",
33
- "full_attention",
34
- "full_attention",
35
- "full_attention",
36
- "full_attention",
37
- "full_attention",
38
- "full_attention",
39
- "full_attention",
40
- "full_attention"
41
- ],
42
- "max_position_embeddings": 4096,
43
- "max_window_layers": 28,
44
- "model_type": "qwen2",
45
- "num_attention_heads": 28,
46
- "num_hidden_layers": 28,
47
- "num_key_value_heads": 4,
48
- "rms_norm_eps": 1e-06,
49
- "rope_scaling": null,
50
- "rope_theta": 10000.0,
51
- "sliding_window": null,
52
- "tie_word_embeddings": false,
53
- "torch_dtype": "float32",
54
- "transformers_version": "4.55.0",
55
- "use_cache": false,
56
- "use_sliding_window": false,
57
- "vocab_size": 152064
58
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1467/generation_config.json DELETED
@@ -1,9 +0,0 @@
1
- {
2
- "bos_token_id": 151643,
3
- "eos_token_id": [
4
- 151645,
5
- 151643
6
- ],
7
- "pad_token_id": 151643,
8
- "transformers_version": "4.55.0"
9
- }
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1467/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
limo_filtered_correct/checkpoint-1467/model.safetensors.index.json DELETED
@@ -1,347 +0,0 @@
1
- {
2
- "metadata": {
3
- "total_parameters": 1903904128,
4
- "total_size": 30462466048
5
- },
6
- "weight_map": {
7
- "lm_head.weight": "model-00007-of-00007.safetensors",
8
- "model.embed_tokens.weight": "model-00001-of-00007.safetensors",
9
- "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors",
10
- "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
11
- "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
12
- "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
13
- "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
14
- "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
15
- "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
16
- "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
17
- "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
18
- "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
19
- "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
20
- "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
21
- "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors",
22
- "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
23
- "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
24
- "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
25
- "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
26
- "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
27
- "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
28
- "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
29
- "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
30
- "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
31
- "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
32
- "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
33
- "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors",
34
- "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
35
- "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
36
- "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
37
- "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
38
- "model.layers.10.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
39
- "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
40
- "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
41
- "model.layers.10.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
42
- "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
43
- "model.layers.10.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
44
- "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
45
- "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors",
46
- "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
47
- "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
48
- "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
49
- "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
50
- "model.layers.11.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
51
- "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
52
- "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
53
- "model.layers.11.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
54
- "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
55
- "model.layers.11.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
56
- "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
57
- "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors",
58
- "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
59
- "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
60
- "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
61
- "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
62
- "model.layers.12.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
63
- "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
64
- "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
65
- "model.layers.12.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
66
- "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
67
- "model.layers.12.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
68
- "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
69
- "model.layers.13.input_layernorm.weight": "model-00004-of-00007.safetensors",
70
- "model.layers.13.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
71
- "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
72
- "model.layers.13.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
73
- "model.layers.13.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
74
- "model.layers.13.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
75
- "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
76
- "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
77
- "model.layers.13.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
78
- "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
79
- "model.layers.13.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
80
- "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
81
- "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors",
82
- "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
83
- "model.layers.14.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
84
- "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
85
- "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
86
- "model.layers.14.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
87
- "model.layers.14.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
88
- "model.layers.14.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
89
- "model.layers.14.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
90
- "model.layers.14.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
91
- "model.layers.14.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
92
- "model.layers.14.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
93
- "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors",
94
- "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
95
- "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
96
- "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
97
- "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
98
- "model.layers.15.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
99
- "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
100
- "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
101
- "model.layers.15.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
102
- "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
103
- "model.layers.15.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
104
- "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
105
- "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors",
106
- "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
107
- "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
108
- "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
109
- "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
110
- "model.layers.16.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
111
- "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
112
- "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
113
- "model.layers.16.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
114
- "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
115
- "model.layers.16.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
116
- "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
117
- "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors",
118
- "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
119
- "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
120
- "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
121
- "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
122
- "model.layers.17.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
123
- "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
124
- "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
125
- "model.layers.17.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
126
- "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
127
- "model.layers.17.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
128
- "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
129
- "model.layers.18.input_layernorm.weight": "model-00005-of-00007.safetensors",
130
- "model.layers.18.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
131
- "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
132
- "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
133
- "model.layers.18.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
134
- "model.layers.18.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
135
- "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
136
- "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
137
- "model.layers.18.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
138
- "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
139
- "model.layers.18.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
140
- "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
141
- "model.layers.19.input_layernorm.weight": "model-00005-of-00007.safetensors",
142
- "model.layers.19.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
143
- "model.layers.19.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
144
- "model.layers.19.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
145
- "model.layers.19.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
146
- "model.layers.19.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
147
- "model.layers.19.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
148
- "model.layers.19.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
149
- "model.layers.19.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
150
- "model.layers.19.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
151
- "model.layers.19.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
152
- "model.layers.19.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
153
- "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors",
154
- "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
155
- "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
156
- "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
157
- "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
158
- "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
159
- "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
160
- "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
161
- "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
162
- "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
163
- "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
164
- "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
165
- "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors",
166
- "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
167
- "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
168
- "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
169
- "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
170
- "model.layers.20.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
171
- "model.layers.20.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
172
- "model.layers.20.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
173
- "model.layers.20.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
174
- "model.layers.20.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
175
- "model.layers.20.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
176
- "model.layers.20.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
177
- "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors",
178
- "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
179
- "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
180
- "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
181
- "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
182
- "model.layers.21.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
183
- "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
184
- "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
185
- "model.layers.21.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
186
- "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
187
- "model.layers.21.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
188
- "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
189
- "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors",
190
- "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
191
- "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
192
- "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
193
- "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
194
- "model.layers.22.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
195
- "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
196
- "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
197
- "model.layers.22.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
198
- "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
199
- "model.layers.22.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
200
- "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
201
- "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors",
202
- "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
203
- "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
204
- "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
205
- "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
206
- "model.layers.23.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
207
- "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
208
- "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
209
- "model.layers.23.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
210
- "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
211
- "model.layers.23.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
212
- "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
213
- "model.layers.24.input_layernorm.weight": "model-00006-of-00007.safetensors",
214
- "model.layers.24.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
215
- "model.layers.24.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
216
- "model.layers.24.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
217
- "model.layers.24.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
218
- "model.layers.24.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
219
- "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
220
- "model.layers.24.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
221
- "model.layers.24.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
222
- "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
223
- "model.layers.24.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
224
- "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
225
- "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors",
226
- "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
227
- "model.layers.25.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
228
- "model.layers.25.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
229
- "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
230
- "model.layers.25.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
231
- "model.layers.25.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
232
- "model.layers.25.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
233
- "model.layers.25.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
234
- "model.layers.25.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
235
- "model.layers.25.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
236
- "model.layers.25.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
237
- "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors",
238
- "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
239
- "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
240
- "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
241
- "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
242
- "model.layers.26.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
243
- "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
244
- "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
245
- "model.layers.26.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
246
- "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
247
- "model.layers.26.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
248
- "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
249
- "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors",
250
- "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
251
- "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
252
- "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
253
- "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
254
- "model.layers.27.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
255
- "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
256
- "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
257
- "model.layers.27.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
258
- "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
259
- "model.layers.27.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
260
- "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
261
- "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors",
262
- "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
263
- "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
264
- "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
265
- "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
266
- "model.layers.3.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
267
- "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
268
- "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
269
- "model.layers.3.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
270
- "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
271
- "model.layers.3.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
272
- "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
273
- "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors",
274
- "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
275
- "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
276
- "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
277
- "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
278
- "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
279
- "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
280
- "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
281
- "model.layers.4.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
282
- "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
283
- "model.layers.4.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
284
- "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
285
- "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors",
286
- "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
287
- "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
288
- "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
289
- "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
290
- "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
291
- "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
292
- "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
293
- "model.layers.5.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
294
- "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
295
- "model.layers.5.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
296
- "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
297
- "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors",
298
- "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
299
- "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
300
- "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
301
- "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
302
- "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
303
- "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
304
- "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
305
- "model.layers.6.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
306
- "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
307
- "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
308
- "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
309
- "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors",
310
- "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
311
- "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
312
- "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
313
- "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
314
- "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
315
- "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
316
- "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
317
- "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
318
- "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
319
- "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
320
- "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
321
- "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors",
322
- "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
323
- "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
324
- "model.layers.8.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
325
- "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
326
- "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
327
- "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
328
- "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
329
- "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
330
- "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
331
- "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
332
- "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
333
- "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors",
334
- "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
335
- "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
336
- "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
337
- "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
338
- "model.layers.9.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
339
- "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
340
- "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
341
- "model.layers.9.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
342
- "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
343
- "model.layers.9.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
344
- "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
345
- "model.norm.weight": "model-00006-of-00007.safetensors"
346
- }
347
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1467/special_tokens_map.json DELETED
@@ -1,31 +0,0 @@
1
- {
2
- "additional_special_tokens": [
3
- "<|im_start|>",
4
- "<|im_end|>",
5
- "<|object_ref_start|>",
6
- "<|object_ref_end|>",
7
- "<|box_start|>",
8
- "<|box_end|>",
9
- "<|quad_start|>",
10
- "<|quad_end|>",
11
- "<|vision_start|>",
12
- "<|vision_end|>",
13
- "<|vision_pad|>",
14
- "<|image_pad|>",
15
- "<|video_pad|>"
16
- ],
17
- "eos_token": {
18
- "content": "<|im_end|>",
19
- "lstrip": false,
20
- "normalized": false,
21
- "rstrip": false,
22
- "single_word": false
23
- },
24
- "pad_token": {
25
- "content": "<|endoftext|>",
26
- "lstrip": false,
27
- "normalized": false,
28
- "rstrip": false,
29
- "single_word": false
30
- }
31
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1467/tokenizer_config.json DELETED
@@ -1,208 +0,0 @@
1
- {
2
- "add_bos_token": false,
3
- "add_prefix_space": false,
4
- "added_tokens_decoder": {
5
- "151643": {
6
- "content": "<|endoftext|>",
7
- "lstrip": false,
8
- "normalized": false,
9
- "rstrip": false,
10
- "single_word": false,
11
- "special": true
12
- },
13
- "151644": {
14
- "content": "<|im_start|>",
15
- "lstrip": false,
16
- "normalized": false,
17
- "rstrip": false,
18
- "single_word": false,
19
- "special": true
20
- },
21
- "151645": {
22
- "content": "<|im_end|>",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false,
27
- "special": true
28
- },
29
- "151646": {
30
- "content": "<|object_ref_start|>",
31
- "lstrip": false,
32
- "normalized": false,
33
- "rstrip": false,
34
- "single_word": false,
35
- "special": true
36
- },
37
- "151647": {
38
- "content": "<|object_ref_end|>",
39
- "lstrip": false,
40
- "normalized": false,
41
- "rstrip": false,
42
- "single_word": false,
43
- "special": true
44
- },
45
- "151648": {
46
- "content": "<|box_start|>",
47
- "lstrip": false,
48
- "normalized": false,
49
- "rstrip": false,
50
- "single_word": false,
51
- "special": true
52
- },
53
- "151649": {
54
- "content": "<|box_end|>",
55
- "lstrip": false,
56
- "normalized": false,
57
- "rstrip": false,
58
- "single_word": false,
59
- "special": true
60
- },
61
- "151650": {
62
- "content": "<|quad_start|>",
63
- "lstrip": false,
64
- "normalized": false,
65
- "rstrip": false,
66
- "single_word": false,
67
- "special": true
68
- },
69
- "151651": {
70
- "content": "<|quad_end|>",
71
- "lstrip": false,
72
- "normalized": false,
73
- "rstrip": false,
74
- "single_word": false,
75
- "special": true
76
- },
77
- "151652": {
78
- "content": "<|vision_start|>",
79
- "lstrip": false,
80
- "normalized": false,
81
- "rstrip": false,
82
- "single_word": false,
83
- "special": true
84
- },
85
- "151653": {
86
- "content": "<|vision_end|>",
87
- "lstrip": false,
88
- "normalized": false,
89
- "rstrip": false,
90
- "single_word": false,
91
- "special": true
92
- },
93
- "151654": {
94
- "content": "<|vision_pad|>",
95
- "lstrip": false,
96
- "normalized": false,
97
- "rstrip": false,
98
- "single_word": false,
99
- "special": true
100
- },
101
- "151655": {
102
- "content": "<|image_pad|>",
103
- "lstrip": false,
104
- "normalized": false,
105
- "rstrip": false,
106
- "single_word": false,
107
- "special": true
108
- },
109
- "151656": {
110
- "content": "<|video_pad|>",
111
- "lstrip": false,
112
- "normalized": false,
113
- "rstrip": false,
114
- "single_word": false,
115
- "special": true
116
- },
117
- "151657": {
118
- "content": "<tool_call>",
119
- "lstrip": false,
120
- "normalized": false,
121
- "rstrip": false,
122
- "single_word": false,
123
- "special": false
124
- },
125
- "151658": {
126
- "content": "</tool_call>",
127
- "lstrip": false,
128
- "normalized": false,
129
- "rstrip": false,
130
- "single_word": false,
131
- "special": false
132
- },
133
- "151659": {
134
- "content": "<|fim_prefix|>",
135
- "lstrip": false,
136
- "normalized": false,
137
- "rstrip": false,
138
- "single_word": false,
139
- "special": false
140
- },
141
- "151660": {
142
- "content": "<|fim_middle|>",
143
- "lstrip": false,
144
- "normalized": false,
145
- "rstrip": false,
146
- "single_word": false,
147
- "special": false
148
- },
149
- "151661": {
150
- "content": "<|fim_suffix|>",
151
- "lstrip": false,
152
- "normalized": false,
153
- "rstrip": false,
154
- "single_word": false,
155
- "special": false
156
- },
157
- "151662": {
158
- "content": "<|fim_pad|>",
159
- "lstrip": false,
160
- "normalized": false,
161
- "rstrip": false,
162
- "single_word": false,
163
- "special": false
164
- },
165
- "151663": {
166
- "content": "<|repo_name|>",
167
- "lstrip": false,
168
- "normalized": false,
169
- "rstrip": false,
170
- "single_word": false,
171
- "special": false
172
- },
173
- "151664": {
174
- "content": "<|file_sep|>",
175
- "lstrip": false,
176
- "normalized": false,
177
- "rstrip": false,
178
- "single_word": false,
179
- "special": false
180
- }
181
- },
182
- "additional_special_tokens": [
183
- "<|im_start|>",
184
- "<|im_end|>",
185
- "<|object_ref_start|>",
186
- "<|object_ref_end|>",
187
- "<|box_start|>",
188
- "<|box_end|>",
189
- "<|quad_start|>",
190
- "<|quad_end|>",
191
- "<|vision_start|>",
192
- "<|vision_end|>",
193
- "<|vision_pad|>",
194
- "<|image_pad|>",
195
- "<|video_pad|>"
196
- ],
197
- "bos_token": null,
198
- "clean_up_tokenization_spaces": false,
199
- "eos_token": "<|im_end|>",
200
- "errors": "replace",
201
- "extra_special_tokens": {},
202
- "model_max_length": 131072,
203
- "pad_token": "<|endoftext|>",
204
- "padding_side": "right",
205
- "split_special_tokens": false,
206
- "tokenizer_class": "Qwen2Tokenizer",
207
- "unk_token": null
208
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1467/trainer_state.json DELETED
The diff for this file is too large to render. See raw diff
 
limo_filtered_correct/checkpoint-1467/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
limo_filtered_correct/checkpoint-163/added_tokens.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "</tool_call>": 151658,
3
- "<tool_call>": 151657,
4
- "<|box_end|>": 151649,
5
- "<|box_start|>": 151648,
6
- "<|endoftext|>": 151643,
7
- "<|file_sep|>": 151664,
8
- "<|fim_middle|>": 151660,
9
- "<|fim_pad|>": 151662,
10
- "<|fim_prefix|>": 151659,
11
- "<|fim_suffix|>": 151661,
12
- "<|im_end|>": 151645,
13
- "<|im_start|>": 151644,
14
- "<|image_pad|>": 151655,
15
- "<|object_ref_end|>": 151647,
16
- "<|object_ref_start|>": 151646,
17
- "<|quad_end|>": 151651,
18
- "<|quad_start|>": 151650,
19
- "<|repo_name|>": 151663,
20
- "<|video_pad|>": 151656,
21
- "<|vision_end|>": 151653,
22
- "<|vision_pad|>": 151654,
23
- "<|vision_start|>": 151652
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-163/chat_template.jinja DELETED
@@ -1,54 +0,0 @@
1
- {%- if tools %}
2
- {{- '<|im_start|>system\n' }}
3
- {%- if messages[0]['role'] == 'system' %}
4
- {{- messages[0]['content'] }}
5
- {%- else %}
6
- {{- 'Please reason step by step, and put your final answer within \\boxed{}.' }}
7
- {%- endif %}
8
- {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
- {%- for tool in tools %}
10
- {{- "\n" }}
11
- {{- tool | tojson }}
12
- {%- endfor %}
13
- {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
- {%- else %}
15
- {%- if messages[0]['role'] == 'system' %}
16
- {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
- {%- else %}
18
- {{- '<|im_start|>system\nPlease reason step by step, and put your final answer within \\boxed{}.<|im_end|>\n' }}
19
- {%- endif %}
20
- {%- endif %}
21
- {%- for message in messages %}
22
- {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
- {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
- {%- elif message.role == "assistant" %}
25
- {{- '<|im_start|>' + message.role }}
26
- {%- if message.content %}
27
- {{- '\n' + message.content }}
28
- {%- endif %}
29
- {%- for tool_call in message.tool_calls %}
30
- {%- if tool_call.function is defined %}
31
- {%- set tool_call = tool_call.function %}
32
- {%- endif %}
33
- {{- '\n<tool_call>\n{"name": "' }}
34
- {{- tool_call.name }}
35
- {{- '", "arguments": ' }}
36
- {{- tool_call.arguments | tojson }}
37
- {{- '}\n</tool_call>' }}
38
- {%- endfor %}
39
- {{- '<|im_end|>\n' }}
40
- {%- elif message.role == "tool" %}
41
- {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
- {{- '<|im_start|>user' }}
43
- {%- endif %}
44
- {{- '\n<tool_response>\n' }}
45
- {{- message.content }}
46
- {{- '\n</tool_response>' }}
47
- {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
- {{- '<|im_end|>\n' }}
49
- {%- endif %}
50
- {%- endif %}
51
- {%- endfor %}
52
- {%- if add_generation_prompt %}
53
- {{- '<|im_start|>assistant\n' }}
54
- {%- endif %}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-163/config.json DELETED
@@ -1,58 +0,0 @@
1
- {
2
- "architectures": [
3
- "Qwen2ForCausalLM"
4
- ],
5
- "attention_dropout": 0.0,
6
- "bos_token_id": 151643,
7
- "eos_token_id": 151645,
8
- "hidden_act": "silu",
9
- "hidden_size": 3584,
10
- "initializer_range": 0.02,
11
- "intermediate_size": 18944,
12
- "layer_types": [
13
- "full_attention",
14
- "full_attention",
15
- "full_attention",
16
- "full_attention",
17
- "full_attention",
18
- "full_attention",
19
- "full_attention",
20
- "full_attention",
21
- "full_attention",
22
- "full_attention",
23
- "full_attention",
24
- "full_attention",
25
- "full_attention",
26
- "full_attention",
27
- "full_attention",
28
- "full_attention",
29
- "full_attention",
30
- "full_attention",
31
- "full_attention",
32
- "full_attention",
33
- "full_attention",
34
- "full_attention",
35
- "full_attention",
36
- "full_attention",
37
- "full_attention",
38
- "full_attention",
39
- "full_attention",
40
- "full_attention"
41
- ],
42
- "max_position_embeddings": 4096,
43
- "max_window_layers": 28,
44
- "model_type": "qwen2",
45
- "num_attention_heads": 28,
46
- "num_hidden_layers": 28,
47
- "num_key_value_heads": 4,
48
- "rms_norm_eps": 1e-06,
49
- "rope_scaling": null,
50
- "rope_theta": 10000.0,
51
- "sliding_window": null,
52
- "tie_word_embeddings": false,
53
- "torch_dtype": "float32",
54
- "transformers_version": "4.55.0",
55
- "use_cache": false,
56
- "use_sliding_window": false,
57
- "vocab_size": 152064
58
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-163/generation_config.json DELETED
@@ -1,9 +0,0 @@
1
- {
2
- "bos_token_id": 151643,
3
- "eos_token_id": [
4
- 151645,
5
- 151643
6
- ],
7
- "pad_token_id": 151643,
8
- "transformers_version": "4.55.0"
9
- }
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-163/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
limo_filtered_correct/checkpoint-163/model.safetensors.index.json DELETED
@@ -1,347 +0,0 @@
1
- {
2
- "metadata": {
3
- "total_parameters": 1903904128,
4
- "total_size": 30462466048
5
- },
6
- "weight_map": {
7
- "lm_head.weight": "model-00007-of-00007.safetensors",
8
- "model.embed_tokens.weight": "model-00001-of-00007.safetensors",
9
- "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors",
10
- "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
11
- "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
12
- "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
13
- "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
14
- "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
15
- "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
16
- "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
17
- "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
18
- "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
19
- "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
20
- "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
21
- "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors",
22
- "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
23
- "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
24
- "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
25
- "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
26
- "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
27
- "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
28
- "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
29
- "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
30
- "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
31
- "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
32
- "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
33
- "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors",
34
- "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
35
- "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
36
- "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
37
- "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
38
- "model.layers.10.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
39
- "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
40
- "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
41
- "model.layers.10.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
42
- "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
43
- "model.layers.10.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
44
- "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
45
- "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors",
46
- "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
47
- "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
48
- "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
49
- "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
50
- "model.layers.11.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
51
- "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
52
- "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
53
- "model.layers.11.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
54
- "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
55
- "model.layers.11.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
56
- "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
57
- "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors",
58
- "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
59
- "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
60
- "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
61
- "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
62
- "model.layers.12.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
63
- "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
64
- "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
65
- "model.layers.12.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
66
- "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
67
- "model.layers.12.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
68
- "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
69
- "model.layers.13.input_layernorm.weight": "model-00004-of-00007.safetensors",
70
- "model.layers.13.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
71
- "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
72
- "model.layers.13.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
73
- "model.layers.13.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
74
- "model.layers.13.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
75
- "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
76
- "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
77
- "model.layers.13.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
78
- "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
79
- "model.layers.13.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
80
- "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
81
- "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors",
82
- "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
83
- "model.layers.14.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
84
- "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
85
- "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
86
- "model.layers.14.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
87
- "model.layers.14.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
88
- "model.layers.14.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
89
- "model.layers.14.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
90
- "model.layers.14.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
91
- "model.layers.14.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
92
- "model.layers.14.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
93
- "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors",
94
- "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
95
- "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
96
- "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
97
- "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
98
- "model.layers.15.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
99
- "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
100
- "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
101
- "model.layers.15.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
102
- "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
103
- "model.layers.15.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
104
- "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
105
- "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors",
106
- "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
107
- "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
108
- "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
109
- "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
110
- "model.layers.16.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
111
- "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
112
- "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
113
- "model.layers.16.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
114
- "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
115
- "model.layers.16.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
116
- "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
117
- "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors",
118
- "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
119
- "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
120
- "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
121
- "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
122
- "model.layers.17.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
123
- "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
124
- "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
125
- "model.layers.17.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
126
- "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
127
- "model.layers.17.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
128
- "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
129
- "model.layers.18.input_layernorm.weight": "model-00005-of-00007.safetensors",
130
- "model.layers.18.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
131
- "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
132
- "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
133
- "model.layers.18.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
134
- "model.layers.18.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
135
- "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
136
- "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
137
- "model.layers.18.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
138
- "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
139
- "model.layers.18.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
140
- "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
141
- "model.layers.19.input_layernorm.weight": "model-00005-of-00007.safetensors",
142
- "model.layers.19.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
143
- "model.layers.19.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
144
- "model.layers.19.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
145
- "model.layers.19.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
146
- "model.layers.19.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
147
- "model.layers.19.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
148
- "model.layers.19.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
149
- "model.layers.19.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
150
- "model.layers.19.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
151
- "model.layers.19.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
152
- "model.layers.19.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
153
- "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors",
154
- "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
155
- "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
156
- "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
157
- "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
158
- "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
159
- "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
160
- "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
161
- "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
162
- "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
163
- "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
164
- "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
165
- "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors",
166
- "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
167
- "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
168
- "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
169
- "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
170
- "model.layers.20.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
171
- "model.layers.20.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
172
- "model.layers.20.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
173
- "model.layers.20.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
174
- "model.layers.20.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
175
- "model.layers.20.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
176
- "model.layers.20.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
177
- "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors",
178
- "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
179
- "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
180
- "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
181
- "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
182
- "model.layers.21.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
183
- "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
184
- "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
185
- "model.layers.21.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
186
- "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
187
- "model.layers.21.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
188
- "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
189
- "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors",
190
- "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
191
- "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
192
- "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
193
- "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
194
- "model.layers.22.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
195
- "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
196
- "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
197
- "model.layers.22.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
198
- "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
199
- "model.layers.22.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
200
- "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
201
- "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors",
202
- "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
203
- "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
204
- "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
205
- "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
206
- "model.layers.23.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
207
- "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
208
- "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
209
- "model.layers.23.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
210
- "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
211
- "model.layers.23.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
212
- "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
213
- "model.layers.24.input_layernorm.weight": "model-00006-of-00007.safetensors",
214
- "model.layers.24.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
215
- "model.layers.24.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
216
- "model.layers.24.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
217
- "model.layers.24.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
218
- "model.layers.24.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
219
- "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
220
- "model.layers.24.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
221
- "model.layers.24.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
222
- "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
223
- "model.layers.24.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
224
- "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
225
- "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors",
226
- "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
227
- "model.layers.25.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
228
- "model.layers.25.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
229
- "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
230
- "model.layers.25.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
231
- "model.layers.25.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
232
- "model.layers.25.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
233
- "model.layers.25.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
234
- "model.layers.25.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
235
- "model.layers.25.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
236
- "model.layers.25.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
237
- "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors",
238
- "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
239
- "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
240
- "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
241
- "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
242
- "model.layers.26.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
243
- "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
244
- "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
245
- "model.layers.26.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
246
- "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
247
- "model.layers.26.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
248
- "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
249
- "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors",
250
- "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
251
- "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
252
- "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
253
- "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
254
- "model.layers.27.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
255
- "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
256
- "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
257
- "model.layers.27.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
258
- "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
259
- "model.layers.27.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
260
- "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
261
- "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors",
262
- "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
263
- "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
264
- "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
265
- "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
266
- "model.layers.3.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
267
- "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
268
- "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
269
- "model.layers.3.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
270
- "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
271
- "model.layers.3.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
272
- "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
273
- "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors",
274
- "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
275
- "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
276
- "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
277
- "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
278
- "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
279
- "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
280
- "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
281
- "model.layers.4.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
282
- "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
283
- "model.layers.4.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
284
- "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
285
- "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors",
286
- "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
287
- "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
288
- "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
289
- "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
290
- "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
291
- "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
292
- "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
293
- "model.layers.5.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
294
- "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
295
- "model.layers.5.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
296
- "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
297
- "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors",
298
- "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
299
- "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
300
- "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
301
- "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
302
- "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
303
- "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
304
- "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
305
- "model.layers.6.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
306
- "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
307
- "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
308
- "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
309
- "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors",
310
- "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
311
- "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
312
- "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
313
- "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
314
- "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
315
- "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
316
- "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
317
- "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
318
- "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
319
- "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
320
- "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
321
- "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors",
322
- "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
323
- "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
324
- "model.layers.8.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
325
- "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
326
- "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
327
- "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
328
- "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
329
- "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
330
- "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
331
- "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
332
- "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
333
- "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors",
334
- "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
335
- "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
336
- "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
337
- "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
338
- "model.layers.9.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
339
- "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
340
- "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
341
- "model.layers.9.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
342
- "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
343
- "model.layers.9.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
344
- "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
345
- "model.norm.weight": "model-00006-of-00007.safetensors"
346
- }
347
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-163/special_tokens_map.json DELETED
@@ -1,31 +0,0 @@
1
- {
2
- "additional_special_tokens": [
3
- "<|im_start|>",
4
- "<|im_end|>",
5
- "<|object_ref_start|>",
6
- "<|object_ref_end|>",
7
- "<|box_start|>",
8
- "<|box_end|>",
9
- "<|quad_start|>",
10
- "<|quad_end|>",
11
- "<|vision_start|>",
12
- "<|vision_end|>",
13
- "<|vision_pad|>",
14
- "<|image_pad|>",
15
- "<|video_pad|>"
16
- ],
17
- "eos_token": {
18
- "content": "<|im_end|>",
19
- "lstrip": false,
20
- "normalized": false,
21
- "rstrip": false,
22
- "single_word": false
23
- },
24
- "pad_token": {
25
- "content": "<|endoftext|>",
26
- "lstrip": false,
27
- "normalized": false,
28
- "rstrip": false,
29
- "single_word": false
30
- }
31
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-163/tokenizer_config.json DELETED
@@ -1,208 +0,0 @@
1
- {
2
- "add_bos_token": false,
3
- "add_prefix_space": false,
4
- "added_tokens_decoder": {
5
- "151643": {
6
- "content": "<|endoftext|>",
7
- "lstrip": false,
8
- "normalized": false,
9
- "rstrip": false,
10
- "single_word": false,
11
- "special": true
12
- },
13
- "151644": {
14
- "content": "<|im_start|>",
15
- "lstrip": false,
16
- "normalized": false,
17
- "rstrip": false,
18
- "single_word": false,
19
- "special": true
20
- },
21
- "151645": {
22
- "content": "<|im_end|>",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false,
27
- "special": true
28
- },
29
- "151646": {
30
- "content": "<|object_ref_start|>",
31
- "lstrip": false,
32
- "normalized": false,
33
- "rstrip": false,
34
- "single_word": false,
35
- "special": true
36
- },
37
- "151647": {
38
- "content": "<|object_ref_end|>",
39
- "lstrip": false,
40
- "normalized": false,
41
- "rstrip": false,
42
- "single_word": false,
43
- "special": true
44
- },
45
- "151648": {
46
- "content": "<|box_start|>",
47
- "lstrip": false,
48
- "normalized": false,
49
- "rstrip": false,
50
- "single_word": false,
51
- "special": true
52
- },
53
- "151649": {
54
- "content": "<|box_end|>",
55
- "lstrip": false,
56
- "normalized": false,
57
- "rstrip": false,
58
- "single_word": false,
59
- "special": true
60
- },
61
- "151650": {
62
- "content": "<|quad_start|>",
63
- "lstrip": false,
64
- "normalized": false,
65
- "rstrip": false,
66
- "single_word": false,
67
- "special": true
68
- },
69
- "151651": {
70
- "content": "<|quad_end|>",
71
- "lstrip": false,
72
- "normalized": false,
73
- "rstrip": false,
74
- "single_word": false,
75
- "special": true
76
- },
77
- "151652": {
78
- "content": "<|vision_start|>",
79
- "lstrip": false,
80
- "normalized": false,
81
- "rstrip": false,
82
- "single_word": false,
83
- "special": true
84
- },
85
- "151653": {
86
- "content": "<|vision_end|>",
87
- "lstrip": false,
88
- "normalized": false,
89
- "rstrip": false,
90
- "single_word": false,
91
- "special": true
92
- },
93
- "151654": {
94
- "content": "<|vision_pad|>",
95
- "lstrip": false,
96
- "normalized": false,
97
- "rstrip": false,
98
- "single_word": false,
99
- "special": true
100
- },
101
- "151655": {
102
- "content": "<|image_pad|>",
103
- "lstrip": false,
104
- "normalized": false,
105
- "rstrip": false,
106
- "single_word": false,
107
- "special": true
108
- },
109
- "151656": {
110
- "content": "<|video_pad|>",
111
- "lstrip": false,
112
- "normalized": false,
113
- "rstrip": false,
114
- "single_word": false,
115
- "special": true
116
- },
117
- "151657": {
118
- "content": "<tool_call>",
119
- "lstrip": false,
120
- "normalized": false,
121
- "rstrip": false,
122
- "single_word": false,
123
- "special": false
124
- },
125
- "151658": {
126
- "content": "</tool_call>",
127
- "lstrip": false,
128
- "normalized": false,
129
- "rstrip": false,
130
- "single_word": false,
131
- "special": false
132
- },
133
- "151659": {
134
- "content": "<|fim_prefix|>",
135
- "lstrip": false,
136
- "normalized": false,
137
- "rstrip": false,
138
- "single_word": false,
139
- "special": false
140
- },
141
- "151660": {
142
- "content": "<|fim_middle|>",
143
- "lstrip": false,
144
- "normalized": false,
145
- "rstrip": false,
146
- "single_word": false,
147
- "special": false
148
- },
149
- "151661": {
150
- "content": "<|fim_suffix|>",
151
- "lstrip": false,
152
- "normalized": false,
153
- "rstrip": false,
154
- "single_word": false,
155
- "special": false
156
- },
157
- "151662": {
158
- "content": "<|fim_pad|>",
159
- "lstrip": false,
160
- "normalized": false,
161
- "rstrip": false,
162
- "single_word": false,
163
- "special": false
164
- },
165
- "151663": {
166
- "content": "<|repo_name|>",
167
- "lstrip": false,
168
- "normalized": false,
169
- "rstrip": false,
170
- "single_word": false,
171
- "special": false
172
- },
173
- "151664": {
174
- "content": "<|file_sep|>",
175
- "lstrip": false,
176
- "normalized": false,
177
- "rstrip": false,
178
- "single_word": false,
179
- "special": false
180
- }
181
- },
182
- "additional_special_tokens": [
183
- "<|im_start|>",
184
- "<|im_end|>",
185
- "<|object_ref_start|>",
186
- "<|object_ref_end|>",
187
- "<|box_start|>",
188
- "<|box_end|>",
189
- "<|quad_start|>",
190
- "<|quad_end|>",
191
- "<|vision_start|>",
192
- "<|vision_end|>",
193
- "<|vision_pad|>",
194
- "<|image_pad|>",
195
- "<|video_pad|>"
196
- ],
197
- "bos_token": null,
198
- "clean_up_tokenization_spaces": false,
199
- "eos_token": "<|im_end|>",
200
- "errors": "replace",
201
- "extra_special_tokens": {},
202
- "model_max_length": 131072,
203
- "pad_token": "<|endoftext|>",
204
- "padding_side": "right",
205
- "split_special_tokens": false,
206
- "tokenizer_class": "Qwen2Tokenizer",
207
- "unk_token": null
208
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-163/trainer_state.json DELETED
@@ -1,1175 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 1.0,
6
- "eval_steps": 500,
7
- "global_step": 163,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.006134969325153374,
14
- "grad_norm": 29.675046920776367,
15
- "learning_rate": 5e-06,
16
- "loss": 3.5826,
17
- "step": 1
18
- },
19
- {
20
- "epoch": 0.012269938650306749,
21
- "grad_norm": 29.766326904296875,
22
- "learning_rate": 4.999995356617983e-06,
23
- "loss": 2.2822,
24
- "step": 2
25
- },
26
- {
27
- "epoch": 0.018404907975460124,
28
- "grad_norm": 30.781108856201172,
29
- "learning_rate": 4.999981426489179e-06,
30
- "loss": 4.0728,
31
- "step": 3
32
- },
33
- {
34
- "epoch": 0.024539877300613498,
35
- "grad_norm": 21.731002807617188,
36
- "learning_rate": 4.999958209665336e-06,
37
- "loss": 1.9697,
38
- "step": 4
39
- },
40
- {
41
- "epoch": 0.03067484662576687,
42
- "grad_norm": 18.72490692138672,
43
- "learning_rate": 4.999925706232695e-06,
44
- "loss": 3.3215,
45
- "step": 5
46
- },
47
- {
48
- "epoch": 0.03680981595092025,
49
- "grad_norm": 20.35538673400879,
50
- "learning_rate": 4.999883916312e-06,
51
- "loss": 3.4265,
52
- "step": 6
53
- },
54
- {
55
- "epoch": 0.04294478527607362,
56
- "grad_norm": 12.431780815124512,
57
- "learning_rate": 4.9998328400584864e-06,
58
- "loss": 2.1096,
59
- "step": 7
60
- },
61
- {
62
- "epoch": 0.049079754601226995,
63
- "grad_norm": 2.4425623416900635,
64
- "learning_rate": 4.999772477661888e-06,
65
- "loss": 1.1057,
66
- "step": 8
67
- },
68
- {
69
- "epoch": 0.05521472392638037,
70
- "grad_norm": 14.460396766662598,
71
- "learning_rate": 4.999702829346432e-06,
72
- "loss": 2.9792,
73
- "step": 9
74
- },
75
- {
76
- "epoch": 0.06134969325153374,
77
- "grad_norm": 4.459011077880859,
78
- "learning_rate": 4.999623895370843e-06,
79
- "loss": 1.6282,
80
- "step": 10
81
- },
82
- {
83
- "epoch": 0.06748466257668712,
84
- "grad_norm": 9.057388305664062,
85
- "learning_rate": 4.999535676028338e-06,
86
- "loss": 2.5876,
87
- "step": 11
88
- },
89
- {
90
- "epoch": 0.0736196319018405,
91
- "grad_norm": 1.9979051351547241,
92
- "learning_rate": 4.999438171646624e-06,
93
- "loss": 1.0706,
94
- "step": 12
95
- },
96
- {
97
- "epoch": 0.07975460122699386,
98
- "grad_norm": 3.1870503425598145,
99
- "learning_rate": 4.999331382587901e-06,
100
- "loss": 1.3047,
101
- "step": 13
102
- },
103
- {
104
- "epoch": 0.08588957055214724,
105
- "grad_norm": 6.666968822479248,
106
- "learning_rate": 4.999215309248861e-06,
107
- "loss": 2.3811,
108
- "step": 14
109
- },
110
- {
111
- "epoch": 0.09202453987730061,
112
- "grad_norm": 2.846212387084961,
113
- "learning_rate": 4.999089952060681e-06,
114
- "loss": 1.3571,
115
- "step": 15
116
- },
117
- {
118
- "epoch": 0.09815950920245399,
119
- "grad_norm": 4.295431613922119,
120
- "learning_rate": 4.998955311489025e-06,
121
- "loss": 1.47,
122
- "step": 16
123
- },
124
- {
125
- "epoch": 0.10429447852760736,
126
- "grad_norm": 6.105274677276611,
127
- "learning_rate": 4.998811388034046e-06,
128
- "loss": 2.4916,
129
- "step": 17
130
- },
131
- {
132
- "epoch": 0.11042944785276074,
133
- "grad_norm": 6.522989749908447,
134
- "learning_rate": 4.9986581822303746e-06,
135
- "loss": 1.9366,
136
- "step": 18
137
- },
138
- {
139
- "epoch": 0.1165644171779141,
140
- "grad_norm": 4.987826347351074,
141
- "learning_rate": 4.998495694647127e-06,
142
- "loss": 2.5305,
143
- "step": 19
144
- },
145
- {
146
- "epoch": 0.12269938650306748,
147
- "grad_norm": 3.1573679447174072,
148
- "learning_rate": 4.998323925887895e-06,
149
- "loss": 1.8253,
150
- "step": 20
151
- },
152
- {
153
- "epoch": 0.12883435582822086,
154
- "grad_norm": 2.2754809856414795,
155
- "learning_rate": 4.998142876590749e-06,
156
- "loss": 0.811,
157
- "step": 21
158
- },
159
- {
160
- "epoch": 0.13496932515337423,
161
- "grad_norm": 6.228421211242676,
162
- "learning_rate": 4.997952547428236e-06,
163
- "loss": 1.5741,
164
- "step": 22
165
- },
166
- {
167
- "epoch": 0.1411042944785276,
168
- "grad_norm": 3.217477560043335,
169
- "learning_rate": 4.997752939107372e-06,
170
- "loss": 1.2975,
171
- "step": 23
172
- },
173
- {
174
- "epoch": 0.147239263803681,
175
- "grad_norm": 3.494149923324585,
176
- "learning_rate": 4.997544052369642e-06,
177
- "loss": 1.8436,
178
- "step": 24
179
- },
180
- {
181
- "epoch": 0.15337423312883436,
182
- "grad_norm": 2.7154700756073,
183
- "learning_rate": 4.997325887990999e-06,
184
- "loss": 1.3535,
185
- "step": 25
186
- },
187
- {
188
- "epoch": 0.15950920245398773,
189
- "grad_norm": 3.6858909130096436,
190
- "learning_rate": 4.997098446781861e-06,
191
- "loss": 1.514,
192
- "step": 26
193
- },
194
- {
195
- "epoch": 0.1656441717791411,
196
- "grad_norm": 3.739105463027954,
197
- "learning_rate": 4.996861729587103e-06,
198
- "loss": 1.2834,
199
- "step": 27
200
- },
201
- {
202
- "epoch": 0.17177914110429449,
203
- "grad_norm": 2.842552900314331,
204
- "learning_rate": 4.996615737286061e-06,
205
- "loss": 1.1397,
206
- "step": 28
207
- },
208
- {
209
- "epoch": 0.17791411042944785,
210
- "grad_norm": 4.382201671600342,
211
- "learning_rate": 4.996360470792524e-06,
212
- "loss": 1.7099,
213
- "step": 29
214
- },
215
- {
216
- "epoch": 0.18404907975460122,
217
- "grad_norm": 1.378517508506775,
218
- "learning_rate": 4.996095931054731e-06,
219
- "loss": 0.7317,
220
- "step": 30
221
- },
222
- {
223
- "epoch": 0.1901840490797546,
224
- "grad_norm": 3.7880098819732666,
225
- "learning_rate": 4.9958221190553705e-06,
226
- "loss": 1.5918,
227
- "step": 31
228
- },
229
- {
230
- "epoch": 0.19631901840490798,
231
- "grad_norm": 1.192482352256775,
232
- "learning_rate": 4.995539035811572e-06,
233
- "loss": 0.7414,
234
- "step": 32
235
- },
236
- {
237
- "epoch": 0.20245398773006135,
238
- "grad_norm": 1.4741590023040771,
239
- "learning_rate": 4.9952466823749076e-06,
240
- "loss": 0.7675,
241
- "step": 33
242
- },
243
- {
244
- "epoch": 0.2085889570552147,
245
- "grad_norm": 1.505175232887268,
246
- "learning_rate": 4.9949450598313835e-06,
247
- "loss": 0.8803,
248
- "step": 34
249
- },
250
- {
251
- "epoch": 0.2147239263803681,
252
- "grad_norm": 2.5110180377960205,
253
- "learning_rate": 4.994634169301439e-06,
254
- "loss": 1.4041,
255
- "step": 35
256
- },
257
- {
258
- "epoch": 0.22085889570552147,
259
- "grad_norm": 1.8418388366699219,
260
- "learning_rate": 4.994314011939941e-06,
261
- "loss": 1.142,
262
- "step": 36
263
- },
264
- {
265
- "epoch": 0.22699386503067484,
266
- "grad_norm": 1.505200743675232,
267
- "learning_rate": 4.99398458893618e-06,
268
- "loss": 0.9891,
269
- "step": 37
270
- },
271
- {
272
- "epoch": 0.2331288343558282,
273
- "grad_norm": 3.1738810539245605,
274
- "learning_rate": 4.993645901513865e-06,
275
- "loss": 1.7619,
276
- "step": 38
277
- },
278
- {
279
- "epoch": 0.2392638036809816,
280
- "grad_norm": 1.3337852954864502,
281
- "learning_rate": 4.993297950931121e-06,
282
- "loss": 0.8645,
283
- "step": 39
284
- },
285
- {
286
- "epoch": 0.24539877300613497,
287
- "grad_norm": 2.316638946533203,
288
- "learning_rate": 4.9929407384804806e-06,
289
- "loss": 1.1522,
290
- "step": 40
291
- },
292
- {
293
- "epoch": 0.25153374233128833,
294
- "grad_norm": 2.777416467666626,
295
- "learning_rate": 4.992574265488883e-06,
296
- "loss": 1.3231,
297
- "step": 41
298
- },
299
- {
300
- "epoch": 0.25766871165644173,
301
- "grad_norm": 1.575035810470581,
302
- "learning_rate": 4.9921985333176694e-06,
303
- "loss": 0.9024,
304
- "step": 42
305
- },
306
- {
307
- "epoch": 0.26380368098159507,
308
- "grad_norm": 1.122635841369629,
309
- "learning_rate": 4.991813543362572e-06,
310
- "loss": 0.688,
311
- "step": 43
312
- },
313
- {
314
- "epoch": 0.26993865030674846,
315
- "grad_norm": 2.904345750808716,
316
- "learning_rate": 4.991419297053716e-06,
317
- "loss": 1.4858,
318
- "step": 44
319
- },
320
- {
321
- "epoch": 0.27607361963190186,
322
- "grad_norm": 1.364274263381958,
323
- "learning_rate": 4.991015795855611e-06,
324
- "loss": 0.8004,
325
- "step": 45
326
- },
327
- {
328
- "epoch": 0.2822085889570552,
329
- "grad_norm": 2.9117391109466553,
330
- "learning_rate": 4.990603041267144e-06,
331
- "loss": 1.2806,
332
- "step": 46
333
- },
334
- {
335
- "epoch": 0.2883435582822086,
336
- "grad_norm": 1.954034686088562,
337
- "learning_rate": 4.990181034821578e-06,
338
- "loss": 0.8162,
339
- "step": 47
340
- },
341
- {
342
- "epoch": 0.294478527607362,
343
- "grad_norm": 1.3249831199645996,
344
- "learning_rate": 4.98974977808654e-06,
345
- "loss": 0.7924,
346
- "step": 48
347
- },
348
- {
349
- "epoch": 0.3006134969325153,
350
- "grad_norm": 1.4197661876678467,
351
- "learning_rate": 4.989309272664026e-06,
352
- "loss": 0.8373,
353
- "step": 49
354
- },
355
- {
356
- "epoch": 0.3067484662576687,
357
- "grad_norm": 1.7603431940078735,
358
- "learning_rate": 4.988859520190381e-06,
359
- "loss": 1.027,
360
- "step": 50
361
- },
362
- {
363
- "epoch": 0.3128834355828221,
364
- "grad_norm": 1.2640917301177979,
365
- "learning_rate": 4.988400522336304e-06,
366
- "loss": 0.8586,
367
- "step": 51
368
- },
369
- {
370
- "epoch": 0.31901840490797545,
371
- "grad_norm": 1.516197681427002,
372
- "learning_rate": 4.9879322808068365e-06,
373
- "loss": 0.8846,
374
- "step": 52
375
- },
376
- {
377
- "epoch": 0.32515337423312884,
378
- "grad_norm": 1.9741435050964355,
379
- "learning_rate": 4.987454797341358e-06,
380
- "loss": 0.8055,
381
- "step": 53
382
- },
383
- {
384
- "epoch": 0.3312883435582822,
385
- "grad_norm": 1.1730972528457642,
386
- "learning_rate": 4.98696807371358e-06,
387
- "loss": 0.7968,
388
- "step": 54
389
- },
390
- {
391
- "epoch": 0.3374233128834356,
392
- "grad_norm": 3.1562085151672363,
393
- "learning_rate": 4.986472111731536e-06,
394
- "loss": 1.1708,
395
- "step": 55
396
- },
397
- {
398
- "epoch": 0.34355828220858897,
399
- "grad_norm": 1.4498661756515503,
400
- "learning_rate": 4.985966913237581e-06,
401
- "loss": 0.6461,
402
- "step": 56
403
- },
404
- {
405
- "epoch": 0.3496932515337423,
406
- "grad_norm": 1.5239108800888062,
407
- "learning_rate": 4.985452480108376e-06,
408
- "loss": 0.7928,
409
- "step": 57
410
- },
411
- {
412
- "epoch": 0.3558282208588957,
413
- "grad_norm": 6.304637908935547,
414
- "learning_rate": 4.984928814254889e-06,
415
- "loss": 1.5198,
416
- "step": 58
417
- },
418
- {
419
- "epoch": 0.3619631901840491,
420
- "grad_norm": 2.144055128097534,
421
- "learning_rate": 4.984395917622387e-06,
422
- "loss": 0.8968,
423
- "step": 59
424
- },
425
- {
426
- "epoch": 0.36809815950920244,
427
- "grad_norm": 45.799842834472656,
428
- "learning_rate": 4.9838537921904206e-06,
429
- "loss": 0.9896,
430
- "step": 60
431
- },
432
- {
433
- "epoch": 0.37423312883435583,
434
- "grad_norm": 23.065954208374023,
435
- "learning_rate": 4.9833024399728295e-06,
436
- "loss": 0.9584,
437
- "step": 61
438
- },
439
- {
440
- "epoch": 0.3803680981595092,
441
- "grad_norm": 1.2204431295394897,
442
- "learning_rate": 4.982741863017722e-06,
443
- "loss": 0.6698,
444
- "step": 62
445
- },
446
- {
447
- "epoch": 0.38650306748466257,
448
- "grad_norm": 2.3951306343078613,
449
- "learning_rate": 4.982172063407479e-06,
450
- "loss": 1.2787,
451
- "step": 63
452
- },
453
- {
454
- "epoch": 0.39263803680981596,
455
- "grad_norm": 1.5388777256011963,
456
- "learning_rate": 4.9815930432587365e-06,
457
- "loss": 0.6249,
458
- "step": 64
459
- },
460
- {
461
- "epoch": 0.3987730061349693,
462
- "grad_norm": 2.3690946102142334,
463
- "learning_rate": 4.981004804722384e-06,
464
- "loss": 0.8031,
465
- "step": 65
466
- },
467
- {
468
- "epoch": 0.4049079754601227,
469
- "grad_norm": 2.831342935562134,
470
- "learning_rate": 4.980407349983556e-06,
471
- "loss": 1.0599,
472
- "step": 66
473
- },
474
- {
475
- "epoch": 0.4110429447852761,
476
- "grad_norm": 1.3182815313339233,
477
- "learning_rate": 4.979800681261619e-06,
478
- "loss": 0.7507,
479
- "step": 67
480
- },
481
- {
482
- "epoch": 0.4171779141104294,
483
- "grad_norm": 0.9745801091194153,
484
- "learning_rate": 4.9791848008101705e-06,
485
- "loss": 0.5438,
486
- "step": 68
487
- },
488
- {
489
- "epoch": 0.4233128834355828,
490
- "grad_norm": 1.9159034490585327,
491
- "learning_rate": 4.978559710917024e-06,
492
- "loss": 0.8771,
493
- "step": 69
494
- },
495
- {
496
- "epoch": 0.4294478527607362,
497
- "grad_norm": 1.6300384998321533,
498
- "learning_rate": 4.977925413904205e-06,
499
- "loss": 0.9708,
500
- "step": 70
501
- },
502
- {
503
- "epoch": 0.43558282208588955,
504
- "grad_norm": 5.562711715698242,
505
- "learning_rate": 4.9772819121279395e-06,
506
- "loss": 1.3199,
507
- "step": 71
508
- },
509
- {
510
- "epoch": 0.44171779141104295,
511
- "grad_norm": 1.1460847854614258,
512
- "learning_rate": 4.976629207978648e-06,
513
- "loss": 0.6887,
514
- "step": 72
515
- },
516
- {
517
- "epoch": 0.44785276073619634,
518
- "grad_norm": 1.2074755430221558,
519
- "learning_rate": 4.975967303880933e-06,
520
- "loss": 0.5392,
521
- "step": 73
522
- },
523
- {
524
- "epoch": 0.4539877300613497,
525
- "grad_norm": 1.3423306941986084,
526
- "learning_rate": 4.975296202293575e-06,
527
- "loss": 0.8397,
528
- "step": 74
529
- },
530
- {
531
- "epoch": 0.4601226993865031,
532
- "grad_norm": 1.1050562858581543,
533
- "learning_rate": 4.974615905709518e-06,
534
- "loss": 0.7787,
535
- "step": 75
536
- },
537
- {
538
- "epoch": 0.4662576687116564,
539
- "grad_norm": 3.6690878868103027,
540
- "learning_rate": 4.973926416655863e-06,
541
- "loss": 1.2395,
542
- "step": 76
543
- },
544
- {
545
- "epoch": 0.4723926380368098,
546
- "grad_norm": 1.678609848022461,
547
- "learning_rate": 4.973227737693858e-06,
548
- "loss": 0.717,
549
- "step": 77
550
- },
551
- {
552
- "epoch": 0.4785276073619632,
553
- "grad_norm": 4.053447723388672,
554
- "learning_rate": 4.972519871418894e-06,
555
- "loss": 1.1079,
556
- "step": 78
557
- },
558
- {
559
- "epoch": 0.48466257668711654,
560
- "grad_norm": 1.8125452995300293,
561
- "learning_rate": 4.971802820460481e-06,
562
- "loss": 0.8235,
563
- "step": 79
564
- },
565
- {
566
- "epoch": 0.49079754601226994,
567
- "grad_norm": 1.2120040655136108,
568
- "learning_rate": 4.971076587482254e-06,
569
- "loss": 0.693,
570
- "step": 80
571
- },
572
- {
573
- "epoch": 0.49693251533742333,
574
- "grad_norm": 1.1679635047912598,
575
- "learning_rate": 4.970341175181957e-06,
576
- "loss": 0.6738,
577
- "step": 81
578
- },
579
- {
580
- "epoch": 0.5030674846625767,
581
- "grad_norm": 1.2779017686843872,
582
- "learning_rate": 4.969596586291425e-06,
583
- "loss": 0.8461,
584
- "step": 82
585
- },
586
- {
587
- "epoch": 0.50920245398773,
588
- "grad_norm": 1.60254967212677,
589
- "learning_rate": 4.968842823576592e-06,
590
- "loss": 0.8051,
591
- "step": 83
592
- },
593
- {
594
- "epoch": 0.5153374233128835,
595
- "grad_norm": 2.7090866565704346,
596
- "learning_rate": 4.968079889837461e-06,
597
- "loss": 1.1427,
598
- "step": 84
599
- },
600
- {
601
- "epoch": 0.5214723926380368,
602
- "grad_norm": 1.372827172279358,
603
- "learning_rate": 4.967307787908108e-06,
604
- "loss": 0.7582,
605
- "step": 85
606
- },
607
- {
608
- "epoch": 0.5276073619631901,
609
- "grad_norm": 1.495653748512268,
610
- "learning_rate": 4.966526520656663e-06,
611
- "loss": 0.9407,
612
- "step": 86
613
- },
614
- {
615
- "epoch": 0.5337423312883436,
616
- "grad_norm": 1.782081961631775,
617
- "learning_rate": 4.965736090985305e-06,
618
- "loss": 0.9427,
619
- "step": 87
620
- },
621
- {
622
- "epoch": 0.5398773006134969,
623
- "grad_norm": 1.1909613609313965,
624
- "learning_rate": 4.964936501830246e-06,
625
- "loss": 0.6759,
626
- "step": 88
627
- },
628
- {
629
- "epoch": 0.5460122699386503,
630
- "grad_norm": 3.4094467163085938,
631
- "learning_rate": 4.964127756161727e-06,
632
- "loss": 1.0704,
633
- "step": 89
634
- },
635
- {
636
- "epoch": 0.5521472392638037,
637
- "grad_norm": 1.454544186592102,
638
- "learning_rate": 4.963309856983998e-06,
639
- "loss": 0.8267,
640
- "step": 90
641
- },
642
- {
643
- "epoch": 0.558282208588957,
644
- "grad_norm": 1.3168748617172241,
645
- "learning_rate": 4.9624828073353144e-06,
646
- "loss": 0.8437,
647
- "step": 91
648
- },
649
- {
650
- "epoch": 0.5644171779141104,
651
- "grad_norm": 1.3142277002334595,
652
- "learning_rate": 4.961646610287922e-06,
653
- "loss": 0.7897,
654
- "step": 92
655
- },
656
- {
657
- "epoch": 0.5705521472392638,
658
- "grad_norm": 1.1625009775161743,
659
- "learning_rate": 4.960801268948047e-06,
660
- "loss": 0.6761,
661
- "step": 93
662
- },
663
- {
664
- "epoch": 0.5766871165644172,
665
- "grad_norm": 1.1165505647659302,
666
- "learning_rate": 4.959946786455882e-06,
667
- "loss": 0.4814,
668
- "step": 94
669
- },
670
- {
671
- "epoch": 0.5828220858895705,
672
- "grad_norm": 1.1464658975601196,
673
- "learning_rate": 4.959083165985581e-06,
674
- "loss": 0.6421,
675
- "step": 95
676
- },
677
- {
678
- "epoch": 0.588957055214724,
679
- "grad_norm": 1.1049315929412842,
680
- "learning_rate": 4.958210410745237e-06,
681
- "loss": 0.8091,
682
- "step": 96
683
- },
684
- {
685
- "epoch": 0.5950920245398773,
686
- "grad_norm": 1.1336909532546997,
687
- "learning_rate": 4.957328523976879e-06,
688
- "loss": 0.5852,
689
- "step": 97
690
- },
691
- {
692
- "epoch": 0.6012269938650306,
693
- "grad_norm": 1.8788769245147705,
694
- "learning_rate": 4.956437508956458e-06,
695
- "loss": 0.8707,
696
- "step": 98
697
- },
698
- {
699
- "epoch": 0.6073619631901841,
700
- "grad_norm": 1.7682530879974365,
701
- "learning_rate": 4.9555373689938325e-06,
702
- "loss": 0.7731,
703
- "step": 99
704
- },
705
- {
706
- "epoch": 0.6134969325153374,
707
- "grad_norm": 1.7279331684112549,
708
- "learning_rate": 4.954628107432757e-06,
709
- "loss": 1.0845,
710
- "step": 100
711
- },
712
- {
713
- "epoch": 0.6196319018404908,
714
- "grad_norm": 1.135621190071106,
715
- "learning_rate": 4.95370972765087e-06,
716
- "loss": 0.7108,
717
- "step": 101
718
- },
719
- {
720
- "epoch": 0.6257668711656442,
721
- "grad_norm": 1.1386152505874634,
722
- "learning_rate": 4.952782233059683e-06,
723
- "loss": 0.5538,
724
- "step": 102
725
- },
726
- {
727
- "epoch": 0.6319018404907976,
728
- "grad_norm": 1.5626285076141357,
729
- "learning_rate": 4.951845627104565e-06,
730
- "loss": 0.8767,
731
- "step": 103
732
- },
733
- {
734
- "epoch": 0.6380368098159509,
735
- "grad_norm": 1.5314006805419922,
736
- "learning_rate": 4.95089991326473e-06,
737
- "loss": 0.9553,
738
- "step": 104
739
- },
740
- {
741
- "epoch": 0.6441717791411042,
742
- "grad_norm": 1.241329312324524,
743
- "learning_rate": 4.9499450950532305e-06,
744
- "loss": 0.8938,
745
- "step": 105
746
- },
747
- {
748
- "epoch": 0.6503067484662577,
749
- "grad_norm": 1.809147596359253,
750
- "learning_rate": 4.94898117601693e-06,
751
- "loss": 0.9918,
752
- "step": 106
753
- },
754
- {
755
- "epoch": 0.656441717791411,
756
- "grad_norm": 2.3991501331329346,
757
- "learning_rate": 4.948008159736507e-06,
758
- "loss": 0.8674,
759
- "step": 107
760
- },
761
- {
762
- "epoch": 0.6625766871165644,
763
- "grad_norm": 1.1829618215560913,
764
- "learning_rate": 4.94702604982643e-06,
765
- "loss": 0.5653,
766
- "step": 108
767
- },
768
- {
769
- "epoch": 0.6687116564417178,
770
- "grad_norm": 1.2407200336456299,
771
- "learning_rate": 4.9460348499349485e-06,
772
- "loss": 0.7751,
773
- "step": 109
774
- },
775
- {
776
- "epoch": 0.6748466257668712,
777
- "grad_norm": 1.179889440536499,
778
- "learning_rate": 4.945034563744077e-06,
779
- "loss": 0.6028,
780
- "step": 110
781
- },
782
- {
783
- "epoch": 0.6809815950920245,
784
- "grad_norm": 1.201215386390686,
785
- "learning_rate": 4.944025194969586e-06,
786
- "loss": 0.6137,
787
- "step": 111
788
- },
789
- {
790
- "epoch": 0.6871165644171779,
791
- "grad_norm": 1.4372074604034424,
792
- "learning_rate": 4.9430067473609825e-06,
793
- "loss": 0.8444,
794
- "step": 112
795
- },
796
- {
797
- "epoch": 0.6932515337423313,
798
- "grad_norm": 1.140851616859436,
799
- "learning_rate": 4.941979224701499e-06,
800
- "loss": 0.7984,
801
- "step": 113
802
- },
803
- {
804
- "epoch": 0.6993865030674846,
805
- "grad_norm": 1.5855345726013184,
806
- "learning_rate": 4.94094263080808e-06,
807
- "loss": 0.8791,
808
- "step": 114
809
- },
810
- {
811
- "epoch": 0.7055214723926381,
812
- "grad_norm": 1.6596401929855347,
813
- "learning_rate": 4.939896969531367e-06,
814
- "loss": 1.0894,
815
- "step": 115
816
- },
817
- {
818
- "epoch": 0.7116564417177914,
819
- "grad_norm": 1.1690044403076172,
820
- "learning_rate": 4.938842244755683e-06,
821
- "loss": 0.8483,
822
- "step": 116
823
- },
824
- {
825
- "epoch": 0.7177914110429447,
826
- "grad_norm": 1.5814299583435059,
827
- "learning_rate": 4.937778460399022e-06,
828
- "loss": 0.9714,
829
- "step": 117
830
- },
831
- {
832
- "epoch": 0.7239263803680982,
833
- "grad_norm": 1.1886773109436035,
834
- "learning_rate": 4.936705620413028e-06,
835
- "loss": 0.5003,
836
- "step": 118
837
- },
838
- {
839
- "epoch": 0.7300613496932515,
840
- "grad_norm": 1.0248996019363403,
841
- "learning_rate": 4.935623728782986e-06,
842
- "loss": 0.5501,
843
- "step": 119
844
- },
845
- {
846
- "epoch": 0.7361963190184049,
847
- "grad_norm": 1.3180006742477417,
848
- "learning_rate": 4.934532789527805e-06,
849
- "loss": 0.9225,
850
- "step": 120
851
- },
852
- {
853
- "epoch": 0.7423312883435583,
854
- "grad_norm": 1.9640899896621704,
855
- "learning_rate": 4.933432806700004e-06,
856
- "loss": 0.6739,
857
- "step": 121
858
- },
859
- {
860
- "epoch": 0.7484662576687117,
861
- "grad_norm": 1.2381649017333984,
862
- "learning_rate": 4.932323784385693e-06,
863
- "loss": 0.8245,
864
- "step": 122
865
- },
866
- {
867
- "epoch": 0.754601226993865,
868
- "grad_norm": 1.220119833946228,
869
- "learning_rate": 4.931205726704566e-06,
870
- "loss": 0.7057,
871
- "step": 123
872
- },
873
- {
874
- "epoch": 0.7607361963190185,
875
- "grad_norm": 1.1197577714920044,
876
- "learning_rate": 4.930078637809878e-06,
877
- "loss": 0.7426,
878
- "step": 124
879
- },
880
- {
881
- "epoch": 0.7668711656441718,
882
- "grad_norm": 0.980256974697113,
883
- "learning_rate": 4.928942521888431e-06,
884
- "loss": 0.6393,
885
- "step": 125
886
- },
887
- {
888
- "epoch": 0.7730061349693251,
889
- "grad_norm": 1.0923237800598145,
890
- "learning_rate": 4.927797383160561e-06,
891
- "loss": 0.943,
892
- "step": 126
893
- },
894
- {
895
- "epoch": 0.7791411042944786,
896
- "grad_norm": 1.1057695150375366,
897
- "learning_rate": 4.926643225880123e-06,
898
- "loss": 0.5529,
899
- "step": 127
900
- },
901
- {
902
- "epoch": 0.7852760736196319,
903
- "grad_norm": 1.3289391994476318,
904
- "learning_rate": 4.925480054334471e-06,
905
- "loss": 0.6525,
906
- "step": 128
907
- },
908
- {
909
- "epoch": 0.7914110429447853,
910
- "grad_norm": 1.276287317276001,
911
- "learning_rate": 4.924307872844444e-06,
912
- "loss": 1.0422,
913
- "step": 129
914
- },
915
- {
916
- "epoch": 0.7975460122699386,
917
- "grad_norm": 1.1904211044311523,
918
- "learning_rate": 4.923126685764351e-06,
919
- "loss": 0.7181,
920
- "step": 130
921
- },
922
- {
923
- "epoch": 0.803680981595092,
924
- "grad_norm": 1.7319759130477905,
925
- "learning_rate": 4.921936497481956e-06,
926
- "loss": 0.7125,
927
- "step": 131
928
- },
929
- {
930
- "epoch": 0.8098159509202454,
931
- "grad_norm": 1.0085227489471436,
932
- "learning_rate": 4.920737312418456e-06,
933
- "loss": 0.6521,
934
- "step": 132
935
- },
936
- {
937
- "epoch": 0.8159509202453987,
938
- "grad_norm": 1.4816420078277588,
939
- "learning_rate": 4.919529135028473e-06,
940
- "loss": 0.8476,
941
- "step": 133
942
- },
943
- {
944
- "epoch": 0.8220858895705522,
945
- "grad_norm": 1.2678344249725342,
946
- "learning_rate": 4.918311969800027e-06,
947
- "loss": 0.6699,
948
- "step": 134
949
- },
950
- {
951
- "epoch": 0.8282208588957055,
952
- "grad_norm": 3.9976818561553955,
953
- "learning_rate": 4.917085821254532e-06,
954
- "loss": 0.7491,
955
- "step": 135
956
- },
957
- {
958
- "epoch": 0.8343558282208589,
959
- "grad_norm": 1.8387072086334229,
960
- "learning_rate": 4.915850693946766e-06,
961
- "loss": 0.4094,
962
- "step": 136
963
- },
964
- {
965
- "epoch": 0.8404907975460123,
966
- "grad_norm": 1.0204946994781494,
967
- "learning_rate": 4.914606592464865e-06,
968
- "loss": 0.6877,
969
- "step": 137
970
- },
971
- {
972
- "epoch": 0.8466257668711656,
973
- "grad_norm": 1.5259982347488403,
974
- "learning_rate": 4.9133535214303e-06,
975
- "loss": 0.918,
976
- "step": 138
977
- },
978
- {
979
- "epoch": 0.852760736196319,
980
- "grad_norm": 1.569480299949646,
981
- "learning_rate": 4.91209148549786e-06,
982
- "loss": 0.856,
983
- "step": 139
984
- },
985
- {
986
- "epoch": 0.8588957055214724,
987
- "grad_norm": 1.1170610189437866,
988
- "learning_rate": 4.910820489355637e-06,
989
- "loss": 0.7092,
990
- "step": 140
991
- },
992
- {
993
- "epoch": 0.8650306748466258,
994
- "grad_norm": 1.167941927909851,
995
- "learning_rate": 4.909540537725007e-06,
996
- "loss": 0.5254,
997
- "step": 141
998
- },
999
- {
1000
- "epoch": 0.8711656441717791,
1001
- "grad_norm": 1.3118079900741577,
1002
- "learning_rate": 4.908251635360616e-06,
1003
- "loss": 0.9857,
1004
- "step": 142
1005
- },
1006
- {
1007
- "epoch": 0.8773006134969326,
1008
- "grad_norm": 1.121370792388916,
1009
- "learning_rate": 4.906953787050354e-06,
1010
- "loss": 0.6619,
1011
- "step": 143
1012
- },
1013
- {
1014
- "epoch": 0.8834355828220859,
1015
- "grad_norm": 1.0083792209625244,
1016
- "learning_rate": 4.905646997615347e-06,
1017
- "loss": 0.5386,
1018
- "step": 144
1019
- },
1020
- {
1021
- "epoch": 0.8895705521472392,
1022
- "grad_norm": 1.318919062614441,
1023
- "learning_rate": 4.904331271909932e-06,
1024
- "loss": 0.7674,
1025
- "step": 145
1026
- },
1027
- {
1028
- "epoch": 0.8957055214723927,
1029
- "grad_norm": 1.4043550491333008,
1030
- "learning_rate": 4.903006614821645e-06,
1031
- "loss": 0.591,
1032
- "step": 146
1033
- },
1034
- {
1035
- "epoch": 0.901840490797546,
1036
- "grad_norm": 1.0472655296325684,
1037
- "learning_rate": 4.901673031271194e-06,
1038
- "loss": 0.5931,
1039
- "step": 147
1040
- },
1041
- {
1042
- "epoch": 0.9079754601226994,
1043
- "grad_norm": 1.239875078201294,
1044
- "learning_rate": 4.900330526212451e-06,
1045
- "loss": 0.5277,
1046
- "step": 148
1047
- },
1048
- {
1049
- "epoch": 0.9141104294478528,
1050
- "grad_norm": 1.1664248704910278,
1051
- "learning_rate": 4.898979104632427e-06,
1052
- "loss": 0.8526,
1053
- "step": 149
1054
- },
1055
- {
1056
- "epoch": 0.9202453987730062,
1057
- "grad_norm": 1.2190073728561401,
1058
- "learning_rate": 4.897618771551255e-06,
1059
- "loss": 0.5936,
1060
- "step": 150
1061
- },
1062
- {
1063
- "epoch": 0.9263803680981595,
1064
- "grad_norm": 1.1450576782226562,
1065
- "learning_rate": 4.8962495320221714e-06,
1066
- "loss": 0.6725,
1067
- "step": 151
1068
- },
1069
- {
1070
- "epoch": 0.9325153374233128,
1071
- "grad_norm": 1.420145869255066,
1072
- "learning_rate": 4.8948713911315e-06,
1073
- "loss": 0.751,
1074
- "step": 152
1075
- },
1076
- {
1077
- "epoch": 0.9386503067484663,
1078
- "grad_norm": 1.2019397020339966,
1079
- "learning_rate": 4.8934843539986266e-06,
1080
- "loss": 0.6503,
1081
- "step": 153
1082
- },
1083
- {
1084
- "epoch": 0.9447852760736196,
1085
- "grad_norm": 1.0950829982757568,
1086
- "learning_rate": 4.892088425775986e-06,
1087
- "loss": 0.8032,
1088
- "step": 154
1089
- },
1090
- {
1091
- "epoch": 0.950920245398773,
1092
- "grad_norm": 1.1624736785888672,
1093
- "learning_rate": 4.890683611649041e-06,
1094
- "loss": 0.7497,
1095
- "step": 155
1096
- },
1097
- {
1098
- "epoch": 0.9570552147239264,
1099
- "grad_norm": 1.9366374015808105,
1100
- "learning_rate": 4.8892699168362626e-06,
1101
- "loss": 0.7442,
1102
- "step": 156
1103
- },
1104
- {
1105
- "epoch": 0.9631901840490797,
1106
- "grad_norm": 1.1350420713424683,
1107
- "learning_rate": 4.887847346589111e-06,
1108
- "loss": 0.5686,
1109
- "step": 157
1110
- },
1111
- {
1112
- "epoch": 0.9693251533742331,
1113
- "grad_norm": 1.0068031549453735,
1114
- "learning_rate": 4.886415906192015e-06,
1115
- "loss": 0.3893,
1116
- "step": 158
1117
- },
1118
- {
1119
- "epoch": 0.9754601226993865,
1120
- "grad_norm": 1.1681759357452393,
1121
- "learning_rate": 4.884975600962355e-06,
1122
- "loss": 0.7812,
1123
- "step": 159
1124
- },
1125
- {
1126
- "epoch": 0.9815950920245399,
1127
- "grad_norm": 1.234325885772705,
1128
- "learning_rate": 4.883526436250441e-06,
1129
- "loss": 0.6285,
1130
- "step": 160
1131
- },
1132
- {
1133
- "epoch": 0.9877300613496932,
1134
- "grad_norm": 1.049235463142395,
1135
- "learning_rate": 4.8820684174394935e-06,
1136
- "loss": 0.7039,
1137
- "step": 161
1138
- },
1139
- {
1140
- "epoch": 0.9938650306748467,
1141
- "grad_norm": 1.4013290405273438,
1142
- "learning_rate": 4.880601549945622e-06,
1143
- "loss": 0.9962,
1144
- "step": 162
1145
- },
1146
- {
1147
- "epoch": 1.0,
1148
- "grad_norm": 1.03646719455719,
1149
- "learning_rate": 4.879125839217808e-06,
1150
- "loss": 0.7865,
1151
- "step": 163
1152
- }
1153
- ],
1154
- "logging_steps": 1,
1155
- "max_steps": 1630,
1156
- "num_input_tokens_seen": 0,
1157
- "num_train_epochs": 10,
1158
- "save_steps": 208,
1159
- "stateful_callbacks": {
1160
- "TrainerControl": {
1161
- "args": {
1162
- "should_epoch_stop": false,
1163
- "should_evaluate": false,
1164
- "should_log": false,
1165
- "should_save": true,
1166
- "should_training_stop": false
1167
- },
1168
- "attributes": {}
1169
- }
1170
- },
1171
- "total_flos": 4.157944675853926e+16,
1172
- "train_batch_size": 1,
1173
- "trial_name": null,
1174
- "trial_params": null
1175
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-163/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
limo_filtered_correct/checkpoint-1630/added_tokens.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "</tool_call>": 151658,
3
- "<tool_call>": 151657,
4
- "<|box_end|>": 151649,
5
- "<|box_start|>": 151648,
6
- "<|endoftext|>": 151643,
7
- "<|file_sep|>": 151664,
8
- "<|fim_middle|>": 151660,
9
- "<|fim_pad|>": 151662,
10
- "<|fim_prefix|>": 151659,
11
- "<|fim_suffix|>": 151661,
12
- "<|im_end|>": 151645,
13
- "<|im_start|>": 151644,
14
- "<|image_pad|>": 151655,
15
- "<|object_ref_end|>": 151647,
16
- "<|object_ref_start|>": 151646,
17
- "<|quad_end|>": 151651,
18
- "<|quad_start|>": 151650,
19
- "<|repo_name|>": 151663,
20
- "<|video_pad|>": 151656,
21
- "<|vision_end|>": 151653,
22
- "<|vision_pad|>": 151654,
23
- "<|vision_start|>": 151652
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1630/chat_template.jinja DELETED
@@ -1,54 +0,0 @@
1
- {%- if tools %}
2
- {{- '<|im_start|>system\n' }}
3
- {%- if messages[0]['role'] == 'system' %}
4
- {{- messages[0]['content'] }}
5
- {%- else %}
6
- {{- 'Please reason step by step, and put your final answer within \\boxed{}.' }}
7
- {%- endif %}
8
- {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
- {%- for tool in tools %}
10
- {{- "\n" }}
11
- {{- tool | tojson }}
12
- {%- endfor %}
13
- {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
- {%- else %}
15
- {%- if messages[0]['role'] == 'system' %}
16
- {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
- {%- else %}
18
- {{- '<|im_start|>system\nPlease reason step by step, and put your final answer within \\boxed{}.<|im_end|>\n' }}
19
- {%- endif %}
20
- {%- endif %}
21
- {%- for message in messages %}
22
- {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
- {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
- {%- elif message.role == "assistant" %}
25
- {{- '<|im_start|>' + message.role }}
26
- {%- if message.content %}
27
- {{- '\n' + message.content }}
28
- {%- endif %}
29
- {%- for tool_call in message.tool_calls %}
30
- {%- if tool_call.function is defined %}
31
- {%- set tool_call = tool_call.function %}
32
- {%- endif %}
33
- {{- '\n<tool_call>\n{"name": "' }}
34
- {{- tool_call.name }}
35
- {{- '", "arguments": ' }}
36
- {{- tool_call.arguments | tojson }}
37
- {{- '}\n</tool_call>' }}
38
- {%- endfor %}
39
- {{- '<|im_end|>\n' }}
40
- {%- elif message.role == "tool" %}
41
- {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
- {{- '<|im_start|>user' }}
43
- {%- endif %}
44
- {{- '\n<tool_response>\n' }}
45
- {{- message.content }}
46
- {{- '\n</tool_response>' }}
47
- {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
- {{- '<|im_end|>\n' }}
49
- {%- endif %}
50
- {%- endif %}
51
- {%- endfor %}
52
- {%- if add_generation_prompt %}
53
- {{- '<|im_start|>assistant\n' }}
54
- {%- endif %}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1630/config.json DELETED
@@ -1,58 +0,0 @@
1
- {
2
- "architectures": [
3
- "Qwen2ForCausalLM"
4
- ],
5
- "attention_dropout": 0.0,
6
- "bos_token_id": 151643,
7
- "eos_token_id": 151645,
8
- "hidden_act": "silu",
9
- "hidden_size": 3584,
10
- "initializer_range": 0.02,
11
- "intermediate_size": 18944,
12
- "layer_types": [
13
- "full_attention",
14
- "full_attention",
15
- "full_attention",
16
- "full_attention",
17
- "full_attention",
18
- "full_attention",
19
- "full_attention",
20
- "full_attention",
21
- "full_attention",
22
- "full_attention",
23
- "full_attention",
24
- "full_attention",
25
- "full_attention",
26
- "full_attention",
27
- "full_attention",
28
- "full_attention",
29
- "full_attention",
30
- "full_attention",
31
- "full_attention",
32
- "full_attention",
33
- "full_attention",
34
- "full_attention",
35
- "full_attention",
36
- "full_attention",
37
- "full_attention",
38
- "full_attention",
39
- "full_attention",
40
- "full_attention"
41
- ],
42
- "max_position_embeddings": 4096,
43
- "max_window_layers": 28,
44
- "model_type": "qwen2",
45
- "num_attention_heads": 28,
46
- "num_hidden_layers": 28,
47
- "num_key_value_heads": 4,
48
- "rms_norm_eps": 1e-06,
49
- "rope_scaling": null,
50
- "rope_theta": 10000.0,
51
- "sliding_window": null,
52
- "tie_word_embeddings": false,
53
- "torch_dtype": "float32",
54
- "transformers_version": "4.55.0",
55
- "use_cache": false,
56
- "use_sliding_window": false,
57
- "vocab_size": 152064
58
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1630/generation_config.json DELETED
@@ -1,9 +0,0 @@
1
- {
2
- "bos_token_id": 151643,
3
- "eos_token_id": [
4
- 151645,
5
- 151643
6
- ],
7
- "pad_token_id": 151643,
8
- "transformers_version": "4.55.0"
9
- }
 
 
 
 
 
 
 
 
 
 
limo_filtered_correct/checkpoint-1630/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
limo_filtered_correct/checkpoint-1630/model.safetensors.index.json DELETED
@@ -1,347 +0,0 @@
1
- {
2
- "metadata": {
3
- "total_parameters": 1903904128,
4
- "total_size": 30462466048
5
- },
6
- "weight_map": {
7
- "lm_head.weight": "model-00007-of-00007.safetensors",
8
- "model.embed_tokens.weight": "model-00001-of-00007.safetensors",
9
- "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors",
10
- "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
11
- "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
12
- "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
13
- "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
14
- "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
15
- "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
16
- "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
17
- "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
18
- "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
19
- "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
20
- "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
21
- "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors",
22
- "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
23
- "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
24
- "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
25
- "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
26
- "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
27
- "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
28
- "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
29
- "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
30
- "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
31
- "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
32
- "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
33
- "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors",
34
- "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
35
- "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
36
- "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
37
- "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
38
- "model.layers.10.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
39
- "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
40
- "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
41
- "model.layers.10.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
42
- "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
43
- "model.layers.10.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
44
- "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
45
- "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors",
46
- "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
47
- "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
48
- "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
49
- "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
50
- "model.layers.11.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
51
- "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
52
- "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
53
- "model.layers.11.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
54
- "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
55
- "model.layers.11.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
56
- "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
57
- "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors",
58
- "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
59
- "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
60
- "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
61
- "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
62
- "model.layers.12.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
63
- "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
64
- "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
65
- "model.layers.12.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
66
- "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
67
- "model.layers.12.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
68
- "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
69
- "model.layers.13.input_layernorm.weight": "model-00004-of-00007.safetensors",
70
- "model.layers.13.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
71
- "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
72
- "model.layers.13.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
73
- "model.layers.13.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
74
- "model.layers.13.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
75
- "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
76
- "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
77
- "model.layers.13.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
78
- "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
79
- "model.layers.13.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
80
- "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
81
- "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors",
82
- "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
83
- "model.layers.14.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
84
- "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
85
- "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
86
- "model.layers.14.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
87
- "model.layers.14.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
88
- "model.layers.14.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
89
- "model.layers.14.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
90
- "model.layers.14.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
91
- "model.layers.14.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
92
- "model.layers.14.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
93
- "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors",
94
- "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
95
- "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
96
- "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
97
- "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
98
- "model.layers.15.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
99
- "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
100
- "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
101
- "model.layers.15.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
102
- "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
103
- "model.layers.15.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
104
- "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
105
- "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors",
106
- "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
107
- "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
108
- "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
109
- "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
110
- "model.layers.16.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
111
- "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
112
- "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
113
- "model.layers.16.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
114
- "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
115
- "model.layers.16.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
116
- "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
117
- "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors",
118
- "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
119
- "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
120
- "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
121
- "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
122
- "model.layers.17.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
123
- "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
124
- "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
125
- "model.layers.17.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
126
- "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
127
- "model.layers.17.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
128
- "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
129
- "model.layers.18.input_layernorm.weight": "model-00005-of-00007.safetensors",
130
- "model.layers.18.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
131
- "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
132
- "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
133
- "model.layers.18.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
134
- "model.layers.18.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
135
- "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
136
- "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
137
- "model.layers.18.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
138
- "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
139
- "model.layers.18.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
140
- "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
141
- "model.layers.19.input_layernorm.weight": "model-00005-of-00007.safetensors",
142
- "model.layers.19.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
143
- "model.layers.19.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
144
- "model.layers.19.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
145
- "model.layers.19.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
146
- "model.layers.19.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
147
- "model.layers.19.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
148
- "model.layers.19.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
149
- "model.layers.19.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
150
- "model.layers.19.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
151
- "model.layers.19.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
152
- "model.layers.19.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
153
- "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors",
154
- "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
155
- "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
156
- "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
157
- "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
158
- "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
159
- "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
160
- "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
161
- "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
162
- "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
163
- "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
164
- "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
165
- "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors",
166
- "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
167
- "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
168
- "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
169
- "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
170
- "model.layers.20.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
171
- "model.layers.20.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
172
- "model.layers.20.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
173
- "model.layers.20.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
174
- "model.layers.20.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
175
- "model.layers.20.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
176
- "model.layers.20.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
177
- "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors",
178
- "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
179
- "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
180
- "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
181
- "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
182
- "model.layers.21.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
183
- "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
184
- "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
185
- "model.layers.21.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
186
- "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
187
- "model.layers.21.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
188
- "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
189
- "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors",
190
- "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
191
- "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
192
- "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
193
- "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
194
- "model.layers.22.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
195
- "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
196
- "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
197
- "model.layers.22.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
198
- "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
199
- "model.layers.22.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
200
- "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
201
- "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors",
202
- "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
203
- "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
204
- "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
205
- "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
206
- "model.layers.23.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
207
- "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
208
- "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
209
- "model.layers.23.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
210
- "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
211
- "model.layers.23.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
212
- "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
213
- "model.layers.24.input_layernorm.weight": "model-00006-of-00007.safetensors",
214
- "model.layers.24.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
215
- "model.layers.24.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
216
- "model.layers.24.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
217
- "model.layers.24.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
218
- "model.layers.24.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
219
- "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
220
- "model.layers.24.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
221
- "model.layers.24.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
222
- "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
223
- "model.layers.24.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
224
- "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
225
- "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors",
226
- "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
227
- "model.layers.25.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
228
- "model.layers.25.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
229
- "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
230
- "model.layers.25.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
231
- "model.layers.25.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
232
- "model.layers.25.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
233
- "model.layers.25.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
234
- "model.layers.25.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
235
- "model.layers.25.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
236
- "model.layers.25.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
237
- "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors",
238
- "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
239
- "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
240
- "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
241
- "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
242
- "model.layers.26.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
243
- "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
244
- "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
245
- "model.layers.26.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
246
- "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
247
- "model.layers.26.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
248
- "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
249
- "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors",
250
- "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
251
- "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
252
- "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
253
- "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
254
- "model.layers.27.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
255
- "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
256
- "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
257
- "model.layers.27.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
258
- "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
259
- "model.layers.27.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
260
- "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
261
- "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors",
262
- "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
263
- "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
264
- "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
265
- "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
266
- "model.layers.3.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
267
- "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
268
- "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
269
- "model.layers.3.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
270
- "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
271
- "model.layers.3.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
272
- "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
273
- "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors",
274
- "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
275
- "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
276
- "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
277
- "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
278
- "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
279
- "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
280
- "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
281
- "model.layers.4.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
282
- "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
283
- "model.layers.4.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
284
- "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
285
- "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors",
286
- "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
287
- "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
288
- "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
289
- "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
290
- "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
291
- "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
292
- "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
293
- "model.layers.5.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
294
- "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
295
- "model.layers.5.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
296
- "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
297
- "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors",
298
- "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
299
- "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
300
- "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
301
- "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
302
- "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
303
- "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
304
- "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
305
- "model.layers.6.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
306
- "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
307
- "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
308
- "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
309
- "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors",
310
- "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
311
- "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
312
- "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
313
- "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
314
- "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
315
- "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
316
- "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
317
- "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
318
- "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
319
- "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
320
- "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
321
- "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors",
322
- "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
323
- "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
324
- "model.layers.8.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
325
- "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
326
- "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
327
- "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
328
- "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
329
- "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
330
- "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
331
- "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
332
- "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
333
- "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors",
334
- "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
335
- "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
336
- "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
337
- "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
338
- "model.layers.9.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
339
- "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
340
- "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
341
- "model.layers.9.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
342
- "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
343
- "model.layers.9.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
344
- "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
345
- "model.norm.weight": "model-00006-of-00007.safetensors"
346
- }
347
- }