SoonOk committed · Commit b6c054d · verified · 1 Parent(s): e2e28dc

Upload folder using huggingface_hub
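The commit message indicates the files were pushed with the `huggingface_hub` client. A minimal sketch of such an upload, assuming `HfApi.upload_folder`; the local folder name and `repo_id` are placeholders, since the page does not show them:

```python
from huggingface_hub import HfApi

# Sketch of an upload like this commit's; folder name and repo_id are
# placeholders (the actual repository id is not shown on this page).
api = HfApi()
api.upload_folder(
    folder_path="Alpha_1_Beta_0",   # hypothetical local folder
    path_in_repo="V1",              # the files in this commit land under V1/
    repo_id="SoonOk/<repo>",        # hypothetical
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```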
.gitattributes CHANGED
@@ -35,3 +35,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 V2/checkpoint-620/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 V2/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ V1/checkpoint-620/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ V1/tokenizer.json filter=lfs diff=lfs merge=lfs -text
V1/README.md ADDED
@@ -0,0 +1,67 @@
+ ---
+ base_model: SoonOk/SFTWerewolf
+ library_name: transformers
+ model_name: Alpha_1_Beta_0
+ tags:
+ - generated_from_trainer
+ - kto
+ - trl
+ license: license
+ ---
+
+ # Model Card for Alpha_1_Beta_0
+
+ This model is a fine-tuned version of [SoonOk/SFTWerewolf](https://huggingface.co/SoonOk/SFTWerewolf).
+ It has been trained using [TRL](https://github.com/huggingface/trl).
+
+ ## Quick start
+
+ ```python
+ from transformers import pipeline
+
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+ generator = pipeline("text-generation", model="<hub-model-id>", device="cuda")  # the generated card had model="None"; substitute this model's actual Hub id
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+ print(output["generated_text"])
+ ```
+
+ ## Training procedure
+
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/hwangyechan4-unist-/auxmakto/runs/ysr0qjk0)
+
+
+ This model was trained with KTO, a method introduced in [KTO: Model Alignment as Prospect Theoretic Optimization](https://huggingface.co/papers/2402.01306).
+
+ ### Framework versions
+
+ - TRL: 0.26.2
+ - Transformers: 4.57.5
+ - Pytorch: 2.8.0
+ - Datasets: 4.4.2
+ - Tokenizers: 0.22.2
+
+ ## Citations
+
+ Cite KTO as:
+
+ ```bibtex
+ @article{ethayarajh2024kto,
+ title = {{KTO: Model Alignment as Prospect Theoretic Optimization}},
+ author = {Kawin Ethayarajh and Winnie Xu and Niklas Muennighoff and Dan Jurafsky and Douwe Kiela},
+ year = 2024,
+ eprint = {arXiv:2402.01306},
+ }
+ ```
+
+ Cite TRL as:
+
+ ```bibtex
+ @misc{vonwerra2022trl,
+ title = {{TRL: Transformer Reinforcement Learning}},
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+ year = 2020,
+ journal = {GitHub repository},
+ publisher = {GitHub},
+ howpublished = {\url{https://github.com/huggingface/trl}}
+ }
+ ```
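The card above says the model was trained with KTO via TRL but includes no training code. A minimal sketch of such a run, assuming TRL's `KTOTrainer` (present in the listed TRL 0.26) and an unpaired preference dataset with `prompt`, `completion`, and boolean `label` columns; the toy row and hyperparameters are illustrative, not the actual recipe:

```python
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import KTOConfig, KTOTrainer

model = AutoModelForCausalLM.from_pretrained("SoonOk/SFTWerewolf")
tokenizer = AutoTokenizer.from_pretrained("SoonOk/SFTWerewolf")

# KTO consumes unpaired examples: each completion is marked desirable (True)
# or undesirable (False). This single row is a stand-in for the real data.
train_dataset = Dataset.from_dict({
    "prompt": [[{"role": "user", "content": "Vote for tonight's suspect."}]],
    "completion": [[{"role": "assistant", "content": "I vote for Player 3."}]],
    "label": [True],
})

args = KTOConfig(output_dir="Alpha_1_Beta_0", per_device_train_batch_size=1)
trainer = KTOTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    processing_class=tokenizer,
)
trainer.train()
```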
V1/added_tokens.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "</tool_call>": 151658,
+ "<tool_call>": 151657,
+ "<|box_end|>": 151649,
+ "<|box_start|>": 151648,
+ "<|endoftext|>": 151643,
+ "<|file_sep|>": 151664,
+ "<|fim_middle|>": 151660,
+ "<|fim_pad|>": 151662,
+ "<|fim_prefix|>": 151659,
+ "<|fim_suffix|>": 151661,
+ "<|im_end|>": 151645,
+ "<|im_start|>": 151644,
+ "<|image_pad|>": 151655,
+ "<|object_ref_end|>": 151647,
+ "<|object_ref_start|>": 151646,
+ "<|quad_end|>": 151651,
+ "<|quad_start|>": 151650,
+ "<|repo_name|>": 151663,
+ "<|video_pad|>": 151656,
+ "<|vision_end|>": 151653,
+ "<|vision_pad|>": 151654,
+ "<|vision_start|>": 151652
+ }
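A quick way to confirm the ids above once the `V1` folder is checked out locally (the local path is an assumption):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./V1")
# These ids come straight from added_tokens.json above.
assert tok.convert_tokens_to_ids("<|im_end|>") == 151645
assert tok.convert_tokens_to_ids("<|endoftext|>") == 151643
assert tok.convert_tokens_to_ids("<tool_call>") == 151657
```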
V1/aux_head.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:715a9e055ff85d7bc07c8deb744d309eecb930a823f0a4a079a0d97695bf22d7
+ size 223213
V1/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
+ {%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0]['role'] == 'system' %}
+ {{- messages[0]['content'] }}
+ {%- else %}
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+ {%- endif %}
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+ {%- else %}
+ {%- if messages[0]['role'] == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+ {%- else %}
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {{- '<|im_start|>' + message.role }}
+ {%- if message.content %}
+ {{- '\n' + message.content }}
+ {%- endif %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if tool_call.function is defined %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {{- tool_call.arguments | tojson }}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- endfor %}
+ {%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- endif %}
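Rendering this template through the tokenizer shows its behavior without tools: no system message is supplied, so the default Qwen system prompt is injected. The local path is an assumption:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./V1")
messages = [{"role": "user", "content": "Who acted suspiciously last night?"}]
# add_generation_prompt=True appends the opening assistant turn.
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# <|im_start|>system
# You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
# <|im_start|>user
# Who acted suspiciously last night?<|im_end|>
# <|im_start|>assistant
```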
V1/checkpoint-620/added_tokens.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "</tool_call>": 151658,
+ "<tool_call>": 151657,
+ "<|box_end|>": 151649,
+ "<|box_start|>": 151648,
+ "<|endoftext|>": 151643,
+ "<|file_sep|>": 151664,
+ "<|fim_middle|>": 151660,
+ "<|fim_pad|>": 151662,
+ "<|fim_prefix|>": 151659,
+ "<|fim_suffix|>": 151661,
+ "<|im_end|>": 151645,
+ "<|im_start|>": 151644,
+ "<|image_pad|>": 151655,
+ "<|object_ref_end|>": 151647,
+ "<|object_ref_start|>": 151646,
+ "<|quad_end|>": 151651,
+ "<|quad_start|>": 151650,
+ "<|repo_name|>": 151663,
+ "<|video_pad|>": 151656,
+ "<|vision_end|>": 151653,
+ "<|vision_pad|>": 151654,
+ "<|vision_start|>": 151652
+ }
V1/checkpoint-620/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
+ {%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0]['role'] == 'system' %}
+ {{- messages[0]['content'] }}
+ {%- else %}
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+ {%- endif %}
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+ {%- else %}
+ {%- if messages[0]['role'] == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+ {%- else %}
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {{- '<|im_start|>' + message.role }}
+ {%- if message.content %}
+ {{- '\n' + message.content }}
+ {%- endif %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if tool_call.function is defined %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {{- tool_call.arguments | tojson }}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- endfor %}
+ {%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- endif %}
V1/checkpoint-620/config.json ADDED
@@ -0,0 +1,67 @@
+ {
+ "architectures": [
+ "Qwen2ForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "dtype": "bfloat16",
+ "eos_token_id": 151645,
+ "hidden_act": "silu",
+ "hidden_size": 2048,
+ "initializer_range": 0.02,
+ "intermediate_size": 11008,
+ "layer_types": [
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention"
+ ],
+ "max_position_embeddings": 32768,
+ "max_window_layers": 70,
+ "model_type": "qwen2",
+ "num_attention_heads": 16,
+ "num_hidden_layers": 36,
+ "num_key_value_heads": 2,
+ "output_hidden_states": true,
+ "pad_token_id": 151643,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": null,
+ "rope_theta": 1000000.0,
+ "sliding_window": null,
+ "tie_word_embeddings": true,
+ "transformers_version": "4.57.5",
+ "use_cache": false,
+ "use_sliding_window": false,
+ "vocab_size": 151665
+ }
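A short sanity check of this config as loaded by Transformers; the local checkpoint path is an assumption. Note `"output_hidden_states": true`: a forward pass returns 37 hidden-state tensors (embeddings plus one per layer) by default.

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("./V1/checkpoint-620")
assert cfg.model_type == "qwen2"
assert cfg.num_hidden_layers == 36 and cfg.hidden_size == 2048
assert cfg.output_hidden_states  # hidden states returned on every forward
```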
V1/checkpoint-620/generation_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+ "do_sample": true,
+ "eos_token_id": [
+ 151645,
+ 151643
+ ],
+ "pad_token_id": 151643,
+ "repetition_penalty": 1.05,
+ "temperature": 0.7,
+ "top_k": 20,
+ "top_p": 0.8,
+ "transformers_version": "4.57.5"
+ }
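`generate()` picks these sampling defaults up from `generation_config.json` automatically; a sketch, with the local path and prompt as assumptions:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("./V1/checkpoint-620")
tok = AutoTokenizer.from_pretrained("./V1/checkpoint-620")

inputs = tok("The seer says:", return_tensors="pt")
# temperature=0.7, top_p=0.8, top_k=20 and repetition_penalty=1.05 all come
# from the checkpoint's generation config; nothing is passed explicitly.
out = model.generate(**inputs, max_new_tokens=32)
print(tok.decode(out[0], skip_special_tokens=True))
```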
V1/checkpoint-620/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
V1/checkpoint-620/model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3759239661c4f840b1e57264ea16334300416730fd41bde367ac78a91d1e156f
+ size 4956450288
V1/checkpoint-620/model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eac8e65f7dd1a9d359c9d2bcc6c1c5693040b995f76be725d8c64ce36c294ad1
+ size 1214588148
V1/checkpoint-620/model.safetensors.index.json ADDED
@@ -0,0 +1,444 @@
+ {
+ "metadata": {
+ "total_parameters": 3085494326,
+ "total_size": 6170988652
+ },
+ "weight_map": {
+ "aux_head.bias": "model-00002-of-00002.safetensors",
+ "aux_head.weight": "model-00002-of-00002.safetensors",
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.24.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.24.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.24.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.25.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.25.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.25.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.26.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.26.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.26.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.27.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.27.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.27.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.27.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.27.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.28.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.28.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.28.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.28.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.28.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.28.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.28.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.29.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+ "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.29.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+ "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.29.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+ "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.30.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+ "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.30.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+ "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.30.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+ "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.31.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+ "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.31.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+ "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.31.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+ "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.32.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+ "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.32.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+ "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.32.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+ "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.33.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+ "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.33.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+ "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.33.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+ "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.34.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+ "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.34.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+ "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.34.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+ "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.35.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+ "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.35.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+ "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.35.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+ "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.norm.weight": "model-00002-of-00002.safetensors"
+ }
+ }
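The index maps every parameter name to the shard that stores it; Transformers reads it transparently when loading, but it can also be inspected directly (local path assumed):

```python
import json

with open("V1/checkpoint-620/model.safetensors.index.json") as f:
    index = json.load(f)

print(index["metadata"]["total_size"])         # 6170988652 bytes (~6.2 GB)
print(index["weight_map"]["aux_head.weight"])  # model-00002-of-00002.safetensors
```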
V1/checkpoint-620/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:52ab41abf19a27645f1c06913dbf561f09531be6e0707d4a4763009945e2566c
+ size 12342363635
V1/checkpoint-620/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fc2024de3977e14d8eb49138e464852d3139a95403b42712dff426b122dbd9a8
+ size 14917
V1/checkpoint-620/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9f3ccf71b831b178a658a6e1b5e409218184e44f34db2ecfb332baac716953af
+ size 14917
V1/checkpoint-620/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:21ea3def1bead9ce675c724a153f28a428fc83face40ea6aa04600e378f67f06
+ size 1465
V1/checkpoint-620/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "eos_token": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
V1/checkpoint-620/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+ size 11421896
V1/checkpoint-620/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
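The tokenizer configuration above registers the full Qwen2 special-token inventory (ChatML markers, object/box/quad and vision spans, FIM and tool-call tokens), sets `<|im_end|>` as the EOS token and `<|endoftext|>` as padding, and allows a 131072-token context. A minimal sketch of how those fields surface once the tokenizer is loaded; the repo id and subfolder here are illustrative, substitute the actual path:

```python
from transformers import AutoTokenizer

# Illustrative repo id / subfolder; point this at wherever V1 actually lives.
tok = AutoTokenizer.from_pretrained("SoonOk/Alpha_1_Beta_0", subfolder="V1")

print(tok.eos_token)         # "<|im_end|>", per eos_token above
print(tok.pad_token)         # "<|endoftext|>", per pad_token above
print(tok.model_max_length)  # 131072
# Tokens listed in additional_special_tokens stay atomic during encoding:
print(tok.convert_tokens_to_ids("<|im_start|>"))  # 151644
```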
V1/checkpoint-620/trainer_state.json ADDED
@@ -0,0 +1,1012 @@
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 20.0,
6
+ "eval_steps": 200,
7
+ "global_step": 620,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.32454361054766734,
14
+ "grad_norm": 193.0,
15
+ "kl": 0.18435561656951904,
16
+ "learning_rate": 6e-08,
17
+ "logits/chosen": -60439219.2,
18
+ "logits/rejected": -88406048.0,
19
+ "logps/chosen": -197.2506591796875,
20
+ "logps/rejected": -107.289501953125,
21
+ "loss": 3.5312,
22
+ "rewards/chosen": -0.0054339878261089325,
23
+ "rewards/margins": -0.004661996196955442,
24
+ "rewards/rejected": -0.00077199162915349,
25
+ "step": 10
26
+ },
27
+ {
28
+ "epoch": 0.6490872210953347,
29
+ "grad_norm": 184.0,
30
+ "kl": 0.2278171330690384,
31
+ "learning_rate": 1.2666666666666666e-07,
32
+ "logits/chosen": -60431888.30573248,
33
+ "logits/rejected": -90372359.85276073,
34
+ "logps/chosen": -227.72788117038218,
35
+ "logps/rejected": -115.19334739263803,
36
+ "loss": 3.529,
37
+ "rewards/chosen": -0.005934715650643512,
38
+ "rewards/margins": -0.0023042475658752288,
39
+ "rewards/rejected": -0.0036304680847682835,
40
+ "step": 20
41
+ },
42
+ {
43
+ "epoch": 0.973630831643002,
44
+ "grad_norm": 212.0,
45
+ "kl": 0.25923511385917664,
46
+ "learning_rate": 1.9333333333333332e-07,
47
+ "logits/chosen": -64896310.01834863,
48
+ "logits/rejected": -90925383.15654951,
49
+ "logps/chosen": -202.24849483944953,
50
+ "logps/rejected": -113.89132388178913,
51
+ "loss": 3.493,
52
+ "rewards/chosen": -0.006282405386641849,
53
+ "rewards/margins": 0.008622396832151831,
54
+ "rewards/rejected": -0.01490480221879368,
55
+ "step": 30
56
+ },
57
+ {
58
+ "epoch": 1.2920892494929006,
59
+ "grad_norm": 176.0,
60
+ "kl": 0.1959868222475052,
61
+ "learning_rate": 2.6e-07,
62
+ "logits/chosen": -59720155.54179566,
63
+ "logits/rejected": -88351780.93114755,
64
+ "logps/chosen": -197.90833010835914,
65
+ "logps/rejected": -108.10444415983606,
66
+ "loss": 3.5435,
67
+ "rewards/chosen": 0.006978450544847424,
68
+ "rewards/margins": 0.021960525532540336,
69
+ "rewards/rejected": -0.014982074987692912,
70
+ "step": 40
71
+ },
72
+ {
73
+ "epoch": 1.616632860040568,
74
+ "grad_norm": 183.0,
75
+ "kl": 0.21559596061706543,
76
+ "learning_rate": 3.2666666666666663e-07,
77
+ "logits/chosen": -59467023.25827815,
78
+ "logits/rejected": -90559948.49704142,
79
+ "logps/chosen": -228.09385347682118,
80
+ "logps/rejected": -112.74208348742603,
81
+ "loss": 3.5032,
82
+ "rewards/chosen": 0.0010771212593609135,
83
+ "rewards/margins": 0.03369407513736861,
84
+ "rewards/rejected": -0.0326169538780077,
85
+ "step": 50
86
+ },
87
+ {
88
+ "epoch": 1.9411764705882353,
89
+ "grad_norm": 166.0,
90
+ "kl": 0.15380892157554626,
91
+ "learning_rate": 3.933333333333333e-07,
92
+ "logits/chosen": -65427847.25970149,
93
+ "logits/rejected": -90375137.78360656,
94
+ "logps/chosen": -201.5051072761194,
95
+ "logps/rejected": -115.42854764344263,
96
+ "loss": 3.4839,
97
+ "rewards/chosen": -0.005884787217894597,
98
+ "rewards/margins": 0.059975266923290735,
99
+ "rewards/rejected": -0.06586005414118533,
100
+ "step": 60
101
+ },
102
+ {
103
+ "epoch": 2.259634888438134,
104
+ "grad_norm": 177.0,
105
+ "kl": 0.14670009911060333,
106
+ "learning_rate": 4.6e-07,
107
+ "logits/chosen": -59004054.974358976,
108
+ "logits/rejected": -88976837.67088607,
109
+ "logps/chosen": -197.41224709535257,
110
+ "logps/rejected": -108.29176967958861,
111
+ "loss": 3.5035,
112
+ "rewards/chosen": 0.0009808578552343906,
113
+ "rewards/margins": 0.09745390586015737,
114
+ "rewards/rejected": -0.09647304800492298,
115
+ "step": 70
116
+ },
117
+ {
118
+ "epoch": 2.584178498985801,
119
+ "grad_norm": 147.0,
120
+ "kl": 0.17133259773254395,
121
+ "learning_rate": 5.266666666666666e-07,
122
+ "logits/chosen": -60354307.32467532,
123
+ "logits/rejected": -89784492.72289157,
124
+ "logps/chosen": -219.9271002435065,
125
+ "logps/rejected": -113.98176298945783,
126
+ "loss": 3.4706,
127
+ "rewards/chosen": -0.01354174180464311,
128
+ "rewards/margins": 0.1299854844710652,
129
+ "rewards/rejected": -0.1435272262757083,
130
+ "step": 80
131
+ },
132
+ {
133
+ "epoch": 2.9087221095334685,
134
+ "grad_norm": 197.0,
135
+ "kl": 0.18291868269443512,
136
+ "learning_rate": 5.933333333333334e-07,
137
+ "logits/chosen": -64280637.686746985,
138
+ "logits/rejected": -91105393.03896104,
139
+ "logps/chosen": -209.47286803463857,
140
+ "logps/rejected": -117.5850497159091,
141
+ "loss": 3.4451,
142
+ "rewards/chosen": -0.034842827233923485,
143
+ "rewards/margins": 0.20260719032359223,
144
+ "rewards/rejected": -0.2374500175575157,
145
+ "step": 90
146
+ },
147
+ {
148
+ "epoch": 3.227180527383367,
149
+ "grad_norm": 197.0,
150
+ "kl": 0.15808846056461334,
151
+ "learning_rate": 6.6e-07,
152
+ "logits/chosen": -61499443.52201258,
153
+ "logits/rejected": -88841982.34838709,
154
+ "logps/chosen": -191.46870086477986,
155
+ "logps/rejected": -110.64470766129033,
156
+ "loss": 3.4412,
157
+ "rewards/chosen": -0.02490143655980908,
158
+ "rewards/margins": 0.2772539658390694,
159
+ "rewards/rejected": -0.3021554023988785,
160
+ "step": 100
161
+ },
162
+ {
163
+ "epoch": 3.5517241379310347,
164
+ "grad_norm": 187.0,
165
+ "kl": 0.20476070046424866,
166
+ "learning_rate": 7.266666666666667e-07,
167
+ "logits/chosen": -59606039.973244146,
168
+ "logits/rejected": -90108102.19354838,
169
+ "logps/chosen": -218.58996132943145,
170
+ "logps/rejected": -116.83943135997067,
171
+ "loss": 3.4053,
172
+ "rewards/chosen": -0.04491897889204249,
173
+ "rewards/margins": 0.3912458546913978,
174
+ "rewards/rejected": -0.43616483358344027,
175
+ "step": 110
176
+ },
177
+ {
178
+ "epoch": 3.8762677484787016,
179
+ "grad_norm": 177.0,
180
+ "kl": 0.2384704351425171,
181
+ "learning_rate": 7.933333333333333e-07,
182
+ "logits/chosen": -63627667.66376811,
183
+ "logits/rejected": -90490970.25084746,
184
+ "logps/chosen": -210.72527173913045,
185
+ "logps/rejected": -119.54580243644068,
186
+ "loss": 3.3335,
187
+ "rewards/chosen": -0.08938233195871546,
188
+ "rewards/margins": 0.5145636426175557,
189
+ "rewards/rejected": -0.6039459745762712,
190
+ "step": 120
191
+ },
192
+ {
193
+ "epoch": 4.1947261663286,
194
+ "grad_norm": 180.0,
195
+ "kl": 0.1893850862979889,
196
+ "learning_rate": 8.599999999999999e-07,
197
+ "logits/chosen": -60919781.574193545,
198
+ "logits/rejected": -90189328.10062893,
199
+ "logps/chosen": -201.86630544354838,
200
+ "logps/rejected": -114.74926297169812,
201
+ "loss": 3.3418,
202
+ "rewards/chosen": -0.05891021605460874,
203
+ "rewards/margins": 0.6146469454804984,
204
+ "rewards/rejected": -0.6735571615351071,
205
+ "step": 130
206
+ },
207
+ {
208
+ "epoch": 4.519269776876268,
209
+ "grad_norm": 175.0,
210
+ "kl": 0.2661321759223938,
211
+ "learning_rate": 9.266666666666665e-07,
212
+ "logits/chosen": -59107307.58803987,
213
+ "logits/rejected": -89497455.00884956,
214
+ "logps/chosen": -215.70617473006644,
215
+ "logps/rejected": -120.84332365412979,
216
+ "loss": 3.2899,
217
+ "rewards/chosen": -0.11127648084266638,
218
+ "rewards/margins": 0.7554082443082831,
219
+ "rewards/rejected": -0.8666847251509495,
220
+ "step": 140
221
+ },
222
+ {
223
+ "epoch": 4.8438133874239355,
224
+ "grad_norm": 177.0,
225
+ "kl": 0.3608871102333069,
226
+ "learning_rate": 9.933333333333333e-07,
227
+ "logits/chosen": -64112037.64705882,
228
+ "logits/rejected": -91098432.85333334,
229
+ "logps/chosen": -204.3817325367647,
230
+ "logps/rejected": -125.74838541666666,
231
+ "loss": 3.23,
232
+ "rewards/chosen": -0.1627967497881721,
233
+ "rewards/margins": 0.8743928864422965,
234
+ "rewards/rejected": -1.0371896362304687,
235
+ "step": 150
236
+ },
237
+ {
238
+ "epoch": 5.162271805273834,
239
+ "grad_norm": 137.0,
240
+ "kl": 0.2991156578063965,
241
+ "learning_rate": 9.99095521855875e-07,
242
+ "logits/chosen": -60793345.625396825,
243
+ "logits/rejected": -90675919.74440895,
244
+ "logps/chosen": -211.70634920634922,
245
+ "logps/rejected": -120.8386456669329,
246
+ "loss": 3.217,
247
+ "rewards/chosen": -0.10563917614164807,
248
+ "rewards/margins": 0.9914023347846198,
249
+ "rewards/rejected": -1.097041510926268,
250
+ "step": 160
251
+ },
252
+ {
253
+ "epoch": 5.486815415821501,
254
+ "grad_norm": 171.0,
255
+ "kl": 0.35243138670921326,
256
+ "learning_rate": 9.959731316773258e-07,
257
+ "logits/chosen": -57961989.13712375,
258
+ "logits/rejected": -89004902.85043988,
259
+ "logps/chosen": -207.07096571906354,
260
+ "logps/rejected": -122.59219208211144,
261
+ "loss": 3.15,
262
+ "rewards/chosen": -0.1434617823980325,
263
+ "rewards/margins": 1.1325593303963681,
264
+ "rewards/rejected": -1.2760211127944006,
265
+ "step": 170
266
+ },
267
+ {
268
+ "epoch": 5.811359026369169,
269
+ "grad_norm": 185.0,
270
+ "kl": 0.4362719655036926,
271
+ "learning_rate": 9.906356050933962e-07,
272
+ "logits/chosen": -64087541.48973607,
273
+ "logits/rejected": -91216310.36789298,
274
+ "logps/chosen": -203.97832661290323,
275
+ "logps/rejected": -131.44156302257525,
276
+ "loss": 3.0881,
277
+ "rewards/chosen": -0.23974457234581195,
278
+ "rewards/margins": 1.1809270756605634,
279
+ "rewards/rejected": -1.4206716480063755,
280
+ "step": 180
281
+ },
282
+ {
283
+ "epoch": 6.129817444219067,
284
+ "grad_norm": 209.0,
285
+ "kl": 0.45517590641975403,
286
+ "learning_rate": 9.831067807935138e-07,
287
+ "logits/chosen": -60818541.48427673,
288
+ "logits/rejected": -91439209.7032258,
289
+ "logps/chosen": -216.79994595125785,
290
+ "logps/rejected": -123.5953125,
291
+ "loss": 3.1009,
292
+ "rewards/chosen": -0.14091354945920548,
293
+ "rewards/margins": 1.3181333373540505,
294
+ "rewards/rejected": -1.459046886813256,
295
+ "step": 190
296
+ },
297
+ {
298
+ "epoch": 6.454361054766734,
299
+ "grad_norm": 174.0,
300
+ "kl": 0.3785388171672821,
301
+ "learning_rate": 9.73420284334652e-07,
302
+ "logits/chosen": -57674728.34323432,
303
+ "logits/rejected": -88681900.43916914,
304
+ "logps/chosen": -205.04843492161717,
305
+ "logps/rejected": -123.72198395771514,
306
+ "loss": 3.0149,
307
+ "rewards/chosen": -0.1368737551245359,
308
+ "rewards/margins": 1.4349443391007053,
309
+ "rewards/rejected": -1.571818094225241,
310
+ "step": 200
311
+ },
312
+ {
313
+ "epoch": 6.454361054766734,
314
+ "eval_kl": 0.035786211490631104,
315
+ "eval_logits/chosen": -67682705.2972973,
316
+ "eval_logits/rejected": -106589274.61946903,
317
+ "eval_logps/chosen": -223.63279490427928,
318
+ "eval_logps/rejected": -130.98892941095133,
319
+ "eval_loss": 0.3622306287288666,
320
+ "eval_rewards/chosen": -0.1184040877196166,
321
+ "eval_rewards/margins": 1.4728804700617717,
322
+ "eval_rewards/rejected": -1.5912845577813883,
323
+ "eval_runtime": 14.432,
324
+ "eval_samples_per_second": 15.175,
325
+ "eval_steps_per_second": 0.97,
326
+ "step": 200
327
+ },
328
+ {
329
+ "epoch": 6.778904665314402,
330
+ "grad_norm": 173.0,
331
+ "kl": 0.486126571893692,
332
+ "learning_rate": 9.616193779614293e-07,
333
+ "logits/chosen": -62086706.13649852,
334
+ "logits/rejected": -91359813.28052805,
335
+ "logps/chosen": -207.32766135014836,
336
+ "logps/rejected": -134.37520627062707,
337
+ "loss": 2.9831,
338
+ "rewards/chosen": -0.22745244510095974,
339
+ "rewards/margins": 1.4696961454905213,
340
+ "rewards/rejected": -1.697148590591481,
341
+ "step": 210
342
+ },
343
+ {
344
+ "epoch": 7.0973630831643,
345
+ "grad_norm": 238.0,
346
+ "kl": 0.4411180913448334,
347
+ "learning_rate": 9.477567673864215e-07,
348
+ "logits/chosen": -61770599.064935066,
349
+ "logits/rejected": -91440755.2,
350
+ "logps/chosen": -212.82518262987014,
351
+ "logps/rejected": -127.96240234375,
352
+ "loss": 2.9719,
353
+ "rewards/chosen": -0.23503605731121904,
354
+ "rewards/margins": 1.439167243164855,
355
+ "rewards/rejected": -1.6742033004760741,
356
+ "step": 220
357
+ },
358
+ {
359
+ "epoch": 7.421906693711968,
360
+ "grad_norm": 696.0,
361
+ "kl": 0.43413224816322327,
362
+ "learning_rate": 9.318943663936569e-07,
363
+ "logits/chosen": -58800922.256410256,
364
+ "logits/rejected": -88753464.19512194,
365
+ "logps/chosen": -204.23775540865384,
366
+ "logps/rejected": -127.21961937881098,
367
+ "loss": 2.9524,
368
+ "rewards/chosen": -0.15640850556202424,
369
+ "rewards/margins": 1.593839914967225,
370
+ "rewards/rejected": -1.7502484205292492,
371
+ "step": 230
372
+ },
373
+ {
374
+ "epoch": 7.746450304259635,
375
+ "grad_norm": 155.0,
376
+ "kl": 0.6114085912704468,
377
+ "learning_rate": 9.141030203166256e-07,
378
+ "logits/chosen": -60832057.65765766,
379
+ "logits/rejected": -91749279.27035831,
380
+ "logps/chosen": -210.65941722972974,
381
+ "logps/rejected": -133.82223381514657,
382
+ "loss": 2.9045,
383
+ "rewards/chosen": -0.2403512674051004,
384
+ "rewards/margins": 1.6371219486983197,
385
+ "rewards/rejected": -1.8774732161034202,
386
+ "step": 240
387
+ },
388
+ {
389
+ "epoch": 8.064908722109534,
390
+ "grad_norm": 177.0,
391
+ "kl": 0.5247067213058472,
392
+ "learning_rate": 8.944621896258224e-07,
393
+ "logits/chosen": -61470391.79487179,
394
+ "logits/rejected": -90531684.4556962,
395
+ "logps/chosen": -212.98465044070514,
396
+ "logps/rejected": -129.2169822982595,
397
+ "loss": 2.9026,
398
+ "rewards/chosen": -0.2432682330791767,
399
+ "rewards/margins": 1.5595254387340156,
400
+ "rewards/rejected": -1.8027936718131923,
401
+ "step": 250
402
+ },
403
+ {
404
+ "epoch": 8.3894523326572,
405
+ "grad_norm": 144.0,
406
+ "kl": 0.4949173033237457,
407
+ "learning_rate": 8.730595950389967e-07,
408
+ "logits/chosen": -58573346.13333333,
409
+ "logits/rejected": -89080557.88307692,
410
+ "logps/chosen": -201.41715029761906,
411
+ "logps/rejected": -128.4549278846154,
412
+ "loss": 2.8883,
413
+ "rewards/chosen": -0.1385447789752294,
414
+ "rewards/margins": 1.7706690882122706,
415
+ "rewards/rejected": -1.9092138671875,
416
+ "step": 260
417
+ },
418
+ {
419
+ "epoch": 8.713995943204868,
420
+ "grad_norm": 185.0,
421
+ "kl": 0.6798511743545532,
422
+ "learning_rate": 8.499908257391323e-07,
423
+ "logits/chosen": -60232947.512195125,
424
+ "logits/rejected": -91839363.28205128,
425
+ "logps/chosen": -217.515625,
426
+ "logps/rejected": -134.24834735576923,
427
+ "loss": 2.8604,
428
+ "rewards/chosen": -0.22204266524896388,
429
+ "rewards/margins": 1.757865030814738,
430
+ "rewards/rejected": -1.9799076960637019,
431
+ "step": 270
432
+ },
433
+ {
434
+ "epoch": 9.032454361054766,
435
+ "grad_norm": 240.0,
436
+ "kl": 0.4586775600910187,
437
+ "learning_rate": 8.253589124499511e-07,
438
+ "logits/chosen": -61861802.11612903,
439
+ "logits/rejected": -91389275.7735849,
440
+ "logps/chosen": -207.82133316532259,
441
+ "logps/rejected": -133.32677378144655,
442
+ "loss": 2.8302,
443
+ "rewards/chosen": -0.2830538349766885,
444
+ "rewards/margins": 1.7209184404753555,
445
+ "rewards/rejected": -2.003972275452044,
446
+ "step": 280
447
+ },
448
+ {
449
+ "epoch": 9.356997971602434,
450
+ "grad_norm": 164.0,
451
+ "kl": 0.61592036485672,
452
+ "learning_rate": 7.992738672756908e-07,
453
+ "logits/chosen": -58843218.940809965,
454
+ "logits/rejected": -88515118.54545455,
455
+ "logps/chosen": -197.16750632788163,
456
+ "logps/rejected": -129.25817985893417,
457
+ "loss": 2.8523,
458
+ "rewards/chosen": -0.1824735837562062,
459
+ "rewards/margins": 1.795535009381529,
460
+ "rewards/rejected": -1.9780085931377351,
461
+ "step": 290
462
+ },
463
+ {
464
+ "epoch": 9.681541582150102,
465
+ "grad_norm": 137.0,
466
+ "kl": 0.7404313087463379,
467
+ "learning_rate": 7.718521923603404e-07,
468
+ "logits/chosen": -59332853.50157729,
469
+ "logits/rejected": -91459862.98452012,
470
+ "logps/chosen": -231.09537657728706,
471
+ "logps/rejected": -134.74608165634675,
472
+ "loss": 2.8027,
473
+ "rewards/chosen": -0.2009657730439484,
474
+ "rewards/margins": 1.85811701405938,
475
+ "rewards/rejected": -2.0590827871033284,
476
+ "step": 300
477
+ },
478
+ {
479
+ "epoch": 10.0,
480
+ "grad_norm": 207.0,
481
+ "kl": 0.44328153133392334,
482
+ "learning_rate": 7.43216359560785e-07,
483
+ "logits/chosen": -62495649.72698413,
484
+ "logits/rejected": -91457778.09584664,
485
+ "logps/chosen": -200.75829613095237,
486
+ "logps/rejected": -134.88111521565494,
487
+ "loss": 2.7823,
488
+ "rewards/chosen": -0.29915979778955853,
489
+ "rewards/margins": 1.8331558409384765,
490
+ "rewards/rejected": -2.132315638728035,
491
+ "step": 310
492
+ },
493
+ {
494
+ "epoch": 10.324543610547668,
495
+ "grad_norm": 182.0,
496
+ "kl": 0.5860379338264465,
497
+ "learning_rate": 7.134942634577615e-07,
498
+ "logits/chosen": -58812569.6,
499
+ "logits/rejected": -88819168.0,
500
+ "logps/chosen": -199.1839599609375,
501
+ "logps/rejected": -127.58912353515625,
502
+ "loss": 2.8078,
503
+ "rewards/chosen": -0.19876351356506347,
504
+ "rewards/margins": 1.8319713115692138,
505
+ "rewards/rejected": -2.030734825134277,
506
+ "step": 320
507
+ },
508
+ {
509
+ "epoch": 10.649087221095336,
510
+ "grad_norm": 177.0,
511
+ "kl": 0.6635628938674927,
512
+ "learning_rate": 6.828186501476144e-07,
513
+ "logits/chosen": -58466127.89808917,
514
+ "logits/rejected": -90940309.20245399,
515
+ "logps/chosen": -229.41757066082803,
516
+ "logps/rejected": -136.31909029907976,
517
+ "loss": 2.7883,
518
+ "rewards/chosen": -0.17490250897255671,
519
+ "rewards/margins": 1.9413016884488392,
520
+ "rewards/rejected": -2.116204197421396,
521
+ "step": 330
522
+ },
523
+ {
524
+ "epoch": 10.973630831643002,
525
+ "grad_norm": 176.0,
526
+ "kl": 0.6288160681724548,
527
+ "learning_rate": 6.513265243660057e-07,
528
+ "logits/chosen": -62876155.30275229,
529
+ "logits/rejected": -91420423.36102237,
530
+ "logps/chosen": -204.8246129587156,
531
+ "logps/rejected": -136.0083491413738,
532
+ "loss": 2.7516,
533
+ "rewards/chosen": -0.2638938507173404,
534
+ "rewards/margins": 1.962711744392983,
535
+ "rewards/rejected": -2.2266055951103234,
536
+ "step": 340
537
+ },
538
+ {
539
+ "epoch": 11.2920892494929,
540
+ "grad_norm": 165.0,
541
+ "kl": 0.6786984205245972,
542
+ "learning_rate": 6.191585375915055e-07,
543
+ "logits/chosen": -58107764.50773994,
544
+ "logits/rejected": -88665648.68196721,
545
+ "logps/chosen": -199.96833881578948,
546
+ "logps/rejected": -128.3748463114754,
547
+ "loss": 2.8082,
548
+ "rewards/chosen": -0.19902199193050987,
549
+ "rewards/margins": 1.8429992122831171,
550
+ "rewards/rejected": -2.042021204213627,
551
+ "step": 350
552
+ },
553
+ {
554
+ "epoch": 11.616632860040568,
555
+ "grad_norm": 167.0,
556
+ "kl": 0.6609476208686829,
557
+ "learning_rate": 5.864583598619467e-07,
558
+ "logits/chosen": -57476970.80794702,
559
+ "logits/rejected": -91102129.23076923,
560
+ "logps/chosen": -229.5999586092715,
561
+ "logps/rejected": -134.39487795857988,
562
+ "loss": 2.7377,
563
+ "rewards/chosen": -0.1495321286435159,
564
+ "rewards/margins": 2.048365336627841,
565
+ "rewards/rejected": -2.197897465271357,
566
+ "step": 360
567
+ },
568
+ {
569
+ "epoch": 11.941176470588236,
570
+ "grad_norm": 141.0,
571
+ "kl": 0.6255931854248047,
572
+ "learning_rate": 5.533720381091582e-07,
573
+ "logits/chosen": -63461816.16716418,
574
+ "logits/rejected": -90867329.2590164,
575
+ "logps/chosen": -204.81716417910448,
576
+ "logps/rejected": -137.47254098360656,
577
+ "loss": 2.7423,
578
+ "rewards/chosen": -0.3370915427136777,
579
+ "rewards/margins": 1.9331667260491296,
580
+ "rewards/rejected": -2.2702582687628072,
581
+ "step": 370
582
+ },
583
+ {
584
+ "epoch": 12.259634888438134,
585
+ "grad_norm": 160.0,
586
+ "kl": 0.6641746163368225,
587
+ "learning_rate": 5.200473438779146e-07,
588
+ "logits/chosen": -57381835.48717949,
589
+ "logits/rejected": -89336883.84810127,
590
+ "logps/chosen": -199.01509915865384,
591
+ "logps/rejected": -127.97672320015823,
592
+ "loss": 2.7637,
593
+ "rewards/chosen": -0.159302613674066,
594
+ "rewards/margins": 1.9056654471694863,
595
+ "rewards/rejected": -2.0649680608435523,
596
+ "step": 380
597
+ },
598
+ {
599
+ "epoch": 12.584178498985802,
600
+ "grad_norm": 158.0,
601
+ "kl": 0.605785071849823,
602
+ "learning_rate": 4.866331133423456e-07,
603
+ "logits/chosen": -58439706.597402595,
604
+ "logits/rejected": -90199761.73493975,
605
+ "logps/chosen": -222.0545606737013,
606
+ "logps/rejected": -134.65968561746988,
607
+ "loss": 2.7272,
608
+ "rewards/chosen": -0.22628873354428775,
609
+ "rewards/margins": 1.985029133137282,
610
+ "rewards/rejected": -2.21131786668157,
611
+ "step": 390
612
+ },
613
+ {
614
+ "epoch": 12.908722109533468,
615
+ "grad_norm": 192.0,
616
+ "kl": 0.7094799280166626,
617
+ "learning_rate": 4.5327858256745065e-07,
618
+ "logits/chosen": -62306581.590361446,
619
+ "logits/rejected": -91545985.66233766,
620
+ "logps/chosen": -212.20241905120483,
621
+ "logps/rejected": -138.62365564123377,
622
+ "loss": 2.7339,
623
+ "rewards/chosen": -0.30779806389866105,
624
+ "rewards/margins": 2.033511124349614,
625
+ "rewards/rejected": -2.341309188248275,
626
+ "step": 400
627
+ },
628
+ {
629
+ "epoch": 12.908722109533468,
630
+ "eval_kl": 0.035703618079423904,
631
+ "eval_logits/chosen": -67034790.05405405,
632
+ "eval_logits/rejected": -106966931.25663717,
633
+ "eval_logps/chosen": -224.06151463963963,
634
+ "eval_logps/rejected": -137.08888447179203,
635
+ "eval_loss": 0.3353511095046997,
636
+ "eval_rewards/chosen": -0.16127505603137318,
637
+ "eval_rewards/margins": 2.040005520961506,
638
+ "eval_rewards/rejected": -2.201280576992879,
639
+ "eval_runtime": 14.3688,
640
+ "eval_samples_per_second": 15.241,
641
+ "eval_steps_per_second": 0.974,
642
+ "step": 400
643
+ },
644
+ {
645
+ "epoch": 13.227180527383368,
646
+ "grad_norm": 160.0,
647
+ "kl": 0.6558622121810913,
648
+ "learning_rate": 4.201327209846065e-07,
649
+ "logits/chosen": -59866034.716981135,
650
+ "logits/rejected": -89190664.25806452,
651
+ "logps/chosen": -193.23755650550314,
652
+ "logps/rejected": -128.39615675403226,
653
+ "loss": 2.7624,
654
+ "rewards/chosen": -0.20178618521060585,
655
+ "rewards/margins": 1.875514249517217,
656
+ "rewards/rejected": -2.0773004347278228,
657
+ "step": 410
658
+ },
659
+ {
660
+ "epoch": 13.551724137931034,
661
+ "grad_norm": 156.0,
662
+ "kl": 0.5715658664703369,
663
+ "learning_rate": 3.873435660579217e-07,
664
+ "logits/chosen": -57941598.18060201,
665
+ "logits/rejected": -90468988.62170088,
666
+ "logps/chosen": -220.37727320234114,
667
+ "logps/rejected": -135.28165093475073,
668
+ "loss": 2.7148,
669
+ "rewards/chosen": -0.22364995950041805,
670
+ "rewards/margins": 2.0567365988756814,
671
+ "rewards/rejected": -2.2803865583760996,
672
+ "step": 420
673
+ },
674
+ {
675
+ "epoch": 13.876267748478702,
676
+ "grad_norm": 147.0,
677
+ "kl": 0.8168804049491882,
678
+ "learning_rate": 3.5505756211298774e-07,
679
+ "logits/chosen": -61979449.136231884,
680
+ "logits/rejected": -90915301.96610169,
681
+ "logps/chosen": -212.7193161231884,
682
+ "logps/rejected": -136.42921080508475,
683
+ "loss": 2.7174,
684
+ "rewards/chosen": -0.2887859178626019,
685
+ "rewards/margins": 2.0035011004769108,
686
+ "rewards/rejected": -2.2922870183395125,
687
+ "step": 430
688
+ },
689
+ {
690
+ "epoch": 14.1947261663286,
691
+ "grad_norm": 188.0,
692
+ "kl": 0.6269903779029846,
693
+ "learning_rate": 3.234189062809695e-07,
694
+ "logits/chosen": -59424662.29677419,
695
+ "logits/rejected": -90425550.08805032,
696
+ "logps/chosen": -203.18240927419356,
697
+ "logps/rejected": -129.64670548349056,
698
+ "loss": 2.7431,
699
+ "rewards/chosen": -0.19051946824596774,
700
+ "rewards/margins": 1.9727823955143509,
701
+ "rewards/rejected": -2.1633018637603185,
702
+ "step": 440
703
+ },
704
+ {
705
+ "epoch": 14.519269776876268,
706
+ "grad_norm": 145.0,
707
+ "kl": 0.5869894027709961,
708
+ "learning_rate": 2.9256890447921315e-07,
709
+ "logits/chosen": -57757369.408637874,
710
+ "logits/rejected": -89745903.38643068,
711
+ "logps/chosen": -217.05743874584718,
712
+ "logps/rejected": -134.99330521755164,
713
+ "loss": 2.7134,
714
+ "rewards/chosen": -0.24640435000194663,
715
+ "rewards/margins": 2.035277397209521,
716
+ "rewards/rejected": -2.2816817472114677,
717
+ "step": 450
718
+ },
719
+ {
720
+ "epoch": 14.843813387423936,
721
+ "grad_norm": 160.0,
722
+ "kl": 0.7495726346969604,
723
+ "learning_rate": 2.626453403047172e-07,
724
+ "logits/chosen": -62740118.5882353,
725
+ "logits/rejected": -91247097.17333333,
726
+ "logps/chosen": -205.96771599264707,
727
+ "logps/rejected": -138.3257421875,
728
+ "loss": 2.7088,
729
+ "rewards/chosen": -0.32139582914464615,
730
+ "rewards/margins": 1.973530928667854,
731
+ "rewards/rejected": -2.2949267578125,
732
+ "step": 460
733
+ },
734
+ {
735
+ "epoch": 15.162271805273834,
736
+ "grad_norm": 138.0,
737
+ "kl": 0.6461220979690552,
738
+ "learning_rate": 2.3378185965914078e-07,
739
+ "logits/chosen": -59542024.12698413,
740
+ "logits/rejected": -90673073.48242812,
741
+ "logps/chosen": -212.47857142857143,
742
+ "logps/rejected": -131.6886232028754,
743
+ "loss": 2.7498,
744
+ "rewards/chosen": -0.1828639923580109,
745
+ "rewards/margins": 1.9991756036633548,
746
+ "rewards/rejected": -2.1820395960213657,
747
+ "step": 470
748
+ },
749
+ {
750
+ "epoch": 15.486815415821502,
751
+ "grad_norm": 153.0,
752
+ "kl": 0.6038868427276611,
753
+ "learning_rate": 2.0610737385376348e-07,
754
+ "logits/chosen": -56933078.04682274,
755
+ "logits/rejected": -88981978.46334311,
756
+ "logps/chosen": -207.75867474916387,
757
+ "logps/rejected": -132.5094391495601,
758
+ "loss": 2.7032,
759
+ "rewards/chosen": -0.21223314470272,
760
+ "rewards/margins": 2.0555130507128774,
761
+ "rewards/rejected": -2.2677461954155973,
762
+ "step": 480
763
+ },
764
+ {
765
+ "epoch": 15.811359026369168,
766
+ "grad_norm": 169.0,
767
+ "kl": 0.7091981172561646,
768
+ "learning_rate": 1.7974548386027584e-07,
769
+ "logits/chosen": -62982918.75659824,
770
+ "logits/rejected": -91177055.89297658,
771
+ "logps/chosen": -205.08644153225808,
772
+ "logps/rejected": -140.23990123327758,
773
+ "loss": 2.6976,
774
+ "rewards/chosen": -0.3505571203147911,
775
+ "rewards/margins": 1.9499471239760744,
776
+ "rewards/rejected": -2.3005042442908654,
777
+ "step": 490
778
+ },
779
+ {
780
+ "epoch": 16.129817444219068,
781
+ "grad_norm": 170.0,
782
+ "kl": 0.7773324251174927,
783
+ "learning_rate": 1.5481392827883488e-07,
784
+ "logits/chosen": -59927378.11320755,
785
+ "logits/rejected": -91354719.79354839,
786
+ "logps/chosen": -217.3079304245283,
787
+ "logps/rejected": -131.10647681451613,
788
+ "loss": 2.7533,
789
+ "rewards/chosen": -0.19171231347809797,
790
+ "rewards/margins": 2.018452205801136,
791
+ "rewards/rejected": -2.210164519279234,
792
+ "step": 500
793
+ },
794
+ {
795
+ "epoch": 16.454361054766736,
796
+ "grad_norm": 159.0,
797
+ "kl": 0.592677116394043,
798
+ "learning_rate": 1.3142405748889457e-07,
799
+ "logits/chosen": -56867931.24752475,
800
+ "logits/rejected": -88677786.20771514,
801
+ "logps/chosen": -205.24737004950495,
802
+ "logps/rejected": -130.59708827893175,
803
+ "loss": 2.7022,
804
+ "rewards/chosen": -0.15676989413724088,
805
+ "rewards/margins": 2.102558342428505,
806
+ "rewards/rejected": -2.2593282365657457,
807
+ "step": 510
808
+ },
809
+ {
810
+ "epoch": 16.7789046653144,
811
+ "grad_norm": 176.0,
812
+ "kl": 0.7139925956726074,
813
+ "learning_rate": 1.096803363313803e-07,
814
+ "logits/chosen": -61236336.4272997,
815
+ "logits/rejected": -91304395.61716172,
816
+ "logps/chosen": -207.9202290430267,
817
+ "logps/rejected": -140.73229940181517,
818
+ "loss": 2.7113,
819
+ "rewards/chosen": -0.2867103825690838,
820
+ "rewards/margins": 2.0461471585561224,
821
+ "rewards/rejected": -2.332857541125206,
822
+ "step": 520
823
+ },
824
+ {
825
+ "epoch": 17.0973630831643,
826
+ "grad_norm": 197.0,
827
+ "kl": 0.7058033347129822,
828
+ "learning_rate": 8.967987754335022e-08,
829
+ "logits/chosen": -61097139.53246753,
830
+ "logits/rejected": -91390720.0,
831
+ "logps/chosen": -213.40300324675326,
832
+ "logps/rejected": -133.4157958984375,
833
+ "loss": 2.7195,
834
+ "rewards/chosen": -0.29281839147790684,
835
+ "rewards/margins": 1.9267250655533432,
836
+ "rewards/rejected": -2.21954345703125,
837
+ "step": 530
838
+ },
839
+ {
840
+ "epoch": 17.421906693711968,
841
+ "grad_norm": 264.0,
842
+ "kl": 0.5918253064155579,
843
+ "learning_rate": 7.15120080289368e-08,
844
+ "logits/chosen": -58138827.48717949,
845
+ "logits/rejected": -88649209.75609756,
846
+ "logps/chosen": -204.44078024839743,
847
+ "logps/rejected": -132.16799256859755,
848
+ "loss": 2.73,
849
+ "rewards/chosen": -0.17671105800530848,
850
+ "rewards/margins": 2.0683754258337728,
851
+ "rewards/rejected": -2.2450864838390814,
852
+ "step": 540
853
+ },
854
+ {
855
+ "epoch": 17.746450304259636,
856
+ "grad_norm": 157.0,
857
+ "kl": 0.7473562359809875,
858
+ "learning_rate": 5.5257869903709006e-08,
859
+ "logits/chosen": -60289140.85285285,
860
+ "logits/rejected": -91669847.55700326,
861
+ "logps/chosen": -210.96473817567568,
862
+ "logps/rejected": -138.43526058631923,
863
+ "loss": 2.7077,
864
+ "rewards/chosen": -0.27088488329638233,
865
+ "rewards/margins": 2.0678908508750347,
866
+ "rewards/rejected": -2.338775734171417,
867
+ "step": 550
868
+ },
869
+ {
870
+ "epoch": 18.064908722109532,
871
+ "grad_norm": 163.0,
872
+ "kl": 0.6325186491012573,
873
+ "learning_rate": 4.099005809428596e-08,
874
+ "logits/chosen": -60974821.743589744,
875
+ "logits/rejected": -90441339.1392405,
876
+ "logps/chosen": -213.37244591346155,
877
+ "logps/rejected": -133.14664507515823,
878
+ "loss": 2.7211,
879
+ "rewards/chosen": -0.2820472228221404,
880
+ "rewards/margins": 1.9137137696867599,
881
+ "rewards/rejected": -2.1957609925089003,
882
+ "step": 560
883
+ },
884
+ {
885
+ "epoch": 18.3894523326572,
886
+ "grad_norm": 138.0,
887
+ "kl": 0.6061395406723022,
888
+ "learning_rate": 2.8772296111772677e-08,
889
+ "logits/chosen": -58149936.76190476,
890
+ "logits/rejected": -89025810.11692308,
891
+ "logps/chosen": -201.52972470238095,
892
+ "logps/rejected": -132.06661057692307,
893
+ "loss": 2.7258,
894
+ "rewards/chosen": -0.14980345226469494,
895
+ "rewards/margins": 2.120578909514151,
896
+ "rewards/rejected": -2.270382361778846,
897
+ "step": 570
898
+ },
899
+ {
900
+ "epoch": 18.713995943204868,
901
+ "grad_norm": 191.0,
902
+ "kl": 0.7928330898284912,
903
+ "learning_rate": 1.865915144708985e-08,
904
+ "logits/chosen": -59847180.487804875,
905
+ "logits/rejected": -91838811.8974359,
906
+ "logps/chosen": -217.81192835365854,
907
+ "logps/rejected": -137.56884765625,
908
+ "loss": 2.7109,
909
+ "rewards/chosen": -0.25167286105272246,
910
+ "rewards/margins": 2.060285256310058,
911
+ "rewards/rejected": -2.3119581173627806,
912
+ "step": 580
913
+ },
914
+ {
915
+ "epoch": 19.032454361054768,
916
+ "grad_norm": 228.0,
917
+ "kl": 0.5313221216201782,
918
+ "learning_rate": 1.0695791859313297e-08,
919
+ "logits/chosen": -61405038.658064514,
920
+ "logits/rejected": -91388741.2327044,
921
+ "logps/chosen": -208.06592741935484,
922
+ "logps/rejected": -136.2116868121069,
923
+ "loss": 2.7008,
924
+ "rewards/chosen": -0.3075132308467742,
925
+ "rewards/margins": 1.9849507315402146,
926
+ "rewards/rejected": -2.292463962386989,
927
+ "step": 590
928
+ },
929
+ {
930
+ "epoch": 19.356997971602436,
931
+ "grad_norm": 159.0,
932
+ "kl": 0.7078633308410645,
933
+ "learning_rate": 4.917783645496887e-09,
934
+ "logits/chosen": -58602562.99065421,
935
+ "logits/rejected": -88561907.96238245,
936
+ "logps/chosen": -197.22442075545172,
937
+ "logps/rejected": -131.6894592476489,
938
+ "loss": 2.7409,
939
+ "rewards/chosen": -0.1881643990489924,
940
+ "rewards/margins": 2.0329730513575512,
941
+ "rewards/rejected": -2.2211374504065438,
942
+ "step": 600
943
+ },
944
+ {
945
+ "epoch": 19.356997971602436,
946
+ "eval_kl": 0.060564398765563965,
947
+ "eval_logits/chosen": -67062447.27927928,
948
+ "eval_logits/rejected": -106975032.63716814,
949
+ "eval_logps/chosen": -223.90932925112614,
950
+ "eval_logps/rejected": -137.44655696902655,
951
+ "eval_loss": 0.33327072858810425,
952
+ "eval_rewards/chosen": -0.14605889878831468,
953
+ "eval_rewards/margins": 2.090988279767064,
954
+ "eval_rewards/rejected": -2.237047178555379,
955
+ "eval_runtime": 14.3431,
956
+ "eval_samples_per_second": 15.269,
957
+ "eval_steps_per_second": 0.976,
958
+ "step": 600
959
+ },
960
+ {
961
+ "epoch": 19.6815415821501,
962
+ "grad_norm": 133.0,
963
+ "kl": 0.8249608278274536,
964
+ "learning_rate": 1.350932792956394e-09,
965
+ "logits/chosen": -59016821.90536278,
966
+ "logits/rejected": -91488921.75851393,
967
+ "logps/chosen": -231.22170741324922,
968
+ "logps/rejected": -136.9874467879257,
969
+ "loss": 2.7014,
970
+ "rewards/chosen": -0.21359941259919657,
971
+ "rewards/margins": 2.069619963247359,
972
+ "rewards/rejected": -2.2832193758465555,
973
+ "step": 610
974
+ },
975
+ {
976
+ "epoch": 20.0,
977
+ "grad_norm": 176.0,
978
+ "kl": 0.49784502387046814,
979
+ "learning_rate": 1.1169723465487279e-11,
980
+ "logits/chosen": -62227758.32380953,
981
+ "logits/rejected": -91514088.28115016,
982
+ "logps/chosen": -201.05374503968255,
983
+ "logps/rejected": -136.85892571884983,
984
+ "loss": 2.6991,
985
+ "rewards/chosen": -0.32870684426928326,
986
+ "rewards/margins": 2.0013896107776414,
987
+ "rewards/rejected": -2.3300964550469248,
988
+ "step": 620
989
+ }
990
+ ],
991
+ "logging_steps": 10,
992
+ "max_steps": 620,
993
+ "num_input_tokens_seen": 0,
994
+ "num_train_epochs": 20,
995
+ "save_steps": 200,
996
+ "stateful_callbacks": {
997
+ "TrainerControl": {
998
+ "args": {
999
+ "should_epoch_stop": false,
1000
+ "should_evaluate": false,
1001
+ "should_log": false,
1002
+ "should_save": true,
1003
+ "should_training_stop": true
1004
+ },
1005
+ "attributes": {}
1006
+ }
1007
+ },
1008
+ "total_flos": 0.0,
1009
+ "train_batch_size": 2,
1010
+ "trial_name": null,
1011
+ "trial_params": null
1012
+ }
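The trainer state above captures the whole KTO run: 620 optimizer steps over 20 epochs, train metrics logged every 10 steps and an eval pass every 200. The quickest health check is the reward margin (`rewards/margins`, chosen minus rejected), which climbs from roughly zero to about 2.0 while the eval loss falls from 0.362 at step 200 to 0.333 at step 600. A short sketch for pulling that curve out of `log_history`; the path assumes this repo's layout:

```python
import json
import matplotlib.pyplot as plt

with open("V1/checkpoint-620/trainer_state.json") as f:
    state = json.load(f)

# Train entries carry "rewards/margins"; eval entries use the "eval_" prefix.
train = [e for e in state["log_history"] if "rewards/margins" in e]
steps = [e["step"] for e in train]
margins = [e["rewards/margins"] for e in train]

plt.plot(steps, margins)
plt.xlabel("step")
plt.ylabel("rewards/margins")
plt.title("KTO reward margin (chosen minus rejected)")
plt.savefig("kto_margins.png")
```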
V1/checkpoint-620/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da10ea57681a71ec838353c9c616f9045c2db851c29faece63331258ebe89931
3
+ size 6417
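`training_args.bin` is stored as a Git LFS pointer: the `oid sha256:...` and `size` lines identify the blob, and the binary itself arrives only after `git lfs pull`. The file is a pickled `TrainingArguments` object, so inspecting it needs full unpickling; on recent PyTorch that means passing `weights_only=False` explicitly, which should only be done for files you trust. A sketch:

```python
import torch

# Full unpickling of the serialized TrainingArguments (trusted file only).
args = torch.load("V1/checkpoint-620/training_args.bin", weights_only=False)

print(args.learning_rate)
print(args.num_train_epochs)             # 20, matching trainer_state.json
print(args.per_device_train_batch_size)  # 2, matching train_batch_size above
```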
V1/checkpoint-620/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
V1/config.json ADDED
@@ -0,0 +1,67 @@
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "dtype": "bfloat16",
7
+ "eos_token_id": 151645,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 2048,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 11008,
12
+ "layer_types": [
13
+ "full_attention",
14
+ "full_attention",
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention"
49
+ ],
50
+ "max_position_embeddings": 32768,
51
+ "max_window_layers": 70,
52
+ "model_type": "qwen2",
53
+ "num_attention_heads": 16,
54
+ "num_hidden_layers": 36,
55
+ "num_key_value_heads": 2,
56
+ "output_hidden_states": true,
57
+ "pad_token_id": 151643,
58
+ "rms_norm_eps": 1e-06,
59
+ "rope_scaling": null,
60
+ "rope_theta": 1000000.0,
61
+ "sliding_window": null,
62
+ "tie_word_embeddings": true,
63
+ "transformers_version": "4.57.5",
64
+ "use_cache": true,
65
+ "use_sliding_window": false,
66
+ "vocab_size": 151665
67
+ }
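This config describes a Qwen2 causal LM: 36 hidden layers of width 2048, grouped-query attention with 16 query heads sharing 2 key/value heads, tied input/output embeddings, bf16 weights, and a 151665-entry vocabulary covering the special tokens registered in the tokenizer config. A quick sketch that loads the config and checks the derived attention geometry; the local path is an assumption, a hub repo id works the same way:

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("V1")  # assumed local path

assert cfg.model_type == "qwen2"
head_dim = cfg.hidden_size // cfg.num_attention_heads           # 2048 // 16 = 128
kv_groups = cfg.num_attention_heads // cfg.num_key_value_heads  # 16 // 2 = 8
print(head_dim, kv_groups, cfg.num_hidden_layers)               # 128 8 36
```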
V1/generation_config.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "do_sample": true,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "pad_token_id": 151643,
8
+ "repetition_penalty": 1.05,
9
+ "temperature": 0.7,
10
+ "top_k": 20,
11
+ "top_p": 0.8,
12
+ "transformers_version": "4.57.5"
13
+ }
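These defaults (sampling at `temperature=0.7` with `top_p=0.8` and `top_k=20`, a mild repetition penalty, and both `<|im_end|>` and `<|endoftext|>` accepted as stop tokens) are loaded automatically by `generate()`; explicit keyword arguments override them per call. A minimal sketch, again assuming a local `V1` path:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("V1")  # assumed local path
tok = AutoTokenizer.from_pretrained("V1")

inputs = tok("Hello there.", return_tensors="pt")
# Uses the file's defaults: do_sample=True, temperature=0.7, top_p=0.8, top_k=20.
out = model.generate(**inputs, max_new_tokens=32)
# Per-call kwargs win over generation_config.json, e.g. greedy decoding:
out = model.generate(**inputs, max_new_tokens=32, do_sample=False)
print(tok.decode(out[0], skip_special_tokens=True))
```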
V1/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
V1/model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3759239661c4f840b1e57264ea16334300416730fd41bde367ac78a91d1e156f
3
+ size 4956450288
V1/model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eac8e65f7dd1a9d359c9d2bcc6c1c5693040b995f76be725d8c64ce36c294ad1
3
+ size 1214588148
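Both weight shards (about 4.96 GB and 1.21 GB) are likewise LFS pointers; `from_pretrained` stitches them together through `model.safetensors.index.json` below. The recorded sha256 oids double as an integrity check after download. A streaming sketch so the multi-gigabyte files never sit in memory:

```python
import hashlib

def sha256_of(path: str) -> str:
    """Stream a file through SHA-256 in 1 MiB chunks."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()

# oid from the pointer for model-00001-of-00002.safetensors above.
expected = "3759239661c4f840b1e57264ea16334300416730fd41bde367ac78a91d1e156f"
assert sha256_of("V1/model-00001-of-00002.safetensors") == expected
```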
V1/model.safetensors.index.json ADDED
@@ -0,0 +1,444 @@
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 3085494326,
4
+ "total_size": 6170988652
5
+ },
6
+ "weight_map": {
7
+ "aux_head.bias": "model-00002-of-00002.safetensors",
8
+ "aux_head.weight": "model-00002-of-00002.safetensors",
9
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
10
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
11
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
12
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
13
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
14
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
15
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
16
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
17
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
18
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
19
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
20
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
21
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
22
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
23
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
24
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
25
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
26
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
27
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
28
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
29
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
30
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
31
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
32
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
33
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
34
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
35
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
36
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
37
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
38
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
39
+ "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
40
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
41
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
42
+ "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
43
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
44
+ "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
45
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
46
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
47
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
48
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
49
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
50
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
51
+ "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
52
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
53
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
54
+ "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
55
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
56
+ "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
57
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
58
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
59
+ "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
60
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
61
+ "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
62
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
63
+ "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
64
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
65
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
66
+ "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
67
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
68
+ "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
69
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
70
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
71
+ "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
72
+ "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
73
+ "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
74
+ "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
75
+ "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
76
+ "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
77
+ "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
78
+ "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
79
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
80
+ "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
81
+ "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
82
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
83
+ "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
84
+ "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
85
+ "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
86
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
87
+ "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
88
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
89
+ "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
90
+ "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
91
+ "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
92
+ "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
93
+ "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
94
+ "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
95
+ "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
96
+ "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
97
+ "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
98
+ "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
99
+ "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
100
+ "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
101
+ "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
102
+ "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
103
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
104
+ "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
105
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
106
+ "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
107
+ "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
108
+ "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
109
+ "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
110
+ "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
111
+ "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
112
+ "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
113
+ "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
114
+ "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
115
+ "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
116
+ "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
117
+ "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
118
+ "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
119
+ "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
120
+ "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
121
+ "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
122
+ "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
123
+ "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
124
+ "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
125
+ "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
126
+ "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
127
+ "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
128
+ "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
129
+ "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
130
+ "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
131
+ "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
132
+ "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
133
+ "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
134
+ "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
135
+ "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
136
+ "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
137
+ "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
138
+ "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
139
+ "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
140
+ "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
141
+ "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
142
+ "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
143
+ "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
144
+ "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
145
+ "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
146
+ "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
147
+ "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
148
+ "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
149
+ "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
150
+ "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
151
+ "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
152
+ "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
153
+ "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
154
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
155
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
156
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
157
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
158
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
159
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
160
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
161
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
162
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
163
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
164
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
165
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
166
+ "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
167
+ "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
168
+ "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
169
+ "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
170
+ "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
171
+ "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
172
+ "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
173
+ "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
174
+ "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
175
+ "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
176
+ "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
177
+ "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
178
+ "model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors",
179
+ "model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
180
+ "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
181
+ "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
182
+ "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
183
+ "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
184
+ "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
185
+ "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
186
+ "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
187
+ "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
188
+ "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
189
+ "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
190
+ "model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors",
191
+ "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
192
+ "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
193
+ "model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
194
+ "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
195
+ "model.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
196
+ "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
197
+ "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
198
+ "model.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
199
+ "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
200
+ "model.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
201
+ "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
202
+ "model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors",
203
+ "model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
204
+ "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
205
+ "model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
206
+ "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
207
+ "model.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
208
+ "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
209
+ "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
210
+ "model.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
211
+ "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
212
+ "model.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
213
+ "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
214
+ "model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors",
215
+ "model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
216
+ "model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
217
+ "model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
218
+ "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
219
+ "model.layers.24.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
220
+ "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
221
+ "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
222
+ "model.layers.24.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
223
+ "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
224
+ "model.layers.24.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
225
+ "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
226
+ "model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors",
227
+ "model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
228
+ "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
229
+ "model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
230
+ "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
231
+ "model.layers.25.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
232
+ "model.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
233
+ "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
234
+ "model.layers.25.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
235
+ "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
236
+ "model.layers.25.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
237
+ "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
238
+ "model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors",
239
+ "model.layers.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
240
+ "model.layers.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
241
+ "model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
242
+ "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
243
+ "model.layers.26.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
244
+ "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
245
+ "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
246
+ "model.layers.26.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
247
+ "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
248
+ "model.layers.26.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
249
+ "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
250
+ "model.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors",
251
+ "model.layers.27.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
252
+ "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
253
+ "model.layers.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
254
+ "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
255
+ "model.layers.27.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
256
+ "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
257
+ "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
258
+ "model.layers.27.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
259
+ "model.layers.27.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
260
+ "model.layers.27.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
261
+ "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
262
+ "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
263
+ "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
264
+ "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
265
+ "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
266
+ "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
267
+ "model.layers.28.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
268
+ "model.layers.28.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
269
+ "model.layers.28.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
270
+ "model.layers.28.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
271
+ "model.layers.28.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
272
+ "model.layers.28.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
273
+ "model.layers.28.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
274
+ "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
275
+ "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
276
+ "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
277
+ "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
278
+ "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
279
+ "model.layers.29.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
280
+ "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
281
+ "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
282
+ "model.layers.29.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
283
+ "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
284
+ "model.layers.29.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
285
+ "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
286
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
287
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
288
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
289
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
290
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
291
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
292
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
293
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
294
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
295
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
296
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
297
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
298
+ "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
299
+ "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
300
+ "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
301
+ "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
302
+ "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
303
+ "model.layers.30.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
304
+ "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
305
+ "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
306
+ "model.layers.30.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
307
+ "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
308
+ "model.layers.30.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
309
+ "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
310
+ "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
311
+ "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
312
+ "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
313
+ "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
314
+ "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
315
+ "model.layers.31.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
316
+ "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
317
+ "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
318
+ "model.layers.31.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
319
+ "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
320
+ "model.layers.31.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
321
+ "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
322
+ "model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
323
+ "model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
324
+ "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
325
+ "model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
326
+ "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
327
+ "model.layers.32.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
328
+ "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
329
+ "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
330
+ "model.layers.32.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
331
+ "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
332
+ "model.layers.32.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
333
+ "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
334
+ "model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
335
+ "model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
336
+ "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
337
+ "model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
338
+ "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
339
+ "model.layers.33.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
340
+ "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
341
+ "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
342
+ "model.layers.33.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
343
+ "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
344
+ "model.layers.33.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
345
+ "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
346
+ "model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
347
+ "model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
348
+ "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
349
+ "model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
350
+ "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
351
+ "model.layers.34.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
352
+ "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
353
+ "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
354
+ "model.layers.34.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
355
+ "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
356
+ "model.layers.34.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
357
+ "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
358
+ "model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
359
+ "model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
360
+ "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
361
+ "model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
362
+ "model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
363
+ "model.layers.35.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
364
+ "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
365
+ "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
366
+ "model.layers.35.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
367
+ "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
368
+ "model.layers.35.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
369
+ "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
370
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
371
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
372
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
373
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
374
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
375
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
376
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
377
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
378
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
379
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
380
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
381
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
382
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
383
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
384
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
385
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
386
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
387
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
388
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
389
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
390
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
391
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
392
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
393
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
394
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
395
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
396
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
397
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
398
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
399
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
400
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
401
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
402
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
403
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
404
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
405
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
406
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
407
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
408
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
409
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
410
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
411
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
412
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
413
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
414
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
415
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
416
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
417
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
418
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
419
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
420
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
421
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
422
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
423
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
424
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
425
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
426
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
427
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
428
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
429
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
430
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
431
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
432
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
433
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
434
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
435
+ "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
436
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
437
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
438
+ "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
439
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
440
+ "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
441
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
442
+ "model.norm.weight": "model-00002-of-00002.safetensors"
443
+ }
444
+ }
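
The index above tells `transformers` which shard file each tensor lives in; note that layer 28 straddles the shard boundary (its attention projections sit in shard 1, its layernorm and MLP weights in shard 2), which is normal when a checkpoint is split by size. A minimal sketch for inspecting the mapping from a local clone, assuming the file sits at `V1/model.safetensors.index.json` and uses the standard `weight_map` layout:

```python
import json
from collections import Counter

# Load the shard index written alongside the sharded checkpoint.
with open("V1/model.safetensors.index.json") as f:
    index = json.load(f)

weight_map = index["weight_map"]

# Which shard holds a given tensor? (per the diff: shard 1 for layer 28's attention)
print(weight_map["model.layers.28.self_attn.q_proj.weight"])

# How many tensors live in each shard file?
print(Counter(weight_map.values()))
```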
V1/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "eos_token": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
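
`special_tokens_map.json` pins the ChatML control tokens and the eos/pad choices the tokenizer will report. A quick sanity check from a local checkout (the path `V1` is assumed here), verifying that the loaded tokenizer reflects the map above:

```python
from transformers import AutoTokenizer

# Load from the local V1 folder of this repo; the special-tokens map
# above should be reflected on the loaded tokenizer.
tok = AutoTokenizer.from_pretrained("V1")
assert tok.eos_token == "<|im_end|>"
assert tok.pad_token == "<|endoftext|>"
print(tok.additional_special_tokens)  # includes <|im_start|>, <|vision_pad|>, ...
```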
V1/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+ size 11421896
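
`V1/tokenizer.json` is stored as a Git LFS pointer: the three lines above record only the spec version, the SHA-256 of the real ~11.4 MB payload, and its size in bytes. After `git lfs pull` materializes the file, the checksum can be verified against the pointer with a stdlib sketch like this:

```python
import hashlib

# Hash the materialized file and compare with the oid in the LFS pointer above.
EXPECTED = "9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa"

h = hashlib.sha256()
with open("V1/tokenizer.json", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

assert h.hexdigest() == EXPECTED, "tokenizer.json does not match the LFS pointer"
```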
V1/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
+ {
+ "add_bos_token": false,
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "151643": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151644": {
+ "content": "<|im_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151645": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151646": {
+ "content": "<|object_ref_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151647": {
+ "content": "<|object_ref_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151648": {
+ "content": "<|box_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151649": {
+ "content": "<|box_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151650": {
+ "content": "<|quad_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151651": {
+ "content": "<|quad_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151652": {
+ "content": "<|vision_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151653": {
+ "content": "<|vision_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151654": {
+ "content": "<|vision_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151655": {
+ "content": "<|image_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151656": {
+ "content": "<|video_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151657": {
+ "content": "<tool_call>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151658": {
+ "content": "</tool_call>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151659": {
+ "content": "<|fim_prefix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151660": {
+ "content": "<|fim_middle|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151661": {
+ "content": "<|fim_suffix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151662": {
+ "content": "<|fim_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151663": {
+ "content": "<|repo_name|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151664": {
+ "content": "<|file_sep|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ }
+ },
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|im_end|>",
+ "errors": "replace",
+ "extra_special_tokens": {},
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+ }
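
`tokenizer_config.json` declares the Qwen2 tokenizer class, a 131072-token `model_max_length`, no bos token, and `<|im_end|>` as eos, which together imply ChatML-style framing with `<|im_start|>` / `<|im_end|>`. No `chat_template` key appears in this file, so the exact template below is an assumption; a short sketch against a local `V1` checkout:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("V1")

# Assumed ChatML framing: <|im_start|> opens a role block,
# <|im_end|> (the configured eos token) closes it.
prompt = "<|im_start|>user\nWho is the werewolf?<|im_end|>\n<|im_start|>assistant\n"
ids = tok(prompt).input_ids
print(len(ids), tok.decode(ids))
```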
V1/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6d2a4f7a2383fb9d5ab6292090f4392a3ab14638296b4876c02ecf326c1e2a0a
+ size 6417
V1/training_logs.jsonl ADDED
@@ -0,0 +1,10 @@
+ {"step": 620, "train_runtime": 0.0031, "train_samples_per_second": 12601132.816, "train_steps_per_second": 198191.333, "total_flos": 0.0, "train_loss": 0.0, "epoch": 20.0}
+ {"step": 620, "train_runtime": 0.0149, "train_samples_per_second": 2648779.476, "train_steps_per_second": 41660.154, "total_flos": 0.0, "train_loss": 0.0, "epoch": 20.0}
+ {"step": 620, "train_runtime": 0.0031, "train_samples_per_second": 12895988.119, "train_steps_per_second": 202828.834, "total_flos": 0.0, "train_loss": 0.0, "epoch": 20.0}
+ {"step": 620, "train_runtime": 0.0024, "train_samples_per_second": 16349200.403, "train_steps_per_second": 257141.153, "total_flos": 0.0, "train_loss": 0.0, "epoch": 20.0}
+ {"step": 620, "train_runtime": 0.0026, "train_samples_per_second": 15254125.259, "train_steps_per_second": 239917.749, "total_flos": 0.0, "train_loss": 0.0, "epoch": 20.0}
+ {"step": 620, "train_runtime": 0.0149, "train_samples_per_second": 2640277.597, "train_steps_per_second": 41526.436, "total_flos": 0.0, "train_loss": 0.0, "epoch": 20.0}
+ {"step": 620, "train_runtime": 0.0041, "train_samples_per_second": 9537898.107, "train_steps_per_second": 150012.603, "total_flos": 0.0, "train_loss": 0.0, "epoch": 20.0}
+ {"step": 620, "train_runtime": 0.0049, "train_samples_per_second": 8070063.631, "train_steps_per_second": 126926.419, "total_flos": 0.0, "train_loss": 0.0, "epoch": 20.0}
+ {"step": 620, "train_runtime": 0.0038, "train_samples_per_second": 10465847.809, "train_steps_per_second": 164607.449, "total_flos": 0.0, "train_loss": 0.0, "epoch": 20.0}
+ {"step": 620, "train_runtime": 0.0232, "train_samples_per_second": 1697373.586, "train_steps_per_second": 26696.388, "total_flos": 0.0, "train_loss": 0.0, "epoch": 20.0}
V1/vocab.json ADDED
The diff for this file is too large to render. See raw diff